//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
14 #include "X86ISelLowering.h"
15 #include "MCTargetDesc/X86ShuffleDecode.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/BlockFrequencyInfo.h"
30 #include "llvm/Analysis/ObjCARCUtil.h"
31 #include "llvm/Analysis/ProfileSummaryInfo.h"
32 #include "llvm/Analysis/VectorUtils.h"
33 #include "llvm/CodeGen/IntrinsicLowering.h"
34 #include "llvm/CodeGen/MachineFrameInfo.h"
35 #include "llvm/CodeGen/MachineFunction.h"
36 #include "llvm/CodeGen/MachineInstrBuilder.h"
37 #include "llvm/CodeGen/MachineJumpTableInfo.h"
38 #include "llvm/CodeGen/MachineLoopInfo.h"
39 #include "llvm/CodeGen/MachineModuleInfo.h"
40 #include "llvm/CodeGen/MachineRegisterInfo.h"
41 #include "llvm/CodeGen/TargetLowering.h"
42 #include "llvm/CodeGen/WinEHFuncInfo.h"
43 #include "llvm/IR/CallingConv.h"
44 #include "llvm/IR/Constants.h"
45 #include "llvm/IR/DerivedTypes.h"
46 #include "llvm/IR/EHPersonalities.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/IR/GlobalAlias.h"
49 #include "llvm/IR/GlobalVariable.h"
50 #include "llvm/IR/IRBuilder.h"
51 #include "llvm/IR/Instructions.h"
52 #include "llvm/IR/Intrinsics.h"
53 #include "llvm/IR/PatternMatch.h"
54 #include "llvm/MC/MCAsmInfo.h"
55 #include "llvm/MC/MCContext.h"
56 #include "llvm/MC/MCExpr.h"
57 #include "llvm/MC/MCSymbol.h"
58 #include "llvm/Support/CommandLine.h"
59 #include "llvm/Support/Debug.h"
60 #include "llvm/Support/ErrorHandling.h"
61 #include "llvm/Support/KnownBits.h"
62 #include "llvm/Support/MathExtras.h"
63 #include "llvm/Target/TargetOptions.h"
64 #include <algorithm>
65 #include <bitset>
66 #include <cctype>
67 #include <numeric>
68 using namespace llvm;
#define DEBUG_TYPE "x86-isel"

static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
  // Setup Windows compiler runtime calls.
  if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
      { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
      { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
      { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
      { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }
  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
  // to 32 bits so the AtomicExpandPass will expand it so we don't need
  // cmpxchg8b.
  // FIXME: Should we be limiting the atomic size on other configs? Default is
  // 1024.
  if (!Subtarget.canUseCMPXCHG8B())
    setMaxAtomicSizeInBitsSupported(32);

  setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);

  setMaxLargeFPConvertBitWidthSupported(128);
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // SETOEQ and SETUNE require checking two conditions.
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
    setCondCodeAction(ISD::SETOEQ, VT, Expand);
    setCondCodeAction(ISD::SETUNE, VT, Expand);
  }

  // Integer absolute.
  if (Subtarget.canUseCMOV()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Absolute difference.
  for (auto Op : {ISD::ABDS, ISD::ABDU}) {
    setOperationAction(Op, MVT::i8, Custom);
    setOperationAction(Op, MVT::i16, Custom);
    setOperationAction(Op, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(Op, MVT::i64, Custom);
  }

  // Signed saturation subtraction.
  setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
  setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
  setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);

  // Funnel shifts.
  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
    // For slow shld targets we only lower for code size.
    LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

    setOperationAction(ShiftOp, MVT::i8, Custom);
    setOperationAction(ShiftOp, MVT::i16, Custom);
    setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction);
    if (Subtarget.is64Bit())
      setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction);
  }
  if (!Subtarget.useSoftFloat()) {
    // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    // operation.
    setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

    // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
    // SSE has no i16 to fp conversion, only i32. We promote in the handler
    // to allow f80 to use i16 and f64 to use i16 with sse1 only
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
    // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

    // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

    // Handle FP_TO_UINT by promoting the destination to a larger signed
    // conversion.
    setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

    setOperationAction(ISD::LRINT, MVT::f32, Custom);
    setOperationAction(ISD::LRINT, MVT::f64, Custom);
    setOperationAction(ISD::LLRINT, MVT::f32, Custom);
    setOperationAction(ISD::LLRINT, MVT::f64, Custom);

    if (!Subtarget.is64Bit()) {
      setOperationAction(ISD::LRINT, MVT::i64, Custom);
      setOperationAction(ISD::LLRINT, MVT::i64, Custom);
    }
  }
  if (Subtarget.hasSSE2()) {
    // Custom lowering for saturating float to int conversions.
    // We handle promotion to larger result types manually.
    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
    }
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    }
  }
  // Handle address space casts between mixed sized pointers.
  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!Subtarget.hasSSE2()) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
    setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom);
    setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom);
    setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
  }
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
  // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
  // promote that too.
  setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);

  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
      if (VT == MVT::i64 && !Subtarget.is64Bit())
        continue;
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
    }
  }
  for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
                  ISD::STRICT_FP_TO_FP16}) {
    // Special handling for half-precision floating point conversions.
    // If we don't have F16C support, then lower half float conversions
    // into library calls.
    setOperationAction(
        Op, MVT::f32,
        (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
    // There's never any support for operations beyond MVT::f32.
    setOperationAction(Op, MVT::f64, Expand);
    setOperationAction(Op, MVT::f80, Expand);
    setOperationAction(Op, MVT::f128, Expand);
  }

  for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
    setTruncStoreAction(VT, MVT::f16, Expand);
    setTruncStoreAction(VT, MVT::bf16, Expand);

    setOperationAction(ISD::BF16_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_BF16, VT, Custom);
  }
  setOperationAction(ISD::PARITY, MVT::i8, Custom);
  setOperationAction(ISD::PARITY, MVT::i16, Custom);
  setOperationAction(ISD::PARITY, MVT::i32, Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::PARITY, MVT::i64, Custom);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
    // popcntw is longer to encode than popcntl and also has a false dependency
    // on the dest that popcntl hasn't had since Cannon Lake.
    setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    else
      setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget.is64Bit())
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

  if (Subtarget.canUseCMPXCHG16B())
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  if (Subtarget.isTargetPS())
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
  else
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
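  // Helper that applies one legalization action to the long tail of FP
  // operations for a half-precision type; note that FCOPYSIGN is always
  // Expand and SELECT is always Custom here, regardless of the requested
  // action.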
  auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
    setOperationAction(ISD::FABS, VT, Action);
    setOperationAction(ISD::FNEG, VT, Action);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FREM, VT, Action);
    setOperationAction(ISD::FMA, VT, Action);
    setOperationAction(ISD::FMINNUM, VT, Action);
    setOperationAction(ISD::FMAXNUM, VT, Action);
    setOperationAction(ISD::FMINIMUM, VT, Action);
    setOperationAction(ISD::FMAXIMUM, VT, Action);
    setOperationAction(ISD::FSIN, VT, Action);
    setOperationAction(ISD::FCOS, VT, Action);
    setOperationAction(ISD::FSINCOS, VT, Action);
    setOperationAction(ISD::FSQRT, VT, Action);
    setOperationAction(ISD::FPOW, VT, Action);
    setOperationAction(ISD::FLOG, VT, Action);
    setOperationAction(ISD::FLOG2, VT, Action);
    setOperationAction(ISD::FLOG10, VT, Action);
    setOperationAction(ISD::FEXP, VT, Action);
    setOperationAction(ISD::FEXP2, VT, Action);
    setOperationAction(ISD::FEXP10, VT, Action);
    setOperationAction(ISD::FCEIL, VT, Action);
    setOperationAction(ISD::FFLOOR, VT, Action);
    setOperationAction(ISD::FNEARBYINT, VT, Action);
    setOperationAction(ISD::FRINT, VT, Action);
    setOperationAction(ISD::BR_CC, VT, Action);
    setOperationAction(ISD::SETCC, VT, Action);
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SELECT_CC, VT, Action);
    setOperationAction(ISD::FROUND, VT, Action);
    setOperationAction(ISD::FROUNDEVEN, VT, Action);
    setOperationAction(ISD::FTRUNC, VT, Action);
    setOperationAction(ISD::FLDEXP, VT, Action);
  };
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    // f16, f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
                                                     : &X86::FR16RegClass);
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    // Disable f32->f64 extload as we can only generate this in one instruction
    // under optsize. So it's easier to pattern match (fpext (load)) for that
    // case instead of needing to emit 2 instructions for extload in the
    // non-optsize case.
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // These might be better off as horizontal vector ops.
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Half type will be promoted by default.
    setF16Action(MVT::f16, Promote);
    setOperationAction(ISD::FADD, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);

    setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);

    setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
    setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
  } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
             (UseX87 || Is64Bit)) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    if (UseX87)
      addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    if (UseX87)
      setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    if (UseX87)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    if (UseX87) {
      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
  }
  // Expand FP32 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f32)) {
    if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
      addLegalFPImmediate(APFloat(+0.0f)); // FLD0
      addLegalFPImmediate(APFloat(+1.0f)); // FLD1
      addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  }
  // Expand FP64 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f64)) {
    if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
      addLegalFPImmediate(APFloat(+0.0)); // FLD0
      addLegalFPImmediate(APFloat(+1.0)); // FLD1
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
  }
  // Support fp16 0 immediate.
  if (isTypeLegal(MVT::f16))
    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
  // Handle constrained floating-point operations of scalar.
  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // f80 always uses X87.
  if (UseX87) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
    setOperationAction(ISD::LROUND, MVT::f80, Expand);
    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
    setOperationAction(ISD::LRINT, MVT::f80, Custom);
    setOperationAction(ISD::LLRINT, MVT::f80, Custom);

    // Handle constrained floating-point operations of scalar.
    setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal);
    if (isTypeLegal(MVT::f16)) {
      setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
    } else {
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
    }
    // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
    // as Custom.
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
  }
  // f128 uses xmm registers, but most operations require libcalls.
  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                   : &X86::VR128RegClass);

    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

    setOperationAction(ISD::FADD, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
    setOperationAction(ISD::FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::FMA, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);

    setOperationAction(ISD::FABS, MVT::f128, Custom);
    setOperationAction(ISD::FNEG, MVT::f128, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

    setOperationAction(ISD::FSIN, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
    setOperationAction(ISD::FCOS, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
    setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
    // No STRICT_FSINCOS
    setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
    // We need to custom handle any FP_ROUND with an f128 input, but
    // LegalizeDAG uses the result type to know when to run a custom handler.
    // So we have to list all legal floating point result types here.
    if (isTypeLegal(MVT::f32)) {
      setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
    }
    if (isTypeLegal(MVT::f64)) {
      setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
    }
    if (isTypeLegal(MVT::f80)) {
      setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::f128, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FEXP10, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                   MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FEXP10, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16 ||
          VT.getVectorElementType() == MVT::bf16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
    setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);

    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
    setOperationAction(ISD::STORE, MVT::v2f32, Custom);

    setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::FMAXIMUM, VT, Custom);
      setOperationAction(ISD::FMINIMUM, VT, Custom);
    }

    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::SREM, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::UREM, VT, Custom);
    }

    setOperationAction(ISD::MUL, MVT::v2i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i8, Custom);
    setOperationAction(ISD::MUL, MVT::v8i8, Custom);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
    setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);

    setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO, MVT::v2i32, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
    setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
    setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
    setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
    setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
    setOperationAction(ISD::ABDS, MVT::v4i32, Custom);

    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
    setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    setF16Action(MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);

    // Custom legalize these to avoid over promotion or custom promotion.
    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
    }

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
    // We want to legalize this to an f64 load rather than an i64 load on
    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // store.
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
    setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    setOperationAction(ISD::STORE, MVT::v4i16, Custom);
    setOperationAction(ISD::STORE, MVT::v8i8, Custom);

    // Add 32-bit vector stores to help vectorization opportunities.
    setOperationAction(ISD::STORE, MVT::v2i16, Custom);
    setOperationAction(ISD::STORE, MVT::v4i8, Custom);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      if (VT == MVT::v2i64) continue;
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
      setOperationAction(ISD::FSHL, VT, Custom);
      setOperationAction(ISD::FSHR, VT, Custom);
    }

    setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

    // These might be better off as horizontal vector ops.
    setOperationAction(ISD::ADD, MVT::i16, Custom);
    setOperationAction(ISD::ADD, MVT::i32, Custom);
    setOperationAction(ISD::SUB, MVT::i16, Custom);
    setOperationAction(ISD::SUB, MVT::i32, Custom);
  }
1262 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1263 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1264 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1265 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1266 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1267 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1268 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1269 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1270 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1271 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1272 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1273 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1274 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1275 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1277 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1280 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1281 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1282 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1283 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1284 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1285 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1286 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1287 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1289 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1290 setOperationAction(ISD::ABDS, VT, Custom);
1291 setOperationAction(ISD::ABDU, VT, Custom);
1294 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1295 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1296 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1298 // FIXME: Do we need to handle scalar-to-vector here?
1299 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1300 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1302 // We directly match byte blends in the backend as they match the VSELECT
1303 // condition form.
1304 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1306 // SSE41 brings specific instructions for doing vector sign extend even in
1307 // cases where we don't have SRA.
1308 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1309 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1310 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1313 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
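// For example, a sign-extending load of <8 x i8> into v8i16 can be selected
// as a single memory-operand PMOVSXBW instead of a load plus shuffles.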
1314 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1315 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1316 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1317 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1318 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1319 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1320 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1323 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1324 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1325 // do the pre and post work in the vector domain.
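// Each i64 lane is extracted to a GPR and converted with the 64-bit form of
// (V)CVTSI2SS, which is why this path is gated on is64Bit(); only the
// unsigned-specific fix-up around the converts stays in the vector domain.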
1326 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1327 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1328 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1329 // so that DAG combine doesn't try to turn it into uint_to_fp.
1330 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1331 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1336 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1339 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1340 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1341 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1342 setOperationAction(ISD::ROTL, VT, Custom);
1343 setOperationAction(ISD::ROTR, VT, Custom);
1346 // XOP can efficiently perform BITREVERSE with VPPERM.
1347 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1348 setOperationAction(ISD::BITREVERSE, VT, Custom);
1350 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1351 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1352 setOperationAction(ISD::BITREVERSE, VT, Custom);
1355 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1356 bool HasInt256 = Subtarget.hasInt256();
1358 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1359 : &X86::VR256RegClass);
1360 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1361 : &X86::VR256RegClass);
1362 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1363 : &X86::VR256RegClass);
1364 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1365 : &X86::VR256RegClass);
1366 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1367 : &X86::VR256RegClass);
1368 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1369 : &X86::VR256RegClass);
1370 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1371 : &X86::VR256RegClass);
1373 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1374 setOperationAction(ISD::FFLOOR, VT, Legal);
1375 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1376 setOperationAction(ISD::FCEIL, VT, Legal);
1377 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1378 setOperationAction(ISD::FTRUNC, VT, Legal);
1379 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1380 setOperationAction(ISD::FRINT, VT, Legal);
1381 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1382 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1383 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1384 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1385 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1387 setOperationAction(ISD::FROUND, VT, Custom);
1389 setOperationAction(ISD::FNEG, VT, Custom);
1390 setOperationAction(ISD::FABS, VT, Custom);
1391 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1393 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1394 setOperationAction(ISD::FMINIMUM, VT, Custom);
1397 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1398 // even though v8i16 is a legal type.
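// Promotion here means (v8i16 (fp_to_sint v8f32 X)) is legalized as a v8i32
// fp_to_sint (a single VCVTTPS2DQ) followed by a truncate back to v8i16.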
1399 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1400 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1401 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1402 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1403 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1404 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1405 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1407 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1408 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1409 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1410 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1411 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1412 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1414 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1415 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1416 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1417 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1418 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1419 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1420 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1421 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1422 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1423 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1424 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1426 if (!Subtarget.hasAVX512())
1427 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1429 // In the customized shift lowering, the legal v8i32/v4i64 cases
1430 // in AVX2 will be recognized.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1432 setOperationAction(ISD::SRL, VT, Custom);
1433 setOperationAction(ISD::SHL, VT, Custom);
1434 setOperationAction(ISD::SRA, VT, Custom);
1435 setOperationAction(ISD::ABDS, VT, Custom);
1436 setOperationAction(ISD::ABDU, VT, Custom);
1437 if (VT == MVT::v4i64) continue;
1438 setOperationAction(ISD::ROTL, VT, Custom);
1439 setOperationAction(ISD::ROTR, VT, Custom);
1440 setOperationAction(ISD::FSHL, VT, Custom);
1441 setOperationAction(ISD::FSHR, VT, Custom);
1444 // These types need custom splitting if their input is a 128-bit vector.
1445 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1446 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1447 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1450 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1451 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1452 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1453 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1454 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1455 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1456 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1458 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1459 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1460 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1461 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1464 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1465 setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
1469 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1471 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1472 setOperationAction(ISD::SETCC, VT, Custom);
1473 setOperationAction(ISD::CTPOP, VT, Custom);
1474 setOperationAction(ISD::CTLZ, VT, Custom);
1476 // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1477 // setcc all the way to isel and prefer SETGT in some isel patterns.
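// For example, a v8i32 SETLT has its operands commuted into a SETGT so it
// can be emitted as (V)PCMPGTD, since SSE/AVX only provide greater-than and
// equality integer compares.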
1478 setCondCodeAction(ISD::SETLT, VT, Custom);
1479 setCondCodeAction(ISD::SETLE, VT, Custom);
1482 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1483 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1484 setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
1485 setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
1486 setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
1487 setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
1489 if (Subtarget.hasAnyFMA()) {
1490 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1491 MVT::v2f64, MVT::v4f64 }) {
1492 setOperationAction(ISD::FMA, VT, Legal);
1493 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1497 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1498 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1499 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1502 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1503 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1504 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1505 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1507 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1508 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1509 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1512 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1513 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1516 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1517 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1519 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1520 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1521 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1522 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1523 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1525 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1526 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1527 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1528 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1529 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1530 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1531 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1532 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1533 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1534 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1535 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1536 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1538 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1539 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1541 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1542 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1543 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1546 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1547 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1548 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1551 if (HasInt256) {
1552 // The custom lowering of UINT_TO_FP for v8i32 becomes profitable
1553 // once we have a 256-bit-wide blend with immediate.
1554 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1555 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1557 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1558 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1559 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1560 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1561 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1562 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1563 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1564 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1568 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1569 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1570 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1571 setOperationAction(ISD::MSTORE, VT, Legal);
1574 // Extract subvector is special because the value type
1575 // (result) is 128-bit but the source is 256-bit wide.
1576 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1577 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1578 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1581 // Custom lower several nodes for 256-bit types.
1582 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1583 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1584 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1585 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1586 setOperationAction(ISD::VSELECT, VT, Custom);
1587 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1588 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1589 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1590 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1591 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1592 setOperationAction(ISD::STORE, VT, Custom);
1594 setF16Action(MVT::v16f16, Expand);
1595 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1596 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1597 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1598 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1600 if (HasInt256) {
1601 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1603 // Custom legalize 2x32 to get a little better code.
1604 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1605 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1607 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1608 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1609 setOperationAction(ISD::MGATHER, VT, Custom);
1613 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1614 Subtarget.hasF16C()) {
1615 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1616 setOperationAction(ISD::FP_ROUND, VT, Custom);
1617 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1619 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1620 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1621 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1623 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1624 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1625 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1629 // This block controls legalization of the mask vector sizes that are
1630 // available with AVX512. 512-bit vectors are in a separate block controlled
1631 // by useAVX512Regs.
1632 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1633 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1634 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1635 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1636 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1637 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1639 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1640 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1641 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1643 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1644 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1645 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1646 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1647 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1648 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1649 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1650 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1651 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1652 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1653 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1654 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1656 // There is no byte-sized k-register load or store without AVX512DQ.
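// Without DQ only KMOVW exists (KMOVB requires AVX512DQ), so these narrow
// mask loads and stores are custom lowered, roughly by going through a
// 16-bit mask move plus a byte-sized integer access.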
1657 if (!Subtarget.hasDQI()) {
1658 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1659 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1660 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1661 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1663 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1664 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1665 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1666 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1669 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1670 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1671 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1672 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1673 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1676 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1677 setOperationAction(ISD::VSELECT, VT, Expand);
1679 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::SELECT, VT, Custom);
1682 setOperationAction(ISD::TRUNCATE, VT, Custom);
1684 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1685 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1686 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1687 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1688 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1689 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1692 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1693 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1696 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1697 // elements. 512-bit operations can be disabled via the prefer-vector-width and
1698 // required-vector-width function attributes.
1699 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1700 bool HasBWI = Subtarget.hasBWI();
1702 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1703 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1704 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1705 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1706 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1707 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1708 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1710 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1711 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1712 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1713 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1714 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1715 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1716 if (HasBWI)
1717 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1720 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1721 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1722 setOperationAction(ISD::FMINIMUM, VT, Custom);
1723 setOperationAction(ISD::FNEG, VT, Custom);
1724 setOperationAction(ISD::FABS, VT, Custom);
1725 setOperationAction(ISD::FMA, VT, Legal);
1726 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1727 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1730 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1731 setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32);
1732 setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32);
1733 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1734 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1737 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1738 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1739 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1744 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1745 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1746 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1747 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1748 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1749 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1751 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1752 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1753 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1754 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1755 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1756 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1757 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1758 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1759 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1760 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1761 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1763 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1764 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1765 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1766 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1767 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1768 if (HasBWI)
1769 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1771 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1772 // to 512-bit rather than use the AVX2 instructions so that we can use
1773 // k-masks.
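// For example, without VLX a masked v8f32 load is widened to v16f32 with the
// mask zero-extended to v16i1, so the 512-bit k-masked instruction can be
// used instead of the AVX2 VMASKMOVPS form.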
1774 if (!Subtarget.hasVLX()) {
1775 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1776 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1777 setOperationAction(ISD::MLOAD, VT, Custom);
1778 setOperationAction(ISD::MSTORE, VT, Custom);
1782 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1783 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1784 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1785 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1786 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1787 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1788 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1789 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1790 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1791 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1792 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1795 if (HasBWI) {
1796 // Extends from v64i1 masks to 512-bit vectors.
1797 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1798 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1799 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1802 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1803 setOperationAction(ISD::FFLOOR, VT, Legal);
1804 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1805 setOperationAction(ISD::FCEIL, VT, Legal);
1806 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1807 setOperationAction(ISD::FTRUNC, VT, Legal);
1808 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1809 setOperationAction(ISD::FRINT, VT, Legal);
1810 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1811 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1812 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1813 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1814 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1816 setOperationAction(ISD::FROUND, VT, Custom);
1819 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1820 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1821 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1824 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1825 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1827 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1829 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1830 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1831 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1832 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1834 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1835 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1836 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1837 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1839 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1840 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1841 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1843 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1844 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1846 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1848 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SRL, VT, Custom);
1850 setOperationAction(ISD::SHL, VT, Custom);
1851 setOperationAction(ISD::SRA, VT, Custom);
1852 setOperationAction(ISD::ROTL, VT, Custom);
1853 setOperationAction(ISD::ROTR, VT, Custom);
1854 setOperationAction(ISD::SETCC, VT, Custom);
1855 setOperationAction(ISD::ABDS, VT, Custom);
1856 setOperationAction(ISD::ABDU, VT, Custom);
1858 // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1859 // setcc all the way to isel and prefer SETGT in some isel patterns.
1860 setCondCodeAction(ISD::SETLT, VT, Custom);
1861 setCondCodeAction(ISD::SETLE, VT, Custom);
1864 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1865 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1866 setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
1867 setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
1868 setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
1869 setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
1871 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1872 setOperationAction(ISD::SMAX, VT, Legal);
1873 setOperationAction(ISD::UMAX, VT, Legal);
1874 setOperationAction(ISD::SMIN, VT, Legal);
1875 setOperationAction(ISD::UMIN, VT, Legal);
1876 setOperationAction(ISD::ABS, VT, Legal);
1877 setOperationAction(ISD::CTPOP, VT, Custom);
1880 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1881 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1883 setOperationAction(ISD::CTLZ, VT, Custom);
1884 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1885 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1886 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1887 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1888 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1889 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1890 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1891 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1894 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1895 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1896 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1897 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1898 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1899 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1901 if (Subtarget.hasDQI()) {
1902 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1903 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1904 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1905 setOperationAction(Opc, MVT::v8i64, Custom);
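// AVX512DQ also provides VPMULLQ, giving a native v8i64 multiply instead of
// the 32x32-bit multiply/shift/add expansion used without it.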
1906 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1909 if (Subtarget.hasCDI()) {
1910 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
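// CDI provides VPLZCNTD/VPLZCNTQ, so e.g. a v4i32 CTLZ without VLX is widened
// to v16i32, counted with VPLZCNTD, and the low lanes extracted back out.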
1911 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1912 setOperationAction(ISD::CTLZ, VT, Legal);
1914 } // Subtarget.hasCDI()
1916 if (Subtarget.hasVPOPCNTDQ()) {
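// VPOPCNTDQ adds VPOPCNTD/VPOPCNTQ, a native per-lane population count for
// 32/64-bit elements, so CTPOP no longer needs the byte-LUT expansion.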
1917 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1918 setOperationAction(ISD::CTPOP, VT, Legal);
1921 // Extract subvector is special because the value type
1922 // (result) is 256-bit but the source is 512-bit wide.
1923 // 128-bit was made Legal under AVX1.
1924 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1925 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1926 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1928 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1929 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1931 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1932 setOperationAction(ISD::SELECT, VT, Custom);
1933 setOperationAction(ISD::VSELECT, VT, Custom);
1934 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1935 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1936 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1937 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1938 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1940 setF16Action(MVT::v32f16, Expand);
1941 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1942 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1943 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
1944 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
1945 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1946 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1947 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1950 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1951 setOperationAction(ISD::MLOAD, VT, Legal);
1952 setOperationAction(ISD::MSTORE, VT, Legal);
1953 setOperationAction(ISD::MGATHER, VT, Custom);
1954 setOperationAction(ISD::MSCATTER, VT, Custom);
1956 if (HasBWI) {
1957 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1958 setOperationAction(ISD::MLOAD, VT, Legal);
1959 setOperationAction(ISD::MSTORE, VT, Legal);
1961 } else {
1962 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1963 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1966 if (Subtarget.hasVBMI2()) {
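// VBMI2 adds the VPSHLDV/VPSHRDV family, which implement funnel shifts with
// per-element variable amounts for 16/32/64-bit lanes; the Custom lowering
// below can then select them directly.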
1967 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1968 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1969 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1970 setOperationAction(ISD::FSHL, VT, Custom);
1971 setOperationAction(ISD::FSHR, VT, Custom);
1974 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1975 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1976 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1977 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1979 } // useAVX512Regs
1981 // This block controls legalization for operations that don't have
1982 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1983 // narrower widths.
1984 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1985 // These operations are handled on non-VLX by artificially widening in
1986 // isel patterns.
1988 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1989 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1990 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1992 if (Subtarget.hasDQI()) {
1993 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1994 // v2f32 UINT_TO_FP is already custom under SSE2.
1995 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1996 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1997 "Unexpected operation action!");
1998 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1999 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
2000 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
2001 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
2002 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
2005 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2006 setOperationAction(ISD::SMAX, VT, Legal);
2007 setOperationAction(ISD::UMAX, VT, Legal);
2008 setOperationAction(ISD::SMIN, VT, Legal);
2009 setOperationAction(ISD::UMIN, VT, Legal);
2010 setOperationAction(ISD::ABS, VT, Legal);
2013 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2014 setOperationAction(ISD::ROTL, VT, Custom);
2015 setOperationAction(ISD::ROTR, VT, Custom);
2018 // Custom legalize 2x32 to get a little better code.
2019 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2020 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2022 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2023 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2024 setOperationAction(ISD::MSCATTER, VT, Custom);
2026 if (Subtarget.hasDQI()) {
2027 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2028 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2029 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2030 setOperationAction(Opc, MVT::v2i64, Custom);
2031 setOperationAction(Opc, MVT::v4i64, Custom);
2033 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2034 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2037 if (Subtarget.hasCDI()) {
2038 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2039 setOperationAction(ISD::CTLZ, VT, Legal);
2041 } // Subtarget.hasCDI()
2043 if (Subtarget.hasVPOPCNTDQ()) {
2044 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2045 setOperationAction(ISD::CTPOP, VT, Legal);
2049 // This block controls legalization of v32i1/v64i1, which are available with
2050 // AVX512BW.
2051 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2052 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2053 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2055 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2056 setOperationAction(ISD::VSELECT, VT, Expand);
2057 setOperationAction(ISD::TRUNCATE, VT, Custom);
2058 setOperationAction(ISD::SETCC, VT, Custom);
2059 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2060 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2061 setOperationAction(ISD::SELECT, VT, Custom);
2062 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2063 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2064 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2065 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2068 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2069 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2071 // Extends from v32i1 masks to 256-bit vectors.
2072 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2073 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2074 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2076 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2077 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2078 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2081 // These operations are handled on non-VLX by artificially widening in
2082 // isel patterns.
2083 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2085 if (Subtarget.hasBITALG()) {
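// BITALG adds VPOPCNTB/VPOPCNTW, so byte and word CTPOP become single
// instructions for these types.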
2086 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2087 setOperationAction(ISD::CTPOP, VT, Legal);
2091 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2092 auto setGroup = [&] (MVT VT) {
2093 setOperationAction(ISD::FADD, VT, Legal);
2094 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2095 setOperationAction(ISD::FSUB, VT, Legal);
2096 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2097 setOperationAction(ISD::FMUL, VT, Legal);
2098 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2099 setOperationAction(ISD::FDIV, VT, Legal);
2100 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2101 setOperationAction(ISD::FSQRT, VT, Legal);
2102 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2104 setOperationAction(ISD::FFLOOR, VT, Legal);
2105 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2106 setOperationAction(ISD::FCEIL, VT, Legal);
2107 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2108 setOperationAction(ISD::FTRUNC, VT, Legal);
2109 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2110 setOperationAction(ISD::FRINT, VT, Legal);
2111 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2112 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2113 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2115 setOperationAction(ISD::FROUND, VT, Custom);
2117 setOperationAction(ISD::LOAD, VT, Legal);
2118 setOperationAction(ISD::STORE, VT, Legal);
2120 setOperationAction(ISD::FMA, VT, Legal);
2121 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2122 setOperationAction(ISD::VSELECT, VT, Legal);
2123 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2124 setOperationAction(ISD::SELECT, VT, Custom);
2126 setOperationAction(ISD::FNEG, VT, Custom);
2127 setOperationAction(ISD::FABS, VT, Custom);
2128 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2129 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2130 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2132 setOperationAction(ISD::SETCC, VT, Custom);
2133 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
2134 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
2137 // AVX512_FP16 scalar operations
2138 setGroup(MVT::f16);
2139 setOperationAction(ISD::FREM, MVT::f16, Promote);
2140 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2141 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2142 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2143 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2144 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2145 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2146 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2147 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2148 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2149 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2150 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2151 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2153 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2154 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2156 if (Subtarget.useAVX512Regs()) {
2157 setGroup(MVT::v32f16);
2158 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2159 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2160 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2161 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2162 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2163 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2164 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2165 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2166 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2167 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2168 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2169 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2171 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2172 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2173 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2174 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2175 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2176 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2177 MVT::v32i16);
2178 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2179 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2180 MVT::v32i16);
2181 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2182 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2183 MVT::v32i16);
2184 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2185 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2186 MVT::v32i16);
2188 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2189 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2190 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2192 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2193 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2196 if (Subtarget.hasVLX()) {
2197 setGroup(MVT::v8f16);
2198 setGroup(MVT::v16f16);
2200 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2201 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2202 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2203 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2204 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2205 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2206 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2207 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2208 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2209 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2211 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2212 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2213 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2214 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2215 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2216 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2217 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2218 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2219 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2220 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2222 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2223 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2224 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2226 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2227 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2228 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2235 // Need to custom widen these to prevent scalarization.
2236 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2237 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2241 if (!Subtarget.useSoftFloat() &&
2242 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2243 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2244 : &X86::VR128RegClass);
2245 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2246 : &X86::VR256RegClass);
2247 // We set the type action of bf16 to TypeSoftPromoteHalf, but soft promotion
2248 // does not cover BUILD_VECTOR and INSERT_VECTOR_ELT, so mark them Custom
2249 // and perform the promotion in the custom lowering instead.
2250 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2251 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
2252 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2253 setF16Action(VT, Expand);
2254 setOperationAction(ISD::FADD, VT, Expand);
2255 setOperationAction(ISD::FSUB, VT, Expand);
2256 setOperationAction(ISD::FMUL, VT, Expand);
2257 setOperationAction(ISD::FDIV, VT, Expand);
2258 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2259 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2261 setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
2262 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2265 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2266 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2267 setF16Action(MVT::v32bf16, Expand);
2268 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2269 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2270 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2271 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2272 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2273 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2274 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
2277 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2278 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2279 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2280 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2281 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2282 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2284 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2285 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2286 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2287 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2288 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2290 if (Subtarget.hasBWI()) {
2291 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2292 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2295 if (Subtarget.hasFP16()) {
2296 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2297 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2298 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2299 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2300 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2301 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2302 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2303 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2305 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2306 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2307 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2308 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2309 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2310 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2311 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2312 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2313 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2314 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2315 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2316 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2317 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2318 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2319 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2320 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2321 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2322 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2323 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2327 if (Subtarget.hasAMXTILE()) {
2328 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2331 // We want to custom lower some of our intrinsics.
2332 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2333 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2334 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2335 if (!Subtarget.is64Bit()) {
2336 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2339 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2340 // handle type legalization for these operations here.
2342 // FIXME: We really should do custom legalization for addition and
2343 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2344 // than generic legalization for 64-bit multiplication-with-overflow, though.
2345 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2346 if (VT == MVT::i64 && !Subtarget.is64Bit())
2347 continue;
2348 // Add/Sub/Mul with overflow operations are custom lowered.
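// Roughly, an i32 SADDO becomes an ADD that also defines EFLAGS, with the
// overflow bit read back via SETO, rather than the generic expansion.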
2349 setOperationAction(ISD::SADDO, VT, Custom);
2350 setOperationAction(ISD::UADDO, VT, Custom);
2351 setOperationAction(ISD::SSUBO, VT, Custom);
2352 setOperationAction(ISD::USUBO, VT, Custom);
2353 setOperationAction(ISD::SMULO, VT, Custom);
2354 setOperationAction(ISD::UMULO, VT, Custom);
2356 // Support carry-in as a value rather than glue.
2357 setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2358 setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2359 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2360 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2361 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2364 if (!Subtarget.is64Bit()) {
2366 // These libcalls are not available in 32-bit mode.
2366 setLibcallName(RTLIB::SHL_I128, nullptr);
2367 setLibcallName(RTLIB::SRL_I128, nullptr);
2368 setLibcallName(RTLIB::SRA_I128, nullptr);
2369 setLibcallName(RTLIB::MUL_I128, nullptr);
2370 // The MULO libcall is not part of libgcc, only compiler-rt.
2371 setLibcallName(RTLIB::MULO_I64, nullptr);
2373 // The MULO libcall is not part of libgcc, only compiler-rt.
2374 setLibcallName(RTLIB::MULO_I128, nullptr);
2376 // Combine sin / cos into _sincos_stret if it is available.
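// For example, on Darwin separate calls to sinf(x) and cosf(x) with the same
// argument can be merged into one __sincosf_stret call returning both values.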
2377 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2378 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2379 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2380 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2383 if (Subtarget.isTargetWin64()) {
2384 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2385 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2386 setOperationAction(ISD::SREM, MVT::i128, Custom);
2387 setOperationAction(ISD::UREM, MVT::i128, Custom);
2388 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2389 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2390 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2391 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2392 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2393 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2394 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2395 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2398 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2399 // is. We should promote the value to 64 bits to solve this.
2400 // This is what the CRT headers do - `fmodf` is an inline header
2401 // function casting to f64 and calling `fmod`.
2402 if (Subtarget.is32Bit() &&
2403 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2404 for (ISD::NodeType Op :
2405 {ISD::FCEIL, ISD::STRICT_FCEIL,
2406 ISD::FCOS, ISD::STRICT_FCOS,
2407 ISD::FEXP, ISD::STRICT_FEXP,
2408 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2409 ISD::FREM, ISD::STRICT_FREM,
2410 ISD::FLOG, ISD::STRICT_FLOG,
2411 ISD::FLOG10, ISD::STRICT_FLOG10,
2412 ISD::FPOW, ISD::STRICT_FPOW,
2413 ISD::FSIN, ISD::STRICT_FSIN})
2414 if (isOperationExpand(Op, MVT::f32))
2415 setOperationAction(Op, MVT::f32, Promote);
2417 // We have target-specific dag combine patterns for the following nodes:
2418 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2419 ISD::SCALAR_TO_VECTOR,
2420 ISD::INSERT_VECTOR_ELT,
2421 ISD::EXTRACT_VECTOR_ELT,
2422 ISD::CONCAT_VECTORS,
2423 ISD::INSERT_SUBVECTOR,
2424 ISD::EXTRACT_SUBVECTOR,
2425 ISD::BITCAST,
2426 ISD::VSELECT,
2427 ISD::SELECT,
2428 ISD::SHL,
2429 ISD::SRA,
2430 ISD::SRL,
2431 ISD::OR,
2432 ISD::AND,
2433 ISD::ADD,
2434 ISD::FADD,
2435 ISD::FSUB,
2436 ISD::FNEG,
2437 ISD::FMA,
2438 ISD::STRICT_FMA,
2439 ISD::FMINNUM,
2440 ISD::FMAXNUM,
2441 ISD::SUB,
2442 ISD::LOAD,
2443 ISD::MLOAD,
2444 ISD::STORE,
2445 ISD::MSTORE,
2446 ISD::TRUNCATE,
2447 ISD::ZERO_EXTEND,
2448 ISD::ANY_EXTEND,
2449 ISD::SIGN_EXTEND,
2450 ISD::SIGN_EXTEND_INREG,
2451 ISD::ANY_EXTEND_VECTOR_INREG,
2452 ISD::SIGN_EXTEND_VECTOR_INREG,
2453 ISD::ZERO_EXTEND_VECTOR_INREG,
2454 ISD::SINT_TO_FP,
2455 ISD::UINT_TO_FP,
2456 ISD::STRICT_SINT_TO_FP,
2457 ISD::STRICT_UINT_TO_FP,
2458 ISD::SETCC,
2459 ISD::MUL,
2460 ISD::XOR,
2461 ISD::MSCATTER,
2462 ISD::MGATHER,
2463 ISD::FP16_TO_FP,
2464 ISD::FP_EXTEND,
2465 ISD::STRICT_FP_EXTEND,
2466 ISD::FP_ROUND,
2467 ISD::STRICT_FP_ROUND});
2469 computeRegisterProperties(Subtarget.getRegisterInfo());
2471 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2472 MaxStoresPerMemsetOptSize = 8;
2473 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2474 MaxStoresPerMemcpyOptSize = 4;
2475 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2476 MaxStoresPerMemmoveOptSize = 4;
2478 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2479 // that needs to be benchmarked and balanced with the potential use of vector
2480 // load/store types (PR33329, PR33914).
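// With a limit of 2 loads, an equality check like memcmp(a, b, 16) == 0 can
// still be expanded inline into two 8-byte load pairs combined with XOR/OR,
// rather than a libcall.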
2481 MaxLoadsPerMemcmp = 2;
2482 MaxLoadsPerMemcmpOptSize = 2;
2484 // Default loop alignment, which can be overridden by -align-loops.
2485 setPrefLoopAlignment(Align(16));
2487 // An out-of-order CPU can speculatively execute past a predictable branch,
2488 // but a conditional move could be stalled by an expensive earlier operation.
2489 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2490 EnableExtLdPromotion = true;
2491 setPrefFunctionAlignment(Align(16));
2493 verifyIntrinsicTables();
2495 // Default to having -disable-strictnode-mutation on
2496 IsStrictFPEnabled = true;
2499 // This has so far only been implemented for 64-bit MachO.
2500 bool X86TargetLowering::useLoadStackGuardNode() const {
2501 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2504 bool X86TargetLowering::useStackGuardXorFP() const {
2505 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2506 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2509 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2510 const SDLoc &DL) const {
2511 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2512 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2513 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2514 return SDValue(Node, 0);
2517 TargetLoweringBase::LegalizeTypeAction
2518 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2519 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2520 !Subtarget.hasBWI())
2521 return TypeSplitVector;
2523 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2524 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2525 return TypeSplitVector;
2527 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2528 VT.getVectorElementType() != MVT::i1)
2529 return TypeWidenVector;
2531 return TargetLoweringBase::getPreferredVectorAction(VT);
2534 FastISel *
2535 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2536 const TargetLibraryInfo *libInfo) const {
2537 return X86::createFastISel(funcInfo, libInfo);
2540 //===----------------------------------------------------------------------===//
2541 // Other Lowering Hooks
2542 //===----------------------------------------------------------------------===//
2544 bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
2545 bool AssumeSingleUse) {
2546 if (!AssumeSingleUse && !Op.hasOneUse())
2547 return false;
2548 if (!ISD::isNormalLoad(Op.getNode()))
2549 return false;
2551 // If this is an unaligned vector, make sure the target supports folding it.
2552 auto *Ld = cast<LoadSDNode>(Op.getNode());
2553 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2554 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2555 return false;
2557 // TODO: If this is a non-temporal load and the target has an instruction
2558 // for it, it should not be folded. See "useNonTemporalLoad()".
2560 return true;
2563 bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
2564 const X86Subtarget &Subtarget,
2565 bool AssumeSingleUse) {
2566 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2567 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2568 return false;
2570 // We cannot replace a wide volatile load with a broadcast-from-memory,
2571 // because that would narrow the load, which isn't legal for volatiles.
2572 auto *Ld = cast<LoadSDNode>(Op.getNode());
2573 return !Ld->isVolatile() ||
2574 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2577 bool X86::mayFoldIntoStore(SDValue Op) {
2578 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2581 bool X86::mayFoldIntoZeroExtend(SDValue Op) {
2582 if (Op.hasOneUse()) {
2583 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2584 return (ISD::ZERO_EXTEND == Opcode);
2586 return false;
2589 static bool isTargetShuffle(unsigned Opcode) {
2590 switch(Opcode) {
2591 default: return false;
2592 case X86ISD::BLENDI:
2593 case X86ISD::PSHUFB:
2594 case X86ISD::PSHUFD:
2595 case X86ISD::PSHUFHW:
2596 case X86ISD::PSHUFLW:
2597 case X86ISD::SHUFP:
2598 case X86ISD::INSERTPS:
2599 case X86ISD::EXTRQI:
2600 case X86ISD::INSERTQI:
2601 case X86ISD::VALIGN:
2602 case X86ISD::PALIGNR:
2603 case X86ISD::VSHLDQ:
2604 case X86ISD::VSRLDQ:
2605 case X86ISD::MOVLHPS:
2606 case X86ISD::MOVHLPS:
2607 case X86ISD::MOVSHDUP:
2608 case X86ISD::MOVSLDUP:
2609 case X86ISD::MOVDDUP:
2610 case X86ISD::MOVSS:
2611 case X86ISD::MOVSD:
2612 case X86ISD::MOVSH:
2613 case X86ISD::UNPCKL:
2614 case X86ISD::UNPCKH:
2615 case X86ISD::VBROADCAST:
2616 case X86ISD::VPERMILPI:
2617 case X86ISD::VPERMILPV:
2618 case X86ISD::VPERM2X128:
2619 case X86ISD::SHUF128:
2620 case X86ISD::VPERMIL2:
2621 case X86ISD::VPERMI:
2622 case X86ISD::VPPERM:
2623 case X86ISD::VPERMV:
2624 case X86ISD::VPERMV3:
2625 case X86ISD::VZEXT_MOVL:
2626 return true;
2630 static bool isTargetShuffleVariableMask(unsigned Opcode) {
2631 switch (Opcode) {
2632 default: return false;
2633 // Target Shuffles.
2634 case X86ISD::PSHUFB:
2635 case X86ISD::VPERMILPV:
2636 case X86ISD::VPERMIL2:
2637 case X86ISD::VPPERM:
2638 case X86ISD::VPERMV:
2639 case X86ISD::VPERMV3:
2640 return true;
2641 // 'Faux' Target Shuffles.
2642 case ISD::OR:
2643 case ISD::AND:
2644 case X86ISD::ANDNP:
2645 return true;
2649 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2650 MachineFunction &MF = DAG.getMachineFunction();
2651 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2652 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2653 int ReturnAddrIndex = FuncInfo->getRAIndex();
2655 if (ReturnAddrIndex == 0) {
2656 // Set up a frame object for the return address.
2657 unsigned SlotSize = RegInfo->getSlotSize();
2658 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2659 -(int64_t)SlotSize,
2660 false);
2661 FuncInfo->setRAIndex(ReturnAddrIndex);
2664 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2667 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2668 bool hasSymbolicDisplacement) {
2669 // Offset should fit into 32 bit immediate field.
2670 if (!isInt<32>(Offset))
2671 return false;
2673 // If we don't have a symbolic displacement, we don't have any extra
2674 // restrictions.
2675 if (!hasSymbolicDisplacement)
2676 return true;
2678 // FIXME: Some tweaks might be needed for medium code model.
2679 if (M != CodeModel::Small && M != CodeModel::Kernel)
2680 return false;
2682 // For the small code model we assume that the last object ends 16MB before the
2683 // 31-bit boundary. We may also accept fairly large negative constants, knowing
2684 // that all objects are in the positive half of the address space.
2685 if (M == CodeModel::Small && Offset < 16*1024*1024)
2686 return true;
2688 // For the kernel code model we know that all objects reside in the negative half
2689 // of the 32-bit address space, so we must not accept negative offsets (they may
2690 // be just out of range), but we may accept fairly large positive ones.
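// For example, with the kernel code model an offset of 16*1024*1024 is accepted
// (positive and fits in 32 bits), while an offset of -8 is rejected below.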
2691 if (M == CodeModel::Kernel && Offset >= 0)
2692 return true;
2694 return false;
2697 /// Return true if the condition is a signed comparison operation.
2698 static bool isX86CCSigned(unsigned X86CC) {
2699 switch (X86CC) {
2700 default:
2701 llvm_unreachable("Invalid integer condition!");
2702 case X86::COND_E:
2703 case X86::COND_NE:
2704 case X86::COND_B:
2705 case X86::COND_A:
2706 case X86::COND_BE:
2707 case X86::COND_AE:
2708 return false;
2709 case X86::COND_G:
2710 case X86::COND_GE:
2711 case X86::COND_L:
2712 case X86::COND_LE:
2713 return true;
2717 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
2718 switch (SetCCOpcode) {
2719 default: llvm_unreachable("Invalid integer condition!");
2720 case ISD::SETEQ: return X86::COND_E;
2721 case ISD::SETGT: return X86::COND_G;
2722 case ISD::SETGE: return X86::COND_GE;
2723 case ISD::SETLT: return X86::COND_L;
2724 case ISD::SETLE: return X86::COND_LE;
2725 case ISD::SETNE: return X86::COND_NE;
2726 case ISD::SETULT: return X86::COND_B;
2727 case ISD::SETUGT: return X86::COND_A;
2728 case ISD::SETULE: return X86::COND_BE;
2729 case ISD::SETUGE: return X86::COND_AE;
2733 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2734 /// condition code, returning the condition code and the LHS/RHS of the
2735 /// comparison to make.
2736 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
2737 bool isFP, SDValue &LHS, SDValue &RHS,
2738 SelectionDAG &DAG) {
2739 if (!isFP) {
2740 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2741 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2742 // X > -1 -> X == 0, jump !sign.
2743 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2744 return X86::COND_NS;
2746 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2747 // X < 0 -> X == 0, jump on sign.
2748 return X86::COND_S;
2750 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2751 // X >= 0 -> X == 0, jump on !sign.
2752 return X86::COND_NS;
2754 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2755 // X < 1 -> X <= 0
2756 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2757 return X86::COND_LE;
2761 return TranslateIntegerX86CC(SetCCOpcode);
2764 // First determine if it is required or is profitable to flip the operands.
2766 // If LHS is a foldable load, but RHS is not, flip the condition.
2767 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2768 !ISD::isNON_EXTLoad(RHS.getNode())) {
2769 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2770 std::swap(LHS, RHS);
2773 switch (SetCCOpcode) {
2774 default: break;
2775 case ISD::SETOLT:
2776 case ISD::SETOLE:
2777 case ISD::SETUGT:
2778 case ISD::SETUGE:
2779 std::swap(LHS, RHS);
2780 break;
2783 // On a floating point condition, the flags are set as follows:
2784 // ZF PF CF op
2785 // 0 | 0 | 0 | X > Y
2786 // 0 | 0 | 1 | X < Y
2787 // 1 | 0 | 0 | X == Y
2788 // 1 | 1 | 1 | unordered
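// For example, ISD::SETOLT had its operands swapped above, so it is handled as
// an ordered greater-than compare and maps to X86::COND_A (ZF == 0 && CF == 0).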
2789 switch (SetCCOpcode) {
2790 default: llvm_unreachable("Condcode should be pre-legalized away");
2791 case ISD::SETUEQ:
2792 case ISD::SETEQ: return X86::COND_E;
2793 case ISD::SETOLT: // flipped
2794 case ISD::SETOGT:
2795 case ISD::SETGT: return X86::COND_A;
2796 case ISD::SETOLE: // flipped
2797 case ISD::SETOGE:
2798 case ISD::SETGE: return X86::COND_AE;
2799 case ISD::SETUGT: // flipped
2800 case ISD::SETULT:
2801 case ISD::SETLT: return X86::COND_B;
2802 case ISD::SETUGE: // flipped
2803 case ISD::SETULE:
2804 case ISD::SETLE: return X86::COND_BE;
2805 case ISD::SETONE:
2806 case ISD::SETNE: return X86::COND_NE;
2807 case ISD::SETUO: return X86::COND_P;
2808 case ISD::SETO: return X86::COND_NP;
2809 case ISD::SETOEQ:
2810 case ISD::SETUNE: return X86::COND_INVALID;
2814 /// Is there a floating point cmov for the specific X86 condition code?
2815 /// The current x86 ISA includes the following FP cmov instructions:
2816 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2817 static bool hasFPCMov(unsigned X86CC) {
2818 switch (X86CC) {
2819 default:
2820 return false;
2821 case X86::COND_B:
2822 case X86::COND_BE:
2823 case X86::COND_E:
2824 case X86::COND_P:
2825 case X86::COND_A:
2826 case X86::COND_AE:
2827 case X86::COND_NE:
2828 case X86::COND_NP:
2829 return true;
2833 static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2834 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2835 VT.is512BitVector();
2838 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
2839 const CallInst &I,
2840 MachineFunction &MF,
2841 unsigned Intrinsic) const {
2842 Info.flags = MachineMemOperand::MONone;
2843 Info.offset = 0;
2845 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2846 if (!IntrData) {
2847 switch (Intrinsic) {
2848 case Intrinsic::x86_aesenc128kl:
2849 case Intrinsic::x86_aesdec128kl:
2850 Info.opc = ISD::INTRINSIC_W_CHAIN;
2851 Info.ptrVal = I.getArgOperand(1);
2852 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2853 Info.align = Align(1);
2854 Info.flags |= MachineMemOperand::MOLoad;
2855 return true;
2856 case Intrinsic::x86_aesenc256kl:
2857 case Intrinsic::x86_aesdec256kl:
2858 Info.opc = ISD::INTRINSIC_W_CHAIN;
2859 Info.ptrVal = I.getArgOperand(1);
2860 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2861 Info.align = Align(1);
2862 Info.flags |= MachineMemOperand::MOLoad;
2863 return true;
2864 case Intrinsic::x86_aesencwide128kl:
2865 case Intrinsic::x86_aesdecwide128kl:
2866 Info.opc = ISD::INTRINSIC_W_CHAIN;
2867 Info.ptrVal = I.getArgOperand(0);
2868 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2869 Info.align = Align(1);
2870 Info.flags |= MachineMemOperand::MOLoad;
2871 return true;
2872 case Intrinsic::x86_aesencwide256kl:
2873 case Intrinsic::x86_aesdecwide256kl:
2874 Info.opc = ISD::INTRINSIC_W_CHAIN;
2875 Info.ptrVal = I.getArgOperand(0);
2876 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2877 Info.align = Align(1);
2878 Info.flags |= MachineMemOperand::MOLoad;
2879 return true;
2880 case Intrinsic::x86_cmpccxadd32:
2881 case Intrinsic::x86_cmpccxadd64:
2882 case Intrinsic::x86_atomic_bts:
2883 case Intrinsic::x86_atomic_btc:
2884 case Intrinsic::x86_atomic_btr: {
2885 Info.opc = ISD::INTRINSIC_W_CHAIN;
2886 Info.ptrVal = I.getArgOperand(0);
2887 unsigned Size = I.getType()->getScalarSizeInBits();
2888 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2889 Info.align = Align(Size);
2890 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2891 MachineMemOperand::MOVolatile;
2892 return true;
2894 case Intrinsic::x86_atomic_bts_rm:
2895 case Intrinsic::x86_atomic_btc_rm:
2896 case Intrinsic::x86_atomic_btr_rm: {
2897 Info.opc = ISD::INTRINSIC_W_CHAIN;
2898 Info.ptrVal = I.getArgOperand(0);
2899 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2900 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2901 Info.align = Align(Size);
2902 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2903 MachineMemOperand::MOVolatile;
2904 return true;
2906 case Intrinsic::x86_aadd32:
2907 case Intrinsic::x86_aadd64:
2908 case Intrinsic::x86_aand32:
2909 case Intrinsic::x86_aand64:
2910 case Intrinsic::x86_aor32:
2911 case Intrinsic::x86_aor64:
2912 case Intrinsic::x86_axor32:
2913 case Intrinsic::x86_axor64:
2914 case Intrinsic::x86_atomic_add_cc:
2915 case Intrinsic::x86_atomic_sub_cc:
2916 case Intrinsic::x86_atomic_or_cc:
2917 case Intrinsic::x86_atomic_and_cc:
2918 case Intrinsic::x86_atomic_xor_cc: {
2919 Info.opc = ISD::INTRINSIC_W_CHAIN;
2920 Info.ptrVal = I.getArgOperand(0);
2921 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2922 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2923 Info.align = Align(Size);
2924 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2925 MachineMemOperand::MOVolatile;
2926 return true;
2929 return false;
2932 switch (IntrData->Type) {
2933 case TRUNCATE_TO_MEM_VI8:
2934 case TRUNCATE_TO_MEM_VI16:
2935 case TRUNCATE_TO_MEM_VI32: {
2936 Info.opc = ISD::INTRINSIC_VOID;
2937 Info.ptrVal = I.getArgOperand(0);
2938 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
2939 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
2940 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
2941 ScalarVT = MVT::i8;
2942 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
2943 ScalarVT = MVT::i16;
2944 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
2945 ScalarVT = MVT::i32;
2947 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
2948 Info.align = Align(1);
2949 Info.flags |= MachineMemOperand::MOStore;
2950 break;
2952 case GATHER:
2953 case GATHER_AVX2: {
2954 Info.opc = ISD::INTRINSIC_W_CHAIN;
2955 Info.ptrVal = nullptr;
2956 MVT DataVT = MVT::getVT(I.getType());
2957 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
2958 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
2959 IndexVT.getVectorNumElements());
2960 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
2961 Info.align = Align(1);
2962 Info.flags |= MachineMemOperand::MOLoad;
2963 break;
2965 case SCATTER: {
2966 Info.opc = ISD::INTRINSIC_VOID;
2967 Info.ptrVal = nullptr;
2968 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
2969 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
2970 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
2971 IndexVT.getVectorNumElements());
2972 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
2973 Info.align = Align(1);
2974 Info.flags |= MachineMemOperand::MOStore;
2975 break;
2977 default:
2978 return false;
2981 return true;
2984 /// Returns true if the target can instruction select the
2985 /// specified FP immediate natively. If false, the legalizer will
2986 /// materialize the FP immediate as a load from a constant pool.
2987 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
2988 bool ForCodeSize) const {
2989 for (const APFloat &FPImm : LegalFPImmediates)
2990 if (Imm.bitwiseIsEqual(FPImm))
2991 return true;
2992 return false;
2995 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
2996 ISD::LoadExtType ExtTy,
2997 EVT NewVT) const {
2998 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3000 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3001 // relocations target a movq or addq instruction: don't let the load shrink.
3002 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3003 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3004 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3005 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3007 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3008 // those uses are extracted directly into a store, then the extract + store
3009 // can be store-folded. Therefore, it's probably not worth splitting the load.
3010 EVT VT = Load->getValueType(0);
3011 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3012 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3013 // Skip uses of the chain value. Result 0 of the node is the load value.
3014 if (UI.getUse().getResNo() != 0)
3015 continue;
3017 // If this use is not an extract + store, it's probably worth splitting.
3018 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3019 UI->use_begin()->getOpcode() != ISD::STORE)
3020 return true;
3022 // All non-chain uses are extract + store.
3023 return false;
3026 return true;
3029 /// Returns true if it is beneficial to convert a load of a constant
3030 /// to just the constant itself.
3031 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3032 Type *Ty) const {
3033 assert(Ty->isIntegerTy());
3035 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3036 if (BitSize == 0 || BitSize > 64)
3037 return false;
3038 return true;
3041 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
3042 // If we are using XMM registers in the ABI and the condition of the select is
3043 // a floating-point compare and we have blendv or conditional move, then it is
3044 // cheaper to select instead of doing a cross-register move and creating a
3045 // load that depends on the compare result.
3046 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3047 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3050 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
3051 // TODO: It might be a win to ease or lift this restriction, but the generic
3052 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3053 if (VT.isVector() && Subtarget.hasAVX512())
3054 return false;
3056 return true;
3059 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
3060 SDValue C) const {
3061 // TODO: We handle scalars using custom code, but generic combining could make
3062 // that unnecessary.
3063 APInt MulC;
3064 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3065 return false;
3067 // Find the type this will be legalized to. Otherwise we might prematurely
3068 // convert this to shl+add/sub and then still have to type legalize those ops.
3069 // Another choice would be to defer the decision for illegal types until
3070 // after type legalization. But constant splat vectors of i64 can't make it
3071 // through type legalization on 32-bit targets so we would need to special
3072 // case vXi64.
3073 while (getTypeAction(Context, VT) != TypeLegal)
3074 VT = getTypeToTransformTo(Context, VT);
3076 // If vector multiply is legal, assume that's faster than shl + add/sub.
3077 // Multiply is a complex op with higher latency and lower throughput in
3078 // most implementations: sub-vXi32 vector multiplies are always fast,
3079 // vXi32 must not have a slow PMULLD implementation, and anything larger (vXi64)
3080 // is always going to be slow.
3081 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3082 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3083 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3084 return false;
3086 // shl+add, shl+sub, shl+add+neg
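// For example, MulC values of 9 (8 + 1), 7 (8 - 1), -7 (1 - 8) and -9 (-(8 + 1))
// all qualify.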
3087 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3088 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3091 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
3092 unsigned Index) const {
3093 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3094 return false;
3096 // Mask vectors support all subregister combinations and operations that
3097 // extract half of the vector.
3098 if (ResVT.getVectorElementType() == MVT::i1)
3099 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3100 (Index == ResVT.getVectorNumElements()));
3102 return (Index % ResVT.getVectorNumElements()) == 0;
3105 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
3106 unsigned Opc = VecOp.getOpcode();
3108 // Assume target opcodes can't be scalarized.
3109 // TODO - do we have any exceptions?
3110 if (Opc >= ISD::BUILTIN_OP_END)
3111 return false;
3113 // If the vector op is not supported, try to convert to scalar.
3114 EVT VecVT = VecOp.getValueType();
3115 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3116 return true;
3118 // If the vector op is supported, but the scalar op is not, the transform may
3119 // not be worthwhile.
3120 EVT ScalarVT = VecVT.getScalarType();
3121 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3124 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
3125 bool) const {
3126 // TODO: Allow vectors?
3127 if (VT.isVector())
3128 return false;
3129 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3132 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
3133 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3134 return Subtarget.hasBMI() ||
3135 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3138 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
3139 // Speculate ctlz only if we can directly use LZCNT.
3140 return Subtarget.hasLZCNT();
3143 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
3144 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3145 // expensive than a straight movsd. On the other hand, it's important to
3146 // shrink long double FP constants since fldt is very slow.
3147 return !Subtarget.hasSSE2() || VT == MVT::f80;
3150 bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
3151 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3152 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3155 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
3156 const SelectionDAG &DAG,
3157 const MachineMemOperand &MMO) const {
3158 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3159 BitcastVT.getVectorElementType() == MVT::i1)
3160 return false;
3162 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3163 return false;
3165 // If both types are legal vectors, it's always ok to convert them.
3166 if (LoadVT.isVector() && BitcastVT.isVector() &&
3167 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3168 return true;
3170 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3173 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
3174 const MachineFunction &MF) const {
3175 // Do not merge to float value size (128 bits) if no implicit
3176 // float attribute is set.
3177 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3179 if (NoFloat) {
3180 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3181 return (MemVT.getSizeInBits() <= MaxIntSize);
3183 // Make sure we don't merge greater than our preferred vector
3184 // width.
3185 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3186 return false;
3188 return true;
3191 bool X86TargetLowering::isCtlzFast() const {
3192 return Subtarget.hasFastLZCNT();
3195 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
3196 const Instruction &AndI) const {
3197 return true;
3200 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
3201 EVT VT = Y.getValueType();
3203 if (VT.isVector())
3204 return false;
3206 if (!Subtarget.hasBMI())
3207 return false;
3209 // There are only 32-bit and 64-bit forms for 'andn'.
3210 if (VT != MVT::i32 && VT != MVT::i64)
3211 return false;
3213 return !isa<ConstantSDNode>(Y);
3216 bool X86TargetLowering::hasAndNot(SDValue Y) const {
3217 EVT VT = Y.getValueType();
3219 if (!VT.isVector())
3220 return hasAndNotCompare(Y);
3222 // Vector.
3224 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3225 return false;
3227 if (VT == MVT::v4i32)
3228 return true;
3230 return Subtarget.hasSSE2();
3233 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
3234 return X.getValueType().isScalarInteger(); // 'bt'
3237 bool X86TargetLowering::
3238 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3239 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
3240 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3241 SelectionDAG &DAG) const {
3242 // Does the baseline recommend not performing the fold by default?
3243 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3244 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3245 return false;
3246 // For scalars this transform is always beneficial.
3247 if (X.getValueType().isScalarInteger())
3248 return true;
3249 // If all the shift amounts are identical, then the transform is beneficial even
3250 // with rudimentary SSE2 shifts.
3251 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3252 return true;
3253 // If we have AVX2 with its powerful shift operations, then it's also good.
3254 if (Subtarget.hasAVX2())
3255 return true;
3256 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3257 return NewShiftOpcode == ISD::SHL;
3260 bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
3261 return N->getOpcode() != ISD::FP_EXTEND;
3264 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
3265 const SDNode *N, CombineLevel Level) const {
3266 assert(((N->getOpcode() == ISD::SHL &&
3267 N->getOperand(0).getOpcode() == ISD::SRL) ||
3268 (N->getOpcode() == ISD::SRL &&
3269 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3270 "Expected shift-shift mask");
3271 // TODO: Should we always create i64 masks? Or only folded immediates?
3272 EVT VT = N->getValueType(0);
3273 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3274 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3275 // Only fold if the shift values are equal - so it folds to AND.
3276 // TODO - we should fold if either is a non-uniform vector but we don't do
3277 // the fold for non-splats yet.
3278 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3280 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
3283 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
3284 EVT VT = Y.getValueType();
3286 // For vectors, we don't have a preference, but we probably want a mask.
3287 if (VT.isVector())
3288 return false;
3290 // 64-bit shifts on 32-bit targets produce extremely bloated code.
3291 if (VT == MVT::i64 && !Subtarget.is64Bit())
3292 return false;
3294 return true;
3297 TargetLowering::ShiftLegalizationStrategy
3298 X86TargetLowering::preferredShiftLegalizationStrategy(
3299 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3300 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
3301 !Subtarget.isOSWindows())
3302 return ShiftLegalizationStrategy::LowerToLibcall;
3303 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
3304 ExpansionFactor);
3307 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
3308 // Any legal vector type can be splatted more efficiently than
3309 // loading/spilling from memory.
3310 return isTypeLegal(VT);
3313 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
3314 MVT VT = MVT::getIntegerVT(NumBits);
3315 if (isTypeLegal(VT))
3316 return VT;
3318 // PMOVMSKB can handle this.
3319 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3320 return MVT::v16i8;
3322 // VPMOVMSKB can handle this.
3323 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3324 return MVT::v32i8;
3326 // TODO: Allow 64-bit type for 32-bit target.
3327 // TODO: 512-bit types should be allowed, but make sure that those
3328 // cases are handled in combineVectorSizedSetCCEquality().
3330 return MVT::INVALID_SIMPLE_VALUE_TYPE;
3333 /// Val is the undef sentinel value or equal to the specified value.
3334 static bool isUndefOrEqual(int Val, int CmpVal) {
3335 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3338 /// Return true if every element in Mask is the undef sentinel value or equal to
3339 /// the specified value.
3340 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3341 return llvm::all_of(Mask, [CmpVal](int M) {
3342 return (M == SM_SentinelUndef) || (M == CmpVal);
3346 /// Return true if every element in Mask, beginning from position Pos and ending
3347 /// in Pos+Size is the undef sentinel value or equal to the specified value.
3348 static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3349 unsigned Size) {
3350 return llvm::all_of(Mask.slice(Pos, Size),
3351 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3354 /// Val is either the undef or zero sentinel value.
3355 static bool isUndefOrZero(int Val) {
3356 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3359 /// Return true if every element in Mask, beginning from position Pos and ending
3360 /// in Pos+Size is the undef sentinel value.
3361 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3362 return llvm::all_of(Mask.slice(Pos, Size),
3363 [](int M) { return M == SM_SentinelUndef; });
3366 /// Return true if the mask creates a vector whose lower half is undefined.
3367 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
3368 unsigned NumElts = Mask.size();
3369 return isUndefInRange(Mask, 0, NumElts / 2);
3372 /// Return true if the mask creates a vector whose upper half is undefined.
3373 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
3374 unsigned NumElts = Mask.size();
3375 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3378 /// Return true if Val falls within the specified range [Low, Hi).
3379 static bool isInRange(int Val, int Low, int Hi) {
3380 return (Val >= Low && Val < Hi);
3383 /// Return true if the value of any element in Mask falls within the specified
3384 /// range [Low, Hi).
3385 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3386 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3389 /// Return true if the value of any element in Mask is the zero sentinel value.
3390 static bool isAnyZero(ArrayRef<int> Mask) {
3391 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3394 /// Return true if the value of any element in Mask is the zero or undef
3395 /// sentinel values.
3396 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
3397 return llvm::any_of(Mask, [](int M) {
3398 return M == SM_SentinelZero || M == SM_SentinelUndef;
3402 /// Return true if Val is undef or if its value falls within the
3403 /// specified range [Low, Hi).
3404 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3405 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3408 /// Return true if every element in Mask is undef or if its value
3409 /// falls within the specified range [Low, Hi).
3410 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3411 return llvm::all_of(
3412 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3415 /// Return true if Val is undef, zero or if its value falls within the
3416 /// specified range [Low, Hi).
3417 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3418 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3421 /// Return true if every element in Mask is undef, zero or if its value
3422 /// falls within the specified range [Low, Hi).
3423 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3424 return llvm::all_of(
3425 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3428 /// Return true if every element in Mask, beginning
3429 /// from position Pos and ending in Pos + Size, falls within the specified
3430 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
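/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4 returns
/// true, since the undef element (-1) matches any value.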
3431 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3432 unsigned Size, int Low, int Step = 1) {
3433 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3434 if (!isUndefOrEqual(Mask[i], Low))
3435 return false;
3436 return true;
3439 /// Return true if every element in Mask, beginning
3440 /// from position Pos and ending in Pos+Size, falls within the specified
3441 /// sequential range [Low, Low + Size), or is undef or is zero.
3442 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3443 unsigned Size, int Low,
3444 int Step = 1) {
3445 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3446 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3447 return false;
3448 return true;
3451 /// Return true if every element in Mask, beginning
3452 /// from position Pos and ending in Pos+Size is undef or is zero.
3453 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3454 unsigned Size) {
3455 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3458 /// Helper function to test whether a shuffle mask could be
3459 /// simplified by widening the elements being shuffled.
3461 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3462 /// leaves it in an unspecified state.
3464 /// NOTE: This must handle normal vector shuffle masks and *target* vector
3465 /// shuffle masks. The latter have the special property of a '-2' representing
3466 /// a zero-ed lane of a vector.
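/// For example, the mask <0, 1, 6, 7> widens to <0, 3>, while <1, 2, 4, 5>
/// cannot be widened because <1, 2> is not an aligned pair.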
3467 static bool canWidenShuffleElements(ArrayRef<int> Mask,
3468 SmallVectorImpl<int> &WidenedMask) {
3469 WidenedMask.assign(Mask.size() / 2, 0);
3470 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3471 int M0 = Mask[i];
3472 int M1 = Mask[i + 1];
3474 // If both elements are undef, it's trivial.
3475 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3476 WidenedMask[i / 2] = SM_SentinelUndef;
3477 continue;
3480 // Check for an undef mask and a mask value properly aligned to fit with
3481 // a pair of values. If we find such a case, use the non-undef mask's value.
3482 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3483 WidenedMask[i / 2] = M1 / 2;
3484 continue;
3486 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3487 WidenedMask[i / 2] = M0 / 2;
3488 continue;
3491 // When zeroing, we need to spread the zeroing across both lanes to widen.
3492 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3493 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3494 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
3495 WidenedMask[i / 2] = SM_SentinelZero;
3496 continue;
3498 return false;
3501 // Finally check if the two mask values are adjacent and aligned with
3502 // a pair.
3503 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3504 WidenedMask[i / 2] = M0 / 2;
3505 continue;
3508 // Otherwise we can't safely widen the elements used in this shuffle.
3509 return false;
3511 assert(WidenedMask.size() == Mask.size() / 2 &&
3512 "Incorrect size of mask after widening the elements!");
3514 return true;
3517 static bool canWidenShuffleElements(ArrayRef<int> Mask,
3518 const APInt &Zeroable,
3519 bool V2IsZero,
3520 SmallVectorImpl<int> &WidenedMask) {
3521 // Create an alternative mask with info about zeroable elements.
3522 // Here we do not set undef elements as zeroable.
3523 SmallVector<int, 64> ZeroableMask(Mask);
3524 if (V2IsZero) {
3525 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3526 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3527 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3528 ZeroableMask[i] = SM_SentinelZero;
3530 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3533 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3534 SmallVector<int, 32> WidenedMask;
3535 return canWidenShuffleElements(Mask, WidenedMask);
3538 // Attempt to narrow/widen shuffle mask until it matches the target number of
3539 // elements.
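// For example, the 4-element mask <0, 1, 2, 3> scaled to 8 elements becomes
// <0, 1, 2, 3, 4, 5, 6, 7>, and scaled to 2 elements it widens to <0, 1>.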
3540 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3541 SmallVectorImpl<int> &ScaledMask) {
3542 unsigned NumSrcElts = Mask.size();
3543 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3544 "Illegal shuffle scale factor");
3546 // Narrowing is guaranteed to work.
3547 if (NumDstElts >= NumSrcElts) {
3548 int Scale = NumDstElts / NumSrcElts;
3549 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3550 return true;
3553 // We have to repeat the widening until we reach the target size, but we can
3554 // split out the first widening as it sets up ScaledMask for us.
3555 if (canWidenShuffleElements(Mask, ScaledMask)) {
3556 while (ScaledMask.size() > NumDstElts) {
3557 SmallVector<int, 16> WidenedMask;
3558 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3559 return false;
3560 ScaledMask = std::move(WidenedMask);
3562 return true;
3565 return false;
3568 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
3569 bool X86::isZeroNode(SDValue Elt) {
3570 return isNullConstant(Elt) || isNullFPConstant(Elt);
3573 // Build a vector of constants.
3574 // Use an UNDEF node if MaskElt == -1.
3575 // Split 64-bit constants in 32-bit mode.
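// For example, on a 32-bit target (where i64 is not legal) a v2i64 constant is
// built as a v4i32 BUILD_VECTOR and bitcast back to v2i64.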
3576 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3577 const SDLoc &dl, bool IsMask = false) {
3579 SmallVector<SDValue, 32> Ops;
3580 bool Split = false;
3582 MVT ConstVecVT = VT;
3583 unsigned NumElts = VT.getVectorNumElements();
3584 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3585 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3586 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3587 Split = true;
3590 MVT EltVT = ConstVecVT.getVectorElementType();
3591 for (unsigned i = 0; i < NumElts; ++i) {
3592 bool IsUndef = Values[i] < 0 && IsMask;
3593 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3594 DAG.getConstant(Values[i], dl, EltVT);
3595 Ops.push_back(OpNode);
3596 if (Split)
3597 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3598 DAG.getConstant(0, dl, EltVT));
3600 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3601 if (Split)
3602 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3603 return ConstsNode;
3606 static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3607 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3608 assert(Bits.size() == Undefs.getBitWidth() &&
3609 "Unequal constant and undef arrays");
3610 SmallVector<SDValue, 32> Ops;
3611 bool Split = false;
3613 MVT ConstVecVT = VT;
3614 unsigned NumElts = VT.getVectorNumElements();
3615 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3616 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3617 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3618 Split = true;
3621 MVT EltVT = ConstVecVT.getVectorElementType();
3622 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3623 if (Undefs[i]) {
3624 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3625 continue;
3627 const APInt &V = Bits[i];
3628 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3629 if (Split) {
3630 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3631 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3632 } else if (EltVT == MVT::f32) {
3633 APFloat FV(APFloat::IEEEsingle(), V);
3634 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3635 } else if (EltVT == MVT::f64) {
3636 APFloat FV(APFloat::IEEEdouble(), V);
3637 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3638 } else {
3639 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3643 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3644 return DAG.getBitcast(VT, ConstsNode);
3647 static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
3648 SelectionDAG &DAG, const SDLoc &dl) {
3649 APInt Undefs = APInt::getZero(Bits.size());
3650 return getConstVector(Bits, Undefs, VT, DAG, dl);
3653 /// Returns a vector of specified type with all zero elements.
3654 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3655 SelectionDAG &DAG, const SDLoc &dl) {
3656 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3657 VT.getVectorElementType() == MVT::i1) &&
3658 "Unexpected vector type");
3660 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3661 // type. This ensures they get CSE'd. But if the integer type is not
3662 // available, use a floating-point +0.0 instead.
3663 SDValue Vec;
3664 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3665 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3666 } else if (VT.isFloatingPoint()) {
3667 Vec = DAG.getConstantFP(+0.0, dl, VT);
3668 } else if (VT.getVectorElementType() == MVT::i1) {
3669 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3670 "Unexpected vector type");
3671 Vec = DAG.getConstant(0, dl, VT);
3672 } else {
3673 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3674 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3676 return DAG.getBitcast(VT, Vec);
3679 // Helper to determine if the ops are all extracted subvectors that come from a
3680 // single source. If we allow commuting, they don't have to be in order (Lo/Hi).
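// For example, LHS = extract_subvector(X, 0) and RHS = extract_subvector(X, NumElts)
// (the upper half of X) return X.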
3681 static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3682 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3683 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3684 LHS.getValueType() != RHS.getValueType() ||
3685 LHS.getOperand(0) != RHS.getOperand(0))
3686 return SDValue();
3688 SDValue Src = LHS.getOperand(0);
3689 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3690 return SDValue();
3692 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3693 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3694 RHS.getConstantOperandAPInt(1) == NumElts) ||
3695 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3696 LHS.getConstantOperandAPInt(1) == NumElts))
3697 return Src;
3699 return SDValue();
3702 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3703 const SDLoc &dl, unsigned vectorWidth) {
3704 EVT VT = Vec.getValueType();
3705 EVT ElVT = VT.getVectorElementType();
3706 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3707 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3708 VT.getVectorNumElements() / Factor);
3710 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3711 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3712 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3714 // This is the index of the first element of the vectorWidth-bit chunk
3715 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3716 IdxVal &= ~(ElemsPerChunk - 1);
3718 // If the input is a buildvector just emit a smaller one.
3719 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3720 return DAG.getBuildVector(ResultVT, dl,
3721 Vec->ops().slice(IdxVal, ElemsPerChunk));
3723 // Check if we're extracting the upper undef of a widening pattern.
3724 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3725 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3726 isNullConstant(Vec.getOperand(2)))
3727 return DAG.getUNDEF(ResultVT);
3729 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3730 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3733 /// Generate a DAG to grab 128 bits from a vector > 128 bits. This
3734 /// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
3735 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3736 /// instruction or a simple subregister reference. Idx is an index in the
3737 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3738 /// lowering EXTRACT_VECTOR_ELT operations easier.
3739 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3740 SelectionDAG &DAG, const SDLoc &dl) {
3741 assert((Vec.getValueType().is256BitVector() ||
3742 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3743 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3746 /// Generate a DAG to grab 256-bits from a 512-bit vector.
3747 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3748 SelectionDAG &DAG, const SDLoc &dl) {
3749 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3750 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3753 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3754 SelectionDAG &DAG, const SDLoc &dl,
3755 unsigned vectorWidth) {
3756 assert((vectorWidth == 128 || vectorWidth == 256) &&
3757 "Unsupported vector width");
3758 // Inserting UNDEF just returns Result unchanged.
3759 if (Vec.isUndef())
3760 return Result;
3761 EVT VT = Vec.getValueType();
3762 EVT ElVT = VT.getVectorElementType();
3763 EVT ResultVT = Result.getValueType();
3765 // Insert the relevant vectorWidth bits.
3766 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3767 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3769 // This is the index of the first element of the vectorWidth-bit chunk
3770 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3771 IdxVal &= ~(ElemsPerChunk - 1);
3773 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3774 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3777 /// Generate a DAG to put 128 bits into a vector > 128 bits. This
3778 /// sets things up to match an AVX VINSERTF128/VINSERTI128 or
3779 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instruction or a
3780 /// simple superregister reference. Idx is an index in the 128 bits
3781 /// we want. It need not be aligned to a 128-bit boundary. That makes
3782 /// lowering INSERT_VECTOR_ELT operations easier.
3783 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3784 SelectionDAG &DAG, const SDLoc &dl) {
3785 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3786 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3789 /// Widen a vector to a larger size with the same scalar type, with the new
3790 /// elements either zero or undef.
3791 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3792 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3793 const SDLoc &dl) {
3794 assert(Vec.getValueSizeInBits().getFixedValue() <= VT.getFixedSizeInBits() &&
3795 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3796 "Unsupported vector widening type");
3797 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3798 : DAG.getUNDEF(VT);
3799 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3800 DAG.getIntPtrConstant(0, dl));
3803 /// Widen a vector to a larger size with the same scalar type, with the new
3804 /// elements either zero or undef.
3805 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3806 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3807 const SDLoc &dl, unsigned WideSizeInBits) {
3808 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3809 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3810 "Unsupported vector widening type");
3811 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3812 MVT SVT = Vec.getSimpleValueType().getScalarType();
3813 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3814 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3817 /// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3818 /// and bitcast with integer types.
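/// For example, v4i1 widens to v8i1 when DQI is available and to v16i1 otherwise;
/// v8i1 is only widened (to v16i1) when DQI is unavailable.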
3819 static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
3820 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
3821 unsigned NumElts = VT.getVectorNumElements();
3822 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
3823 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
3824 return VT;
3827 /// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
3828 /// bitcast with integer types.
3829 static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
3830 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3831 const SDLoc &dl) {
3832 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
3833 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3836 // Helper function to collect subvector ops that are concatenated together,
3837 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
3838 // The subvectors in Ops are guaranteed to be the same type.
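// For example, concat_vectors(x, y) and
// insert_subvector(insert_subvector(undef, x, 0), y, NumElts/2) both collect to {x, y}.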
3839 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
3840 SelectionDAG &DAG) {
3841 assert(Ops.empty() && "Expected an empty ops vector");
3843 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
3844 Ops.append(N->op_begin(), N->op_end());
3845 return true;
3848 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
3849 SDValue Src = N->getOperand(0);
3850 SDValue Sub = N->getOperand(1);
3851 const APInt &Idx = N->getConstantOperandAPInt(2);
3852 EVT VT = Src.getValueType();
3853 EVT SubVT = Sub.getValueType();
3855 // TODO - Handle more general insert_subvector chains.
3856 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
3857 // insert_subvector(undef, x, lo)
3858 if (Idx == 0 && Src.isUndef()) {
3859 Ops.push_back(Sub);
3860 Ops.push_back(DAG.getUNDEF(SubVT));
3861 return true;
3863 if (Idx == (VT.getVectorNumElements() / 2)) {
3864 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
3865 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
3866 Src.getOperand(1).getValueType() == SubVT &&
3867 isNullConstant(Src.getOperand(2))) {
3868 Ops.push_back(Src.getOperand(1));
3869 Ops.push_back(Sub);
3870 return true;
3872 // insert_subvector(x, extract_subvector(x, lo), hi)
3873 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3874 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
3875 Ops.append(2, Sub);
3876 return true;
3878 // insert_subvector(undef, x, hi)
3879 if (Src.isUndef()) {
3880 Ops.push_back(DAG.getUNDEF(SubVT));
3881 Ops.push_back(Sub);
3882 return true;
3888 return false;
3891 // Helper to check if \p V can be split into subvectors and the upper subvectors
3892 // are all undef, in which case return the lower half of \p V.
3893 static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
3894 SelectionDAG &DAG) {
3895 SmallVector<SDValue> SubOps;
3896 if (!collectConcatOps(V.getNode(), SubOps, DAG))
3897 return SDValue();
3899 unsigned NumSubOps = SubOps.size();
3900 unsigned HalfNumSubOps = NumSubOps / 2;
3901 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
3903 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
3904 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
3905 return SDValue();
3907 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
3908 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
3909 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
3912 // Helper to check if we can access all the constituent subvectors without any
3913 // extract ops.
3914 static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
3915 SmallVector<SDValue> Ops;
3916 return collectConcatOps(N, Ops, DAG);
3919 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
3920 const SDLoc &dl) {
3921 EVT VT = Op.getValueType();
3922 unsigned NumElems = VT.getVectorNumElements();
3923 unsigned SizeInBits = VT.getSizeInBits();
3924 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
3925 "Can't split odd sized vector");
3927 // If this is a splat value (with no undefs) then use the lower subvector,
3928 // which should be a free extraction.
3929 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
3930 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
3931 return std::make_pair(Lo, Lo);
3933 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
3934 return std::make_pair(Lo, Hi);
3937 /// Break an operation into 2 half sized ops and then concatenate the results.
3938 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
3939 unsigned NumOps = Op.getNumOperands();
3940 EVT VT = Op.getValueType();
3941 SDLoc dl(Op);
3943 // Split each vector operand into Lo/Hi halves; scalar operands are reused for both halves.
3944 SmallVector<SDValue> LoOps(NumOps, SDValue());
3945 SmallVector<SDValue> HiOps(NumOps, SDValue());
3946 for (unsigned I = 0; I != NumOps; ++I) {
3947 SDValue SrcOp = Op.getOperand(I);
3948 if (!SrcOp.getValueType().isVector()) {
3949 LoOps[I] = HiOps[I] = SrcOp;
3950 continue;
3952 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
3955 EVT LoVT, HiVT;
3956 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
3957 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
3958 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
3959 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
3962 /// Break a unary integer operation into 2 half sized ops and then
3963 /// concatenate the result back.
3964 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
3965 // Make sure we only try to split 256/512-bit types to avoid creating
3966 // narrow vectors.
3967 EVT VT = Op.getValueType();
3968 (void)VT;
3969 assert((Op.getOperand(0).getValueType().is256BitVector() ||
3970 Op.getOperand(0).getValueType().is512BitVector()) &&
3971 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
3972 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
3973 VT.getVectorNumElements() &&
3974 "Unexpected VTs!");
3975 return splitVectorOp(Op, DAG);
3978 /// Break a binary integer operation into 2 half sized ops and then
3979 /// concatenate the result back.
3980 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
3981 // Assert that all the types match.
3982 EVT VT = Op.getValueType();
3983 (void)VT;
3984 assert(Op.getOperand(0).getValueType() == VT &&
3985 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
3986 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
3987 return splitVectorOp(Op, DAG);
3990 // Helper for splitting the operands of an operation into legal target-sized parts
3991 // and applying a function on each part.
3992 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
3993 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
3994 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
3995 // The argument Builder is a function that will be applied on each split part:
3996 // SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
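// For example, with CheckBWI enabled on an AVX2-only target, a 512-bit op is split
// into two 256-bit parts, Builder is invoked on each, and the results are concatenated.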
3997 template <typename F>
3998 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
3999 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4000 F Builder, bool CheckBWI = true) {
4001 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4002 unsigned NumSubs = 1;
4003 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4004 (!CheckBWI && Subtarget.useAVX512Regs())) {
4005 if (VT.getSizeInBits() > 512) {
4006 NumSubs = VT.getSizeInBits() / 512;
4007 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4009 } else if (Subtarget.hasAVX2()) {
4010 if (VT.getSizeInBits() > 256) {
4011 NumSubs = VT.getSizeInBits() / 256;
4012 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4014 } else {
4015 if (VT.getSizeInBits() > 128) {
4016 NumSubs = VT.getSizeInBits() / 128;
4017 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4021 if (NumSubs == 1)
4022 return Builder(DAG, DL, Ops);
4024 SmallVector<SDValue, 4> Subs;
4025 for (unsigned i = 0; i != NumSubs; ++i) {
4026 SmallVector<SDValue, 2> SubOps;
4027 for (SDValue Op : Ops) {
4028 EVT OpVT = Op.getValueType();
4029 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4030 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4031 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4033 Subs.push_back(Builder(DAG, DL, SubOps));
4035 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4038 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4039 // targets.
4040 static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4041 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4042 const X86Subtarget &Subtarget) {
4043 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4044 MVT SVT = VT.getScalarType();
4046 // If we have a 32/64 splatted constant, splat it to DstTy to
4047 // encourage a foldable broadcast'd operand.
4048 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4049 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4050 // AVX512 broadcasts 32/64-bit operands.
4051 // TODO: Support float once getAVX512Node is used by fp-ops.
4052 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4053 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4054 return SDValue();
4055 // If we're not widening, don't bother if we're not bitcasting.
4056 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4057 return SDValue();
4058 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4059 APInt SplatValue, SplatUndef;
4060 unsigned SplatBitSize;
4061 bool HasAnyUndefs;
4062 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4063 HasAnyUndefs, OpEltSizeInBits) &&
4064 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4065 return DAG.getConstant(SplatValue, DL, DstVT);
4067 return SDValue();
4070 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4072 MVT DstVT = VT;
4073 if (Widen)
4074 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4076 // Canonicalize src operands.
4077 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4078 for (SDValue &Op : SrcOps) {
4079 MVT OpVT = Op.getSimpleValueType();
4080 // Just pass through scalar operands.
4081 if (!OpVT.isVector())
4082 continue;
4083 assert(OpVT == VT && "Vector type mismatch");
4085 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4086 Op = BroadcastOp;
4087 continue;
4090 // Just widen the subvector by inserting into an undef wide vector.
4091 if (Widen)
4092 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4095 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4097 // Perform the 512-bit op, then extract the bottom subvector.
4098 if (Widen)
4099 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4100 return Res;
4103 /// Insert an i1 subvector into an i1 vector.
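/// Informal example (bit layout only, not code): inserting a v2i1 subvector
/// <s1, s0> at index 2 of a v8i1 vector <v7..v0> yields
///   v7 v6 v5 v4 s1 s0 v1 v0
/// which the code below builds by masking/shifting away the destination bits
/// of Vec, shifting SubVec into position within a widened legal mask type,
/// OR'ing the pieces, and extracting the original width again.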
4104 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4105 const X86Subtarget &Subtarget) {
4107 SDLoc dl(Op);
4108 SDValue Vec = Op.getOperand(0);
4109 SDValue SubVec = Op.getOperand(1);
4110 SDValue Idx = Op.getOperand(2);
4111 unsigned IdxVal = Op.getConstantOperandVal(2);
4113 // Inserting undef is a nop. We can just return the original vector.
4114 if (SubVec.isUndef())
4115 return Vec;
4117 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4118 return Op;
4120 MVT OpVT = Op.getSimpleValueType();
4121 unsigned NumElems = OpVT.getVectorNumElements();
4122 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4124 // Extend to natively supported kshift.
4125 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4127 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4128 // if necessary.
4129 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4130 // May need to promote to a legal type.
4131 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4132 DAG.getConstant(0, dl, WideOpVT),
4133 SubVec, Idx);
4134 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4137 MVT SubVecVT = SubVec.getSimpleValueType();
4138 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4139 assert(IdxVal + SubVecNumElems <= NumElems &&
4140 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4141 "Unexpected index value in INSERT_SUBVECTOR");
4143 SDValue Undef = DAG.getUNDEF(WideOpVT);
4145 if (IdxVal == 0) {
4146 // Zero the lower bits of Vec.
4147 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4148 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4149 ZeroIdx);
4150 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4151 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4152 // Merge them together; SubVec should be zero-extended.
4153 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4154 DAG.getConstant(0, dl, WideOpVT),
4155 SubVec, ZeroIdx);
4156 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4157 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4160 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4161 Undef, SubVec, ZeroIdx);
4163 if (Vec.isUndef()) {
4164 assert(IdxVal != 0 && "Unexpected index");
4165 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4166 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4167 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4170 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4171 assert(IdxVal != 0 && "Unexpected index");
4172 // If upper elements of Vec are known undef, then just shift into place.
4173 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4174 [](SDValue V) { return V.isUndef(); })) {
4175 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4176 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4177 } else {
4178 NumElems = WideOpVT.getVectorNumElements();
4179 unsigned ShiftLeft = NumElems - SubVecNumElems;
4180 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4181 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4182 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4183 if (ShiftRight != 0)
4184 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4185 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4187 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4190 // Simple case when we put the subvector in the upper part.
4191 if (IdxVal + SubVecNumElems == NumElems) {
4192 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4193 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4194 if (SubVecNumElems * 2 == NumElems) {
4195 // Special case: use a legal zero-extending insert_subvector. This allows
4196 // isel to optimize when bits are known zero.
4197 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4198 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4199 DAG.getConstant(0, dl, WideOpVT),
4200 Vec, ZeroIdx);
4201 } else {
4202 // Otherwise use explicit shifts to zero the bits.
4203 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4204 Undef, Vec, ZeroIdx);
4205 NumElems = WideOpVT.getVectorNumElements();
4206 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4207 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4208 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4210 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4211 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4214 // Inserting into the middle is more complicated.
4216 NumElems = WideOpVT.getVectorNumElements();
4218 // Widen the vector if needed.
4219 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4221 unsigned ShiftLeft = NumElems - SubVecNumElems;
4222 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4224 // Do an optimization for the most frequently used types.
4225 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4226 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4227 Mask0.flipAllBits();
4228 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4229 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4230 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4231 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4232 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4233 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4234 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4235 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4237 // Reduce to original width if needed.
4238 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4241 // Clear the upper bits of the subvector and move it to its insert position.
4242 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4243 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4244 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4245 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4247 // Isolate the bits below the insertion point.
4248 unsigned LowShift = NumElems - IdxVal;
4249 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4250 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4251 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4252 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4254 // Isolate the bits after the last inserted bit.
4255 unsigned HighShift = IdxVal + SubVecNumElems;
4256 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4257 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4258 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4259 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4261 // Now OR all 3 pieces together.
4262 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4263 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4265 // Reduce to original width if needed.
4266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4269 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4270 const SDLoc &dl) {
4271 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4272 EVT SubVT = V1.getValueType();
4273 EVT SubSVT = SubVT.getScalarType();
4274 unsigned SubNumElts = SubVT.getVectorNumElements();
4275 unsigned SubVectorWidth = SubVT.getSizeInBits();
4276 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4277 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4278 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4281 /// Returns a vector of the specified type with all bits set.
4282 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4283 /// Then bitcast to their original type, ensuring they get CSE'd.
4284 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4285 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4286 "Expected a 128/256/512-bit vector type");
4288 APInt Ones = APInt::getAllOnes(32);
4289 unsigned NumElts = VT.getSizeInBits() / 32;
4290 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4291 return DAG.getBitcast(VT, Vec);
4294 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4295 SDValue In, SelectionDAG &DAG) {
4296 EVT InVT = In.getValueType();
4297 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4298 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4299 ISD::ZERO_EXTEND == Opcode) &&
4300 "Unknown extension opcode");
4302 // For 256-bit vectors, we only need the lower (128-bit) input half.
4303 // For 512-bit vectors, we only need the lower input half or quarter.
4304 if (InVT.getSizeInBits() > 128) {
4305 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4306 "Expected VTs to be the same size!");
4307 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4308 In = extractSubVector(In, 0, DAG, DL,
4309 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4310 InVT = In.getValueType();
4313 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4314 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4316 return DAG.getNode(Opcode, DL, VT, In);
4319 // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4320 static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4321 SDValue Mask, SelectionDAG &DAG) {
4322 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4323 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4324 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
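// Example of the mask created below (illustrative): for VT = v8i16, Lo = true
// and Unary = false this produces <0, 8, 1, 9, 2, 10, 3, 11> (the PUNPCKLWD
// pattern); with Unary = true both halves index the first input, giving
// <0, 0, 1, 1, 2, 2, 3, 3>.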
4327 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4328 bool Lo, bool Unary) {
4329 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4330 "Illegal vector type to unpack");
4331 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4332 int NumElts = VT.getVectorNumElements();
4333 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4334 for (int i = 0; i < NumElts; ++i) {
4335 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4336 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4337 Pos += (Unary ? 0 : NumElts * (i % 2));
4338 Pos += (Lo ? 0 : NumEltsInLane / 2);
4339 Mask.push_back(Pos);
4343 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4344 /// imposed by AVX and specific to the unary pattern. Example:
4345 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4346 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4347 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4348 bool Lo) {
4349 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4350 int NumElts = VT.getVectorNumElements();
4351 for (int i = 0; i < NumElts; ++i) {
4352 int Pos = i / 2;
4353 Pos += (Lo ? 0 : NumElts / 2);
4354 Mask.push_back(Pos);
4358 // Attempt to constant fold; otherwise just create a VECTOR_SHUFFLE.
4359 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4360 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4361 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4362 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4363 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4364 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4365 int M = Mask[I];
4366 if (M < 0)
4367 continue;
4368 SDValue V = (M < NumElts) ? V1 : V2;
4369 if (V.isUndef())
4370 continue;
4371 Ops[I] = V.getOperand(M % NumElts);
4373 return DAG.getBuildVector(VT, dl, Ops);
4376 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4379 /// Returns a vector_shuffle node for an unpackl operation.
4380 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4381 SDValue V1, SDValue V2) {
4382 SmallVector<int, 8> Mask;
4383 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4384 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4387 /// Returns a vector_shuffle node for an unpackh operation.
4388 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4389 SDValue V1, SDValue V2) {
4390 SmallVector<int, 8> Mask;
4391 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4392 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4395 /// Returns a node that packs the LHS + RHS nodes together at half width.
4396 /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4397 /// TODO: Add subvector splitting if/when we have a need for it.
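/// Illustrative call (a sketch; the operands are hypothetical): packing two
/// v8i16 values into a v16i8 result with
///   getPack(DAG, Subtarget, dl, MVT::v16i8, LHS, RHS)
/// emits PACKUS when the upper byte of every i16 element is known zero,
/// PACKSS when the inputs have enough sign bits, and otherwise masks or
/// shifts the requested half into place before packing.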
4398 static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4399 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4400 bool PackHiHalf = false) {
4401 MVT OpVT = LHS.getSimpleValueType();
4402 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4403 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4404 assert(OpVT == RHS.getSimpleValueType() &&
4405 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4406 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4407 "Unexpected PACK operand types");
4408 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4409 "Unexpected PACK result type");
4411 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4412 if (EltSizeInBits == 32) {
4413 SmallVector<int> PackMask;
4414 int Offset = PackHiHalf ? 1 : 0;
4415 int NumElts = VT.getVectorNumElements();
4416 for (int I = 0; I != NumElts; I += 4) {
4417 PackMask.push_back(I + Offset);
4418 PackMask.push_back(I + Offset + 2);
4419 PackMask.push_back(I + Offset + NumElts);
4420 PackMask.push_back(I + Offset + NumElts + 2);
4422 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4423 DAG.getBitcast(VT, RHS), PackMask);
4426 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4427 if (!PackHiHalf) {
4428 if (UsePackUS &&
4429 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4430 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4431 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4433 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4434 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4435 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4438 // Fall back to sign/zero-extending the requested half, then pack.
4439 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4440 if (UsePackUS) {
4441 if (PackHiHalf) {
4442 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4443 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4444 } else {
4445 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4446 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4447 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4449 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4452 if (!PackHiHalf) {
4453 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4454 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4456 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4457 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4458 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4461 /// Return a vector_shuffle of the specified vector with a zero or undef vector.
4462 /// This produces a shuffle where the low element of V2 is swizzled into the
4463 /// zero/undef vector, landing at element Idx.
4464 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4465 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4466 bool IsZero,
4467 const X86Subtarget &Subtarget,
4468 SelectionDAG &DAG) {
4469 MVT VT = V2.getSimpleValueType();
4470 SDValue V1 = IsZero
4471 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4472 int NumElems = VT.getVectorNumElements();
4473 SmallVector<int, 16> MaskVec(NumElems);
4474 for (int i = 0; i != NumElems; ++i)
4475 // If this is the insertion idx, put the low elt of V2 here.
4476 MaskVec[i] = (i == Idx) ? NumElems : i;
4477 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4480 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4481 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4482 Ptr.getOpcode() == X86ISD::WrapperRIP)
4483 Ptr = Ptr.getOperand(0);
4485 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
4486 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4487 return nullptr;
4489 return CNode->getConstVal();
4492 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4493 if (!Load || !ISD::isNormalLoad(Load))
4494 return nullptr;
4495 return getTargetConstantFromBasePtr(Load->getBasePtr());
4498 static const Constant *getTargetConstantFromNode(SDValue Op) {
4499 Op = peekThroughBitcasts(Op);
4500 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4503 const Constant *
4504 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4505 assert(LD && "Unexpected null LoadSDNode");
4506 return getTargetConstantFromNode(LD);
4509 // Extract raw constant bits from constant pools.
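// Example of the re-bucketing performed here (illustrative constant values):
// requesting 64-bit elements of a v4i32 build vector <1, 2, undef, 4> yields
//   EltBits   = { 0x0000000200000001, 0x0000000400000000 }
//   UndefElts = 0 (no 64-bit element is wholly undef)
// with the partially-undef element treated as zero, which requires
// AllowPartialUndefs to be true.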
4510 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4511 APInt &UndefElts,
4512 SmallVectorImpl<APInt> &EltBits,
4513 bool AllowWholeUndefs = true,
4514 bool AllowPartialUndefs = true) {
4515 assert(EltBits.empty() && "Expected an empty EltBits vector");
4517 Op = peekThroughBitcasts(Op);
4519 EVT VT = Op.getValueType();
4520 unsigned SizeInBits = VT.getSizeInBits();
4521 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4522 unsigned NumElts = SizeInBits / EltSizeInBits;
4524 // Bitcast a source array of element bits to the target size.
4525 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4526 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4527 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4528 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4529 "Constant bit sizes don't match");
4531 // Don't split if we don't allow undef bits.
4532 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4533 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4534 return false;
4536 // If we're already the right size, don't bother bitcasting.
4537 if (NumSrcElts == NumElts) {
4538 UndefElts = UndefSrcElts;
4539 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4540 return true;
4543 // Extract all the undef/constant element data and pack into single bitsets.
4544 APInt UndefBits(SizeInBits, 0);
4545 APInt MaskBits(SizeInBits, 0);
4547 for (unsigned i = 0; i != NumSrcElts; ++i) {
4548 unsigned BitOffset = i * SrcEltSizeInBits;
4549 if (UndefSrcElts[i])
4550 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4551 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4554 // Split the undef/constant single bitset data into the target elements.
4555 UndefElts = APInt(NumElts, 0);
4556 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4558 for (unsigned i = 0; i != NumElts; ++i) {
4559 unsigned BitOffset = i * EltSizeInBits;
4560 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4562 // Only treat an element as UNDEF if all bits are UNDEF.
4563 if (UndefEltBits.isAllOnes()) {
4564 if (!AllowWholeUndefs)
4565 return false;
4566 UndefElts.setBit(i);
4567 continue;
4570 // If only some bits are UNDEF, treat them as zero (or bail if partial
4571 // undefs are not supported).
4572 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4573 return false;
4575 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4577 return true;
4580 // Collect constant bits and insert into mask/undef bit masks.
4581 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4582 unsigned UndefBitIndex) {
4583 if (!Cst)
4584 return false;
4585 if (isa<UndefValue>(Cst)) {
4586 Undefs.setBit(UndefBitIndex);
4587 return true;
4589 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4590 Mask = CInt->getValue();
4591 return true;
4593 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4594 Mask = CFP->getValueAPF().bitcastToAPInt();
4595 return true;
4597 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4598 Type *Ty = CDS->getType();
4599 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4600 Type *EltTy = CDS->getElementType();
4601 bool IsInteger = EltTy->isIntegerTy();
4602 bool IsFP =
4603 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4604 if (!IsInteger && !IsFP)
4605 return false;
4606 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4607 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4608 if (IsInteger)
4609 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4610 else
4611 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4612 I * EltBits);
4613 return true;
4615 return false;
4618 // Handle UNDEFs.
4619 if (Op.isUndef()) {
4620 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4621 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4622 return CastBitData(UndefSrcElts, SrcEltBits);
4625 // Extract scalar constant bits.
4626 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4627 APInt UndefSrcElts = APInt::getZero(1);
4628 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4629 return CastBitData(UndefSrcElts, SrcEltBits);
4631 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4632 APInt UndefSrcElts = APInt::getZero(1);
4633 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4634 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4635 return CastBitData(UndefSrcElts, SrcEltBits);
4638 // Extract constant bits from build vector.
4639 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4640 BitVector Undefs;
4641 SmallVector<APInt> SrcEltBits;
4642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4643 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4644 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4645 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4646 if (Undefs[I])
4647 UndefSrcElts.setBit(I);
4648 return CastBitData(UndefSrcElts, SrcEltBits);
4652 // Extract constant bits from constant pool vector.
4653 if (auto *Cst = getTargetConstantFromNode(Op)) {
4654 Type *CstTy = Cst->getType();
4655 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4656 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4657 return false;
4659 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4660 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4662 APInt UndefSrcElts(NumSrcElts, 0);
4663 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4664 for (unsigned i = 0; i != NumSrcElts; ++i)
4665 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4666 UndefSrcElts, i))
4667 return false;
4669 return CastBitData(UndefSrcElts, SrcEltBits);
4672 // Extract constant bits from a broadcasted constant pool scalar.
4673 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4674 EltSizeInBits <= VT.getScalarSizeInBits()) {
4675 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4676 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4677 return false;
4679 SDValue Ptr = MemIntr->getBasePtr();
4680 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4681 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4682 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4684 APInt UndefSrcElts(NumSrcElts, 0);
4685 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4686 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4687 if (UndefSrcElts[0])
4688 UndefSrcElts.setBits(0, NumSrcElts);
4689 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4690 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4691 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4692 return CastBitData(UndefSrcElts, SrcEltBits);
4697 // Extract constant bits from a subvector broadcast.
4698 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4699 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4700 SDValue Ptr = MemIntr->getBasePtr();
4701 // The source constant may be larger than the subvector broadcast, so
4702 // ensure we extract the correct subvector constants.
4703 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4704 Type *CstTy = Cst->getType();
4705 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4706 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4707 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4708 (SizeInBits % SubVecSizeInBits) != 0)
4709 return false;
4710 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4711 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4712 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4713 APInt UndefSubElts(NumSubElts, 0);
4714 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4715 APInt(CstEltSizeInBits, 0));
4716 for (unsigned i = 0; i != NumSubElts; ++i) {
4717 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4718 UndefSubElts, i))
4719 return false;
4720 for (unsigned j = 1; j != NumSubVecs; ++j)
4721 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4723 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4724 UndefSubElts);
4725 return CastBitData(UndefSubElts, SubEltBits);
4729 // Extract a rematerialized scalar constant insertion.
4730 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4731 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4732 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4733 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4734 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4736 APInt UndefSrcElts(NumSrcElts, 0);
4737 SmallVector<APInt, 64> SrcEltBits;
4738 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
4739 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
4740 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4741 return CastBitData(UndefSrcElts, SrcEltBits);
4744 // Insert constant bits from the base and subvector sources.
4745 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4746 // If we bitcast to larger elements we might lose track of undefs, so to be
4747 // safe don't allow any.
4748 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4749 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4751 APInt UndefSrcElts, UndefSubElts;
4752 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4753 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4754 UndefSubElts, EltSubBits,
4755 AllowWholeUndefs && AllowUndefs,
4756 AllowPartialUndefs && AllowUndefs) &&
4757 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4758 UndefSrcElts, EltSrcBits,
4759 AllowWholeUndefs && AllowUndefs,
4760 AllowPartialUndefs && AllowUndefs)) {
4761 unsigned BaseIdx = Op.getConstantOperandVal(2);
4762 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4763 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4764 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4765 return CastBitData(UndefSrcElts, EltSrcBits);
4769 // Extract constant bits from a subvector's source.
4770 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4771 // TODO - support extract_subvector through bitcasts.
4772 if (EltSizeInBits != VT.getScalarSizeInBits())
4773 return false;
4775 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4776 UndefElts, EltBits, AllowWholeUndefs,
4777 AllowPartialUndefs)) {
4778 EVT SrcVT = Op.getOperand(0).getValueType();
4779 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4780 unsigned NumSubElts = VT.getVectorNumElements();
4781 unsigned BaseIdx = Op.getConstantOperandVal(1);
4782 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4783 if ((BaseIdx + NumSubElts) != NumSrcElts)
4784 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4785 if (BaseIdx != 0)
4786 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4787 return true;
4791 // Extract constant bits from shuffle node sources.
4792 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4793 // TODO - support shuffle through bitcasts.
4794 if (EltSizeInBits != VT.getScalarSizeInBits())
4795 return false;
4797 ArrayRef<int> Mask = SVN->getMask();
4798 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4799 llvm::any_of(Mask, [](int M) { return M < 0; }))
4800 return false;
4802 APInt UndefElts0, UndefElts1;
4803 SmallVector<APInt, 32> EltBits0, EltBits1;
4804 if (isAnyInRange(Mask, 0, NumElts) &&
4805 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4806 UndefElts0, EltBits0, AllowWholeUndefs,
4807 AllowPartialUndefs))
4808 return false;
4809 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
4810 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
4811 UndefElts1, EltBits1, AllowWholeUndefs,
4812 AllowPartialUndefs))
4813 return false;
4815 UndefElts = APInt::getZero(NumElts);
4816 for (int i = 0; i != (int)NumElts; ++i) {
4817 int M = Mask[i];
4818 if (M < 0) {
4819 UndefElts.setBit(i);
4820 EltBits.push_back(APInt::getZero(EltSizeInBits));
4821 } else if (M < (int)NumElts) {
4822 if (UndefElts0[M])
4823 UndefElts.setBit(i);
4824 EltBits.push_back(EltBits0[M]);
4825 } else {
4826 if (UndefElts1[M - NumElts])
4827 UndefElts.setBit(i);
4828 EltBits.push_back(EltBits1[M - NumElts]);
4831 return true;
4834 return false;
4837 namespace llvm {
4838 namespace X86 {
4839 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
4840 APInt UndefElts;
4841 SmallVector<APInt, 16> EltBits;
4842 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
4843 UndefElts, EltBits, true,
4844 AllowPartialUndefs)) {
4845 int SplatIndex = -1;
4846 for (int i = 0, e = EltBits.size(); i != e; ++i) {
4847 if (UndefElts[i])
4848 continue;
4849 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
4850 SplatIndex = -1;
4851 break;
4853 SplatIndex = i;
4855 if (0 <= SplatIndex) {
4856 SplatVal = EltBits[SplatIndex];
4857 return true;
4861 return false;
4863 } // namespace X86
4864 } // namespace llvm
4866 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4867 unsigned MaskEltSizeInBits,
4868 SmallVectorImpl<uint64_t> &RawMask,
4869 APInt &UndefElts) {
4870 // Extract the raw target constant bits.
4871 SmallVector<APInt, 64> EltBits;
4872 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
4873 EltBits, /* AllowWholeUndefs */ true,
4874 /* AllowPartialUndefs */ false))
4875 return false;
4877 // Insert the extracted elements into the mask.
4878 for (const APInt &Elt : EltBits)
4879 RawMask.push_back(Elt.getZExtValue());
4881 return true;
4884 // Match not(xor X, -1) -> X.
4885 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
4886 // Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
4887 // Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
4888 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
4889 V = peekThroughBitcasts(V);
4890 if (V.getOpcode() == ISD::XOR &&
4891 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
4892 isAllOnesConstant(V.getOperand(1))))
4893 return V.getOperand(0);
4894 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4895 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
4896 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
4897 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
4898 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
4899 Not, V.getOperand(1));
4902 if (V.getOpcode() == X86ISD::PCMPGT &&
4903 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
4904 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
4905 V.getOperand(0).hasOneUse()) {
4906 APInt UndefElts;
4907 SmallVector<APInt> EltBits;
4908 if (getTargetConstantBitsFromNode(V.getOperand(0),
4909 V.getScalarValueSizeInBits(), UndefElts,
4910 EltBits)) {
4911 // Don't fold min_signed_value -> (min_signed_value - 1)
4912 bool MinSigned = false;
4913 for (APInt &Elt : EltBits) {
4914 MinSigned |= Elt.isMinSignedValue();
4915 Elt -= 1;
4917 if (!MinSigned) {
4918 SDLoc DL(V);
4919 MVT VT = V.getSimpleValueType();
4920 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
4921 getConstVector(EltBits, UndefElts, VT, DAG, DL));
4925 SmallVector<SDValue, 2> CatOps;
4926 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
4927 for (SDValue &CatOp : CatOps) {
4928 SDValue NotCat = IsNOT(CatOp, DAG);
4929 if (!NotCat) return SDValue();
4930 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
4932 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
4934 return SDValue();
4937 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
4938 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
4939 /// Note: This ignores saturation, so inputs must be checked first.
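/// Example (illustrative): for VT = v16i8, Unary = false and NumStages = 1 the
/// mask is <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the even bytes of the two
/// v16i8-viewed inputs, matching a single PACKSSWB/PACKUSWB truncation.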
4940 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4941 bool Unary, unsigned NumStages = 1) {
4942 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4943 unsigned NumElts = VT.getVectorNumElements();
4944 unsigned NumLanes = VT.getSizeInBits() / 128;
4945 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
4946 unsigned Offset = Unary ? 0 : NumElts;
4947 unsigned Repetitions = 1u << (NumStages - 1);
4948 unsigned Increment = 1u << NumStages;
4949 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
4951 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
4952 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
4953 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
4954 Mask.push_back(Elt + (Lane * NumEltsPerLane));
4955 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
4956 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
4961 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
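// Example (illustrative): for a v16i8 PACK result, demanding only result
// element 3 sets bit 3 of DemandedLHS and nothing in DemandedRHS, since
// result elements 0-7 of each 128-bit lane come from the LHS and elements
// 8-15 from the RHS.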
4962 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
4963 APInt &DemandedLHS, APInt &DemandedRHS) {
4964 int NumLanes = VT.getSizeInBits() / 128;
4965 int NumElts = DemandedElts.getBitWidth();
4966 int NumInnerElts = NumElts / 2;
4967 int NumEltsPerLane = NumElts / NumLanes;
4968 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
4970 DemandedLHS = APInt::getZero(NumInnerElts);
4971 DemandedRHS = APInt::getZero(NumInnerElts);
4973 // Map DemandedElts to the packed operands.
4974 for (int Lane = 0; Lane != NumLanes; ++Lane) {
4975 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
4976 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
4977 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
4978 if (DemandedElts[OuterIdx])
4979 DemandedLHS.setBit(InnerIdx);
4980 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
4981 DemandedRHS.setBit(InnerIdx);
4986 // Split the demanded elts of a HADD/HSUB node between its operands.
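// Example (illustrative): for a v4i32 HADD, demanding only result element 0
// sets bits 0 and 1 of DemandedLHS (result[0] = LHS[0] + LHS[1]) and leaves
// DemandedRHS empty.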
4987 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
4988 APInt &DemandedLHS, APInt &DemandedRHS) {
4989 int NumLanes = VT.getSizeInBits() / 128;
4990 int NumElts = DemandedElts.getBitWidth();
4991 int NumEltsPerLane = NumElts / NumLanes;
4992 int HalfEltsPerLane = NumEltsPerLane / 2;
4994 DemandedLHS = APInt::getZero(NumElts);
4995 DemandedRHS = APInt::getZero(NumElts);
4997 // Map DemandedElts to the horizontal operands.
4998 for (int Idx = 0; Idx != NumElts; ++Idx) {
4999 if (!DemandedElts[Idx])
5000 continue;
5001 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5002 int LocalIdx = Idx % NumEltsPerLane;
5003 if (LocalIdx < HalfEltsPerLane) {
5004 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5005 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5006 } else {
5007 LocalIdx -= HalfEltsPerLane;
5008 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5009 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5014 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5015 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5016 /// operands in \p Ops, and returns true.
5017 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5018 /// IsUnary for shuffles which use a single input multiple times, and in those
5019 /// cases it will adjust the mask to only have indices within that single input.
5020 /// It is an error to call this with non-empty Mask/Ops vectors.
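/// Example decode (illustrative node, not taken from a test): a v4f32
/// X86ISD::SHUFP with immediate 0x1B yields
///   Ops  = { N->getOperand(0), N->getOperand(1) }
///   Mask = { 3, 2, 5, 4 }
/// with IsUnary = false (unless both operands are the same node).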
5021 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5022 SmallVectorImpl<SDValue> &Ops,
5023 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5024 unsigned NumElems = VT.getVectorNumElements();
5025 unsigned MaskEltSize = VT.getScalarSizeInBits();
5026 SmallVector<uint64_t, 32> RawMask;
5027 APInt RawUndefs;
5028 uint64_t ImmN;
5030 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5031 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5033 IsUnary = false;
5034 bool IsFakeUnary = false;
5035 switch (N->getOpcode()) {
5036 case X86ISD::BLENDI:
5037 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5038 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5039 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5040 DecodeBLENDMask(NumElems, ImmN, Mask);
5041 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5042 break;
5043 case X86ISD::SHUFP:
5044 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5045 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5046 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5047 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5048 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5049 break;
5050 case X86ISD::INSERTPS:
5051 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5052 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5053 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5054 DecodeINSERTPSMask(ImmN, Mask);
5055 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5056 break;
5057 case X86ISD::EXTRQI:
5058 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5059 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5060 isa<ConstantSDNode>(N->getOperand(2))) {
5061 int BitLen = N->getConstantOperandVal(1);
5062 int BitIdx = N->getConstantOperandVal(2);
5063 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5064 IsUnary = true;
5066 break;
5067 case X86ISD::INSERTQI:
5068 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5069 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5070 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5071 isa<ConstantSDNode>(N->getOperand(3))) {
5072 int BitLen = N->getConstantOperandVal(2);
5073 int BitIdx = N->getConstantOperandVal(3);
5074 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5075 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5077 break;
5078 case X86ISD::UNPCKH:
5079 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5080 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5081 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5082 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5083 break;
5084 case X86ISD::UNPCKL:
5085 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5086 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5087 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5088 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5089 break;
5090 case X86ISD::MOVHLPS:
5091 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5092 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5093 DecodeMOVHLPSMask(NumElems, Mask);
5094 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5095 break;
5096 case X86ISD::MOVLHPS:
5097 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5098 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5099 DecodeMOVLHPSMask(NumElems, Mask);
5100 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5101 break;
5102 case X86ISD::VALIGN:
5103 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5104 "Only 32-bit and 64-bit elements are supported!");
5105 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5106 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5107 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5108 DecodeVALIGNMask(NumElems, ImmN, Mask);
5109 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5110 Ops.push_back(N->getOperand(1));
5111 Ops.push_back(N->getOperand(0));
5112 break;
5113 case X86ISD::PALIGNR:
5114 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5115 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5116 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5117 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5118 DecodePALIGNRMask(NumElems, ImmN, Mask);
5119 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5120 Ops.push_back(N->getOperand(1));
5121 Ops.push_back(N->getOperand(0));
5122 break;
5123 case X86ISD::VSHLDQ:
5124 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5125 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5126 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5127 DecodePSLLDQMask(NumElems, ImmN, Mask);
5128 IsUnary = true;
5129 break;
5130 case X86ISD::VSRLDQ:
5131 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5132 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5133 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5134 DecodePSRLDQMask(NumElems, ImmN, Mask);
5135 IsUnary = true;
5136 break;
5137 case X86ISD::PSHUFD:
5138 case X86ISD::VPERMILPI:
5139 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5140 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5141 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5142 IsUnary = true;
5143 break;
5144 case X86ISD::PSHUFHW:
5145 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5146 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5147 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5148 IsUnary = true;
5149 break;
5150 case X86ISD::PSHUFLW:
5151 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5152 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5153 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5154 IsUnary = true;
5155 break;
5156 case X86ISD::VZEXT_MOVL:
5157 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5158 DecodeZeroMoveLowMask(NumElems, Mask);
5159 IsUnary = true;
5160 break;
5161 case X86ISD::VBROADCAST:
5162 // We only decode broadcasts of same-sized vectors; peeking through to
5163 // extracted subvectors is likely to cause hasOneUse issues with
5164 // SimplifyDemandedBits etc.
5165 if (N->getOperand(0).getValueType() == VT) {
5166 DecodeVectorBroadcast(NumElems, Mask);
5167 IsUnary = true;
5168 break;
5170 return false;
5171 case X86ISD::VPERMILPV: {
5172 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5173 IsUnary = true;
5174 SDValue MaskNode = N->getOperand(1);
5175 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5176 RawUndefs)) {
5177 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5178 break;
5180 return false;
5182 case X86ISD::PSHUFB: {
5183 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5184 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5185 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5186 IsUnary = true;
5187 SDValue MaskNode = N->getOperand(1);
5188 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5189 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5190 break;
5192 return false;
5194 case X86ISD::VPERMI:
5195 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5196 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5197 DecodeVPERMMask(NumElems, ImmN, Mask);
5198 IsUnary = true;
5199 break;
5200 case X86ISD::MOVSS:
5201 case X86ISD::MOVSD:
5202 case X86ISD::MOVSH:
5203 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5204 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5205 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5206 break;
5207 case X86ISD::VPERM2X128:
5208 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5209 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5210 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5211 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5212 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5213 break;
5214 case X86ISD::SHUF128:
5215 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5216 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5217 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5218 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5219 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5220 break;
5221 case X86ISD::MOVSLDUP:
5222 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5223 DecodeMOVSLDUPMask(NumElems, Mask);
5224 IsUnary = true;
5225 break;
5226 case X86ISD::MOVSHDUP:
5227 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5228 DecodeMOVSHDUPMask(NumElems, Mask);
5229 IsUnary = true;
5230 break;
5231 case X86ISD::MOVDDUP:
5232 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5233 DecodeMOVDDUPMask(NumElems, Mask);
5234 IsUnary = true;
5235 break;
5236 case X86ISD::VPERMIL2: {
5237 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5238 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5239 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5240 SDValue MaskNode = N->getOperand(2);
5241 SDValue CtrlNode = N->getOperand(3);
5242 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5243 unsigned CtrlImm = CtrlOp->getZExtValue();
5244 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5245 RawUndefs)) {
5246 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5247 Mask);
5248 break;
5251 return false;
5253 case X86ISD::VPPERM: {
5254 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5255 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5256 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5257 SDValue MaskNode = N->getOperand(2);
5258 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5259 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5260 break;
5262 return false;
5264 case X86ISD::VPERMV: {
5265 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5266 IsUnary = true;
5267 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5268 Ops.push_back(N->getOperand(1));
5269 SDValue MaskNode = N->getOperand(0);
5270 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5271 RawUndefs)) {
5272 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5273 break;
5275 return false;
5277 case X86ISD::VPERMV3: {
5278 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5279 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5280 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5281 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5282 Ops.push_back(N->getOperand(0));
5283 Ops.push_back(N->getOperand(2));
5284 SDValue MaskNode = N->getOperand(1);
5285 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5286 RawUndefs)) {
5287 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5288 break;
5290 return false;
5292 default: llvm_unreachable("unknown target shuffle node");
5295 // Empty mask indicates the decode failed.
5296 if (Mask.empty())
5297 return false;
5299 // Check if we're getting a shuffle mask with zeroed elements.
5300 if (!AllowSentinelZero && isAnyZero(Mask))
5301 return false;
5303 // If we have a fake unary shuffle, the shuffle mask is spread across two
5304 // inputs that are actually the same node. Re-map the mask to always point
5305 // into the first input.
5306 if (IsFakeUnary)
5307 for (int &M : Mask)
5308 if (M >= (int)Mask.size())
5309 M -= Mask.size();
5311 // If we didn't already add operands in the opcode-specific code, default to
5312 // adding 1 or 2 operands starting at 0.
5313 if (Ops.empty()) {
5314 Ops.push_back(N->getOperand(0));
5315 if (!IsUnary || IsFakeUnary)
5316 Ops.push_back(N->getOperand(1));
5319 return true;
5322 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5323 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5324 SmallVectorImpl<SDValue> &Ops,
5325 SmallVectorImpl<int> &Mask) {
5326 bool IsUnary;
5327 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
5330 /// Compute whether each element of a shuffle is zeroable.
5332 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
5333 /// Either it is an undef element in the shuffle mask, the element of the input
5334 /// referenced is undef, or the element of the input referenced is known to be
5335 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5336 /// as many lanes with this technique as possible to simplify the remaining
5337 /// shuffle.
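/// Small illustration (made-up operands): for Mask = <0, 4, -1, 7> with
/// V1 = build_vector(a, b, c, d) and V2 an all-zeros build vector, element 2
/// is marked KnownUndef and elements 1 and 3 are marked KnownZero.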
5338 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5339 SDValue V1, SDValue V2,
5340 APInt &KnownUndef, APInt &KnownZero) {
5341 int Size = Mask.size();
5342 KnownUndef = KnownZero = APInt::getZero(Size);
5344 V1 = peekThroughBitcasts(V1);
5345 V2 = peekThroughBitcasts(V2);
5347 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5348 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5350 int VectorSizeInBits = V1.getValueSizeInBits();
5351 int ScalarSizeInBits = VectorSizeInBits / Size;
5352 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5354 for (int i = 0; i < Size; ++i) {
5355 int M = Mask[i];
5356 // Handle the easy cases.
5357 if (M < 0) {
5358 KnownUndef.setBit(i);
5359 continue;
5361 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5362 KnownZero.setBit(i);
5363 continue;
5366 // Determine shuffle input and normalize the mask.
5367 SDValue V = M < Size ? V1 : V2;
5368 M %= Size;
5370 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5371 if (V.getOpcode() != ISD::BUILD_VECTOR)
5372 continue;
5374 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5375 // the (larger) source element must be UNDEF/ZERO.
5376 if ((Size % V.getNumOperands()) == 0) {
5377 int Scale = Size / V->getNumOperands();
5378 SDValue Op = V.getOperand(M / Scale);
5379 if (Op.isUndef())
5380 KnownUndef.setBit(i);
5381 if (X86::isZeroNode(Op))
5382 KnownZero.setBit(i);
5383 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5384 APInt Val = Cst->getAPIntValue();
5385 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5386 if (Val == 0)
5387 KnownZero.setBit(i);
5388 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5389 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5390 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5391 if (Val == 0)
5392 KnownZero.setBit(i);
5394 continue;
5397 // If the BUILD_VECTOR has more elements, then all of the (smaller) source
5398 // elements must be UNDEF or ZERO.
5399 if ((V.getNumOperands() % Size) == 0) {
5400 int Scale = V->getNumOperands() / Size;
5401 bool AllUndef = true;
5402 bool AllZero = true;
5403 for (int j = 0; j < Scale; ++j) {
5404 SDValue Op = V.getOperand((M * Scale) + j);
5405 AllUndef &= Op.isUndef();
5406 AllZero &= X86::isZeroNode(Op);
5408 if (AllUndef)
5409 KnownUndef.setBit(i);
5410 if (AllZero)
5411 KnownZero.setBit(i);
5412 continue;
5417 /// Decode a target shuffle mask and inputs and see if any values are
5418 /// known to be undef or zero from their inputs.
5419 /// Returns true if the target shuffle mask was decoded.
5420 /// FIXME: Merge this with computeZeroableShuffleElements?
5421 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5422 SmallVectorImpl<SDValue> &Ops,
5423 APInt &KnownUndef, APInt &KnownZero) {
5424 bool IsUnary;
5425 if (!isTargetShuffle(N.getOpcode()))
5426 return false;
5428 MVT VT = N.getSimpleValueType();
5429 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5430 return false;
5432 int Size = Mask.size();
5433 SDValue V1 = Ops[0];
5434 SDValue V2 = IsUnary ? V1 : Ops[1];
5435 KnownUndef = KnownZero = APInt::getZero(Size);
5437 V1 = peekThroughBitcasts(V1);
5438 V2 = peekThroughBitcasts(V2);
5440 assert((VT.getSizeInBits() % Size) == 0 &&
5441 "Illegal split of shuffle value type");
5442 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5444 // Extract known constant input data.
5445 APInt UndefSrcElts[2];
5446 SmallVector<APInt, 32> SrcEltBits[2];
5447 bool IsSrcConstant[2] = {
5448 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5449 SrcEltBits[0], true, false),
5450 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5451 SrcEltBits[1], true, false)};
5453 for (int i = 0; i < Size; ++i) {
5454 int M = Mask[i];
5456 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5457 if (M < 0) {
5458 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5459 if (SM_SentinelUndef == M)
5460 KnownUndef.setBit(i);
5461 if (SM_SentinelZero == M)
5462 KnownZero.setBit(i);
5463 continue;
5466 // Determine shuffle input and normalize the mask.
5467 unsigned SrcIdx = M / Size;
5468 SDValue V = M < Size ? V1 : V2;
5469 M %= Size;
5471 // We are referencing an UNDEF input.
5472 if (V.isUndef()) {
5473 KnownUndef.setBit(i);
5474 continue;
5477 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5478 // TODO: We currently only set UNDEF for integer types - floats use the same
5479 // registers as vectors and many of the scalar folded loads rely on the
5480 // SCALAR_TO_VECTOR pattern.
5481 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5482 (Size % V.getValueType().getVectorNumElements()) == 0) {
5483 int Scale = Size / V.getValueType().getVectorNumElements();
5484 int Idx = M / Scale;
5485 if (Idx != 0 && !VT.isFloatingPoint())
5486 KnownUndef.setBit(i);
5487 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5488 KnownZero.setBit(i);
5489 continue;
5492 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5493 // base vectors.
5494 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5495 SDValue Vec = V.getOperand(0);
5496 int NumVecElts = Vec.getValueType().getVectorNumElements();
5497 if (Vec.isUndef() && Size == NumVecElts) {
5498 int Idx = V.getConstantOperandVal(2);
5499 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5500 if (M < Idx || (Idx + NumSubElts) <= M)
5501 KnownUndef.setBit(i);
5503 continue;
5506 // Attempt to extract from the source's constant bits.
5507 if (IsSrcConstant[SrcIdx]) {
5508 if (UndefSrcElts[SrcIdx][M])
5509 KnownUndef.setBit(i);
5510 else if (SrcEltBits[SrcIdx][M] == 0)
5511 KnownZero.setBit(i);
5515 assert(VT.getVectorNumElements() == (unsigned)Size &&
5516 "Different mask size from vector size!");
5517 return true;
5520 // Replace target shuffle mask elements with known undef/zero sentinels.
5521 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5522 const APInt &KnownUndef,
5523 const APInt &KnownZero,
5524 bool ResolveKnownZeros = true) {
5525 unsigned NumElts = Mask.size();
5526 assert(KnownUndef.getBitWidth() == NumElts &&
5527 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5529 for (unsigned i = 0; i != NumElts; ++i) {
5530 if (KnownUndef[i])
5531 Mask[i] = SM_SentinelUndef;
5532 else if (ResolveKnownZeros && KnownZero[i])
5533 Mask[i] = SM_SentinelZero;
5537 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5538 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5539 APInt &KnownUndef,
5540 APInt &KnownZero) {
5541 unsigned NumElts = Mask.size();
5542 KnownUndef = KnownZero = APInt::getZero(NumElts);
5544 for (unsigned i = 0; i != NumElts; ++i) {
5545 int M = Mask[i];
5546 if (SM_SentinelUndef == M)
5547 KnownUndef.setBit(i);
5548 if (SM_SentinelZero == M)
5549 KnownZero.setBit(i);
5553 // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5554 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5555 SDValue Cond, bool IsBLENDV = false) {
5556 EVT CondVT = Cond.getValueType();
5557 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5558 unsigned NumElts = CondVT.getVectorNumElements();
5560 APInt UndefElts;
5561 SmallVector<APInt, 32> EltBits;
5562 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5563 true, false))
5564 return false;
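// Start from an identity selection of the first vector operand; lanes where
// the condition picks the other source are redirected by adding NumElts below.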
5566 Mask.resize(NumElts, SM_SentinelUndef);
5568 for (int i = 0; i != (int)NumElts; ++i) {
5569 Mask[i] = i;
5570 // Arbitrarily choose from the 2nd operand if the select condition element
5571 // is undef.
5572 // TODO: Can we do better by matching patterns such as even/odd?
5573 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5574 (IsBLENDV && EltBits[i].isNonNegative()))
5575 Mask[i] += NumElts;
5578 return true;
5581 // Forward declaration (for getFauxShuffleMask recursive check).
5582 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5583 SmallVectorImpl<SDValue> &Inputs,
5584 SmallVectorImpl<int> &Mask,
5585 const SelectionDAG &DAG, unsigned Depth,
5586 bool ResolveKnownElts);
5588 // Attempt to decode ops that could be represented as a shuffle mask.
5589 // The decoded shuffle mask may contain a different number of elements than
5590 // the destination value type.
5591 // TODO: Merge into getTargetShuffleInputs()
5592 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5593 SmallVectorImpl<int> &Mask,
5594 SmallVectorImpl<SDValue> &Ops,
5595 const SelectionDAG &DAG, unsigned Depth,
5596 bool ResolveKnownElts) {
5597 Mask.clear();
5598 Ops.clear();
5600 MVT VT = N.getSimpleValueType();
5601 unsigned NumElts = VT.getVectorNumElements();
5602 unsigned NumSizeInBits = VT.getSizeInBits();
5603 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5604 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5605 return false;
5606 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5607 unsigned NumSizeInBytes = NumSizeInBits / 8;
5608 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5610 unsigned Opcode = N.getOpcode();
5611 switch (Opcode) {
5612 case ISD::VECTOR_SHUFFLE: {
5613 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5614 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5615 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5616 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5617 Ops.push_back(N.getOperand(0));
5618 Ops.push_back(N.getOperand(1));
5619 return true;
5621 return false;
5623 case ISD::AND:
5624 case X86ISD::ANDNP: {
5625 // Attempt to decode as a per-byte mask.
5626 APInt UndefElts;
5627 SmallVector<APInt, 32> EltBits;
5628 SDValue N0 = N.getOperand(0);
5629 SDValue N1 = N.getOperand(1);
5630 bool IsAndN = (X86ISD::ANDNP == Opcode);
5631 uint64_t ZeroMask = IsAndN ? 255 : 0;
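// ANDNP inverts its first operand, so for ANDNP a constant byte of all-ones
// clears the result byte, while for AND a constant byte of zero does.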
5632 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5633 return false;
5634 // We can't assume an undef src element gives an undef dst - the other src
5635 // might be zero.
5636 if (!UndefElts.isZero())
5637 return false;
5638 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5639 const APInt &ByteBits = EltBits[i];
5640 if (ByteBits != 0 && ByteBits != 255)
5641 return false;
5642 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5644 Ops.push_back(IsAndN ? N1 : N0);
5645 return true;
5647 case ISD::OR: {
5648 // Handle the OR(SHUFFLE,SHUFFLE) case where, for each lane, one source is
5649 // zero and the other provides a valid shuffle index.
5650 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5651 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5652 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5653 return false;
5655 SmallVector<int, 64> SrcMask0, SrcMask1;
5656 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5657 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5658 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5659 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5660 Depth + 1, true) ||
5661 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5662 Depth + 1, true))
5663 return false;
5665 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5666 SmallVector<int, 64> Mask0, Mask1;
5667 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5668 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
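// Scale both decoded masks to a common width so the two sources can be
// compared lane by lane.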
5669 for (int i = 0; i != (int)MaskSize; ++i) {
5670 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5671 // loops converting between OR and BLEND shuffles due to
5672 // canWidenShuffleElements merging away undef elements, meaning we
5673 // fail to recognise the OR as the undef element isn't known zero.
5674 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5675 Mask.push_back(SM_SentinelZero);
5676 else if (Mask1[i] == SM_SentinelZero)
5677 Mask.push_back(i);
5678 else if (Mask0[i] == SM_SentinelZero)
5679 Mask.push_back(i + MaskSize);
5680 else
5681 return false;
5683 Ops.push_back(N0);
5684 Ops.push_back(N1);
5685 return true;
5687 case ISD::INSERT_SUBVECTOR: {
5688 SDValue Src = N.getOperand(0);
5689 SDValue Sub = N.getOperand(1);
5690 EVT SubVT = Sub.getValueType();
5691 unsigned NumSubElts = SubVT.getVectorNumElements();
5692 if (!N->isOnlyUserOf(Sub.getNode()))
5693 return false;
5694 SDValue SubBC = peekThroughBitcasts(Sub);
5695 uint64_t InsertIdx = N.getConstantOperandVal(2);
5696 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5697 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5698 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5699 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5700 SDValue SubBCSrc = SubBC.getOperand(0);
5701 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5702 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5703 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5704 "Subvector valuetype mismatch");
5705 InsertIdx *= (MaxElts / NumElts);
5706 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5707 NumSubElts *= (MaxElts / NumElts);
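// Rescale the insert/extract positions and the subvector length to the widest
// element count shared by both vectors.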
5708 bool SrcIsUndef = Src.isUndef();
5709 for (int i = 0; i != (int)MaxElts; ++i)
5710 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5711 for (int i = 0; i != (int)NumSubElts; ++i)
5712 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5713 if (!SrcIsUndef)
5714 Ops.push_back(Src);
5715 Ops.push_back(SubBCSrc);
5716 return true;
5718 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5719 SmallVector<int, 64> SubMask;
5720 SmallVector<SDValue, 2> SubInputs;
5721 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5722 EVT SubSrcVT = SubSrc.getValueType();
5723 if (!SubSrcVT.isVector())
5724 return false;
5726 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5727 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5728 Depth + 1, ResolveKnownElts))
5729 return false;
5731 // Subvector shuffle inputs must not be larger than the subvector.
5732 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5733 return SubVT.getFixedSizeInBits() <
5734 SubInput.getValueSizeInBits().getFixedValue();
5736 return false;
5738 if (SubMask.size() != NumSubElts) {
5739 assert(((SubMask.size() % NumSubElts) == 0 ||
5740 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5741 if ((NumSubElts % SubMask.size()) == 0) {
5742 int Scale = NumSubElts / SubMask.size();
5743 SmallVector<int, 64> ScaledSubMask;
5744 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5745 SubMask = ScaledSubMask;
5746 } else {
5747 int Scale = SubMask.size() / NumSubElts;
5748 NumSubElts = SubMask.size();
5749 NumElts *= Scale;
5750 InsertIdx *= Scale;
5753 Ops.push_back(Src);
5754 Ops.append(SubInputs.begin(), SubInputs.end());
5755 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5756 Mask.append(NumElts, SM_SentinelZero);
5757 else
5758 for (int i = 0; i != (int)NumElts; ++i)
5759 Mask.push_back(i);
5760 for (int i = 0; i != (int)NumSubElts; ++i) {
5761 int M = SubMask[i];
5762 if (0 <= M) {
5763 int InputIdx = M / NumSubElts;
5764 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5766 Mask[i + InsertIdx] = M;
5768 return true;
5770 case X86ISD::PINSRB:
5771 case X86ISD::PINSRW:
5772 case ISD::SCALAR_TO_VECTOR:
5773 case ISD::INSERT_VECTOR_ELT: {
5774 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5775 // vector, for matching src/dst vector types.
5776 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5778 unsigned DstIdx = 0;
5779 if (Opcode != ISD::SCALAR_TO_VECTOR) {
5780 // Check we have an in-range constant insertion index.
5781 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
5782 N.getConstantOperandAPInt(2).uge(NumElts))
5783 return false;
5784 DstIdx = N.getConstantOperandVal(2);
5786 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
5787 if (X86::isZeroNode(Scl)) {
5788 Ops.push_back(N.getOperand(0));
5789 for (unsigned i = 0; i != NumElts; ++i)
5790 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
5791 return true;
5795 // Peek through trunc/aext/zext.
5796 // TODO: aext shouldn't require SM_SentinelZero padding.
5797 // TODO: handle shift of scalars.
5798 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
5799 while (Scl.getOpcode() == ISD::TRUNCATE ||
5800 Scl.getOpcode() == ISD::ANY_EXTEND ||
5801 Scl.getOpcode() == ISD::ZERO_EXTEND) {
5802 Scl = Scl.getOperand(0);
5803 MinBitsPerElt =
5804 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
5806 if ((MinBitsPerElt % 8) != 0)
5807 return false;
5809 // Attempt to find the source vector the scalar was extracted from.
5810 SDValue SrcExtract;
5811 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
5812 Scl.getOpcode() == X86ISD::PEXTRW ||
5813 Scl.getOpcode() == X86ISD::PEXTRB) &&
5814 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5815 SrcExtract = Scl;
5817 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5818 return false;
5820 SDValue SrcVec = SrcExtract.getOperand(0);
5821 EVT SrcVT = SrcVec.getValueType();
5822 if (!SrcVT.getScalarType().isByteSized())
5823 return false;
5824 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5825 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
5826 unsigned DstByte = DstIdx * NumBytesPerElt;
5827 MinBitsPerElt =
5828 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
5830 // Create 'identity' byte level shuffle mask and then add inserted bytes.
5831 if (Opcode == ISD::SCALAR_TO_VECTOR) {
5832 Ops.push_back(SrcVec);
5833 Mask.append(NumSizeInBytes, SM_SentinelUndef);
5834 } else {
5835 Ops.push_back(SrcVec);
5836 Ops.push_back(N.getOperand(0));
5837 for (int i = 0; i != (int)NumSizeInBytes; ++i)
5838 Mask.push_back(NumSizeInBytes + i);
5841 unsigned MinBytesPerElts = MinBitsPerElt / 8;
5842 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
5843 for (unsigned i = 0; i != MinBytesPerElts; ++i)
5844 Mask[DstByte + i] = SrcByte + i;
5845 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
5846 Mask[DstByte + i] = SM_SentinelZero;
5847 return true;
5849 case X86ISD::PACKSS:
5850 case X86ISD::PACKUS: {
5851 SDValue N0 = N.getOperand(0);
5852 SDValue N1 = N.getOperand(1);
5853 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
5854 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
5855 "Unexpected input value type");
5857 APInt EltsLHS, EltsRHS;
5858 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
5860 // If we know input saturation won't happen (or we don't care about
5861 // particular lanes), we can treat this as a truncation shuffle.
5862 bool Offset0 = false, Offset1 = false;
5863 if (Opcode == X86ISD::PACKSS) {
5864 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
5865 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
5866 (!(N1.isUndef() || EltsRHS.isZero()) &&
5867 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
5868 return false;
5869 // We can't easily fold ASHR into a shuffle, but if it was feeding a
5870 // PACKSS then it was likely being used for sign-extension for a
5871 // truncation, so just peek through and adjust the mask accordingly.
5872 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
5873 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
5874 Offset0 = true;
5875 N0 = N0.getOperand(0);
5877 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
5878 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
5879 Offset1 = true;
5880 N1 = N1.getOperand(0);
5882 } else {
5883 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
5884 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
5885 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
5886 (!(N1.isUndef() || EltsRHS.isZero()) &&
5887 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
5888 return false;
5891 bool IsUnary = (N0 == N1);
5893 Ops.push_back(N0);
5894 if (!IsUnary)
5895 Ops.push_back(N1);
5897 createPackShuffleMask(VT, Mask, IsUnary);
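// If we peeked through a VSRAI above, the pack now reads the upper half of
// each wide source element, so adjust those lanes of the decoded truncation
// mask to read the odd sub-elements instead.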
5899 if (Offset0 || Offset1) {
5900 for (int &M : Mask)
5901 if ((Offset0 && isInRange(M, 0, NumElts)) ||
5902 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
5903 ++M;
5905 return true;
5907 case ISD::VSELECT:
5908 case X86ISD::BLENDV: {
5909 SDValue Cond = N.getOperand(0);
5910 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
5911 Ops.push_back(N.getOperand(1));
5912 Ops.push_back(N.getOperand(2));
5913 return true;
5915 return false;
5917 case X86ISD::VTRUNC: {
5918 SDValue Src = N.getOperand(0);
5919 EVT SrcVT = Src.getValueType();
5920 // Truncated source must be a simple vector.
5921 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
5922 (SrcVT.getScalarSizeInBits() % 8) != 0)
5923 return false;
5924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5925 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
5926 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
5927 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
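// The truncation keeps the low bits of each source element (every Scale'th
// sub-element); the remaining destination elements are treated as zero.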
5928 for (unsigned i = 0; i != NumSrcElts; ++i)
5929 Mask.push_back(i * Scale);
5930 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
5931 Ops.push_back(Src);
5932 return true;
5934 case X86ISD::VSHLI:
5935 case X86ISD::VSRLI: {
5936 uint64_t ShiftVal = N.getConstantOperandVal(1);
5937 // Out of range bit shifts are guaranteed to be zero.
5938 if (NumBitsPerElt <= ShiftVal) {
5939 Mask.append(NumElts, SM_SentinelZero);
5940 return true;
5943 // We can only decode 'whole byte' bit shifts as shuffles.
5944 if ((ShiftVal % 8) != 0)
5945 break;
5947 uint64_t ByteShift = ShiftVal / 8;
5948 Ops.push_back(N.getOperand(0));
5950 // Clear mask to all zeros and insert the shifted byte indices.
5951 Mask.append(NumSizeInBytes, SM_SentinelZero);
5953 if (X86ISD::VSHLI == Opcode) {
5954 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
5955 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5956 Mask[i + j] = i + j - ByteShift;
5957 } else {
5958 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
5959 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5960 Mask[i + j - ByteShift] = i + j;
5962 return true;
5964 case X86ISD::VROTLI:
5965 case X86ISD::VROTRI: {
5966 // We can only decode 'whole byte' bit rotates as shuffles.
5967 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
5968 if ((RotateVal % 8) != 0)
5969 return false;
5970 Ops.push_back(N.getOperand(0));
5971 int Offset = RotateVal / 8;
5972 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
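// Express the rotate as a per-element byte rotation to the right: a left
// rotate by N bytes equals a right rotate by (EltBytes - N) bytes.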
5973 for (int i = 0; i != (int)NumElts; ++i) {
5974 int BaseIdx = i * NumBytesPerElt;
5975 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
5976 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
5979 return true;
5981 case X86ISD::VBROADCAST: {
5982 SDValue Src = N.getOperand(0);
5983 if (!Src.getSimpleValueType().isVector()) {
5984 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5985 !isNullConstant(Src.getOperand(1)) ||
5986 Src.getOperand(0).getValueType().getScalarType() !=
5987 VT.getScalarType())
5988 return false;
5989 Src = Src.getOperand(0);
5991 Ops.push_back(Src);
5992 Mask.append(NumElts, 0);
5993 return true;
5995 case ISD::SIGN_EXTEND_VECTOR_INREG: {
5996 SDValue Src = N.getOperand(0);
5997 EVT SrcVT = Src.getValueType();
5998 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6000 // Extended source must be a simple vector.
6001 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6002 (NumBitsPerSrcElt % 8) != 0)
6003 return false;
6005 // We can only handle all-signbits extensions.
6006 APInt DemandedSrcElts =
6007 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6008 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6009 return false;
6011 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6012 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
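// Every source element is known to be all sign bits, so repeating each source
// index Scale times reproduces the sign-extension.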
6013 for (unsigned I = 0; I != NumElts; ++I)
6014 Mask.append(Scale, I);
6015 Ops.push_back(Src);
6016 return true;
6018 case ISD::ZERO_EXTEND:
6019 case ISD::ANY_EXTEND:
6020 case ISD::ZERO_EXTEND_VECTOR_INREG:
6021 case ISD::ANY_EXTEND_VECTOR_INREG: {
6022 SDValue Src = N.getOperand(0);
6023 EVT SrcVT = Src.getValueType();
6025 // Extended source must be a simple vector.
6026 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6027 (SrcVT.getScalarSizeInBits() % 8) != 0)
6028 return false;
6030 bool IsAnyExtend =
6031 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6032 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6033 IsAnyExtend, Mask);
6034 Ops.push_back(Src);
6035 return true;
6039 return false;
6042 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6043 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6044 SmallVectorImpl<int> &Mask) {
6045 int MaskWidth = Mask.size();
6046 SmallVector<SDValue, 16> UsedInputs;
6047 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6048 int lo = UsedInputs.size() * MaskWidth;
6049 int hi = lo + MaskWidth;
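// Input i currently occupies mask indices [lo, hi); later inputs occupy
// higher ranges.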
6051 // Strip UNDEF input usage.
6052 if (Inputs[i].isUndef())
6053 for (int &M : Mask)
6054 if ((lo <= M) && (M < hi))
6055 M = SM_SentinelUndef;
6057 // Check for unused inputs.
6058 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6059 for (int &M : Mask)
6060 if (lo <= M)
6061 M -= MaskWidth;
6062 continue;
6065 // Check for repeated inputs.
6066 bool IsRepeat = false;
6067 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6068 if (UsedInputs[j] != Inputs[i])
6069 continue;
6070 for (int &M : Mask)
6071 if (lo <= M)
6072 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6073 IsRepeat = true;
6074 break;
6076 if (IsRepeat)
6077 continue;
6079 UsedInputs.push_back(Inputs[i]);
6081 Inputs = UsedInputs;
6084 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6085 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6086 /// Returns true if the target shuffle mask was decoded.
6087 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6088 SmallVectorImpl<SDValue> &Inputs,
6089 SmallVectorImpl<int> &Mask,
6090 APInt &KnownUndef, APInt &KnownZero,
6091 const SelectionDAG &DAG, unsigned Depth,
6092 bool ResolveKnownElts) {
6093 if (Depth >= SelectionDAG::MaxRecursionDepth)
6094 return false; // Limit search depth.
6096 EVT VT = Op.getValueType();
6097 if (!VT.isSimple() || !VT.isVector())
6098 return false;
6100 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6101 if (ResolveKnownElts)
6102 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6103 return true;
6105 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6106 ResolveKnownElts)) {
6107 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6108 return true;
6110 return false;
6113 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6114 SmallVectorImpl<SDValue> &Inputs,
6115 SmallVectorImpl<int> &Mask,
6116 const SelectionDAG &DAG, unsigned Depth,
6117 bool ResolveKnownElts) {
6118 APInt KnownUndef, KnownZero;
6119 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6120 KnownZero, DAG, Depth, ResolveKnownElts);
6123 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6124 SmallVectorImpl<int> &Mask,
6125 const SelectionDAG &DAG, unsigned Depth = 0,
6126 bool ResolveKnownElts = true) {
6127 EVT VT = Op.getValueType();
6128 if (!VT.isSimple() || !VT.isVector())
6129 return false;
6131 unsigned NumElts = Op.getValueType().getVectorNumElements();
6132 APInt DemandedElts = APInt::getAllOnes(NumElts);
6133 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6134 ResolveKnownElts);
6137 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6138 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6139 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6140 SelectionDAG &DAG) {
6141 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6142 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6143 "Unknown broadcast load type");
6145 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6146 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6147 return SDValue();
6149 SDValue Ptr =
6150 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
6151 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6152 SDValue Ops[] = {Mem->getChain(), Ptr};
6153 SDValue BcstLd = DAG.getMemIntrinsicNode(
6154 Opcode, DL, Tys, Ops, MemVT,
6155 DAG.getMachineFunction().getMachineMemOperand(
6156 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6157 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6158 return BcstLd;
6161 /// Returns the scalar element that will make up the i'th
6162 /// element of the result of the vector shuffle.
6163 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6164 SelectionDAG &DAG, unsigned Depth) {
6165 if (Depth >= SelectionDAG::MaxRecursionDepth)
6166 return SDValue(); // Limit search depth.
6168 EVT VT = Op.getValueType();
6169 unsigned Opcode = Op.getOpcode();
6170 unsigned NumElems = VT.getVectorNumElements();
6172 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6173 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6174 int Elt = SV->getMaskElt(Index);
6176 if (Elt < 0)
6177 return DAG.getUNDEF(VT.getVectorElementType());
6179 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6180 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6183 // Recurse into target specific vector shuffles to find scalars.
6184 if (isTargetShuffle(Opcode)) {
6185 MVT ShufVT = VT.getSimpleVT();
6186 MVT ShufSVT = ShufVT.getVectorElementType();
6187 int NumElems = (int)ShufVT.getVectorNumElements();
6188 SmallVector<int, 16> ShuffleMask;
6189 SmallVector<SDValue, 16> ShuffleOps;
6190 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
6191 ShuffleMask))
6192 return SDValue();
6194 int Elt = ShuffleMask[Index];
6195 if (Elt == SM_SentinelZero)
6196 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6197 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6198 if (Elt == SM_SentinelUndef)
6199 return DAG.getUNDEF(ShufSVT);
6201 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6202 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6203 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6206 // Recurse into insert_subvector base/sub vector to find scalars.
6207 if (Opcode == ISD::INSERT_SUBVECTOR) {
6208 SDValue Vec = Op.getOperand(0);
6209 SDValue Sub = Op.getOperand(1);
6210 uint64_t SubIdx = Op.getConstantOperandVal(2);
6211 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6213 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6214 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6215 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6218 // Recurse into concat_vectors sub vector to find scalars.
6219 if (Opcode == ISD::CONCAT_VECTORS) {
6220 EVT SubVT = Op.getOperand(0).getValueType();
6221 unsigned NumSubElts = SubVT.getVectorNumElements();
6222 uint64_t SubIdx = Index / NumSubElts;
6223 uint64_t SubElt = Index % NumSubElts;
6224 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6227 // Recurse into extract_subvector src vector to find scalars.
6228 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6229 SDValue Src = Op.getOperand(0);
6230 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6231 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6234 // We only peek through bitcasts of the same vector width.
6235 if (Opcode == ISD::BITCAST) {
6236 SDValue Src = Op.getOperand(0);
6237 EVT SrcVT = Src.getValueType();
6238 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6239 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6240 return SDValue();
6243 // Actual nodes that may contain scalar elements
6245 // For insert_vector_elt - either return the index matching scalar or recurse
6246 // into the base vector.
6247 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6248 isa<ConstantSDNode>(Op.getOperand(2))) {
6249 if (Op.getConstantOperandAPInt(2) == Index)
6250 return Op.getOperand(1);
6251 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6254 if (Opcode == ISD::SCALAR_TO_VECTOR)
6255 return (Index == 0) ? Op.getOperand(0)
6256 : DAG.getUNDEF(VT.getVectorElementType());
6258 if (Opcode == ISD::BUILD_VECTOR)
6259 return Op.getOperand(Index);
6261 return SDValue();
6264 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6265 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
6266 unsigned NumNonZero, unsigned NumZero,
6267 SelectionDAG &DAG,
6268 const X86Subtarget &Subtarget) {
6269 MVT VT = Op.getSimpleValueType();
6270 unsigned NumElts = VT.getVectorNumElements();
6271 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6272 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6273 "Illegal vector insertion");
6275 SDLoc dl(Op);
6276 SDValue V;
6277 bool First = true;
6279 for (unsigned i = 0; i < NumElts; ++i) {
6280 bool IsNonZero = NonZeroMask[i];
6281 if (!IsNonZero)
6282 continue;
6284 // If the build vector contains zeros or our first insertion is not the
6285 // first index, then insert into a zero vector to break any register
6286 // dependency; otherwise use SCALAR_TO_VECTOR.
6287 if (First) {
6288 First = false;
6289 if (NumZero || 0 != i)
6290 V = getZeroVector(VT, Subtarget, DAG, dl);
6291 else {
6292 assert(0 == i && "Expected insertion into zero-index");
6293 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6294 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6295 V = DAG.getBitcast(VT, V);
6296 continue;
6299 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6300 DAG.getIntPtrConstant(i, dl));
6303 return V;
6306 /// Custom lower build_vector of v16i8.
6307 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
6308 unsigned NumNonZero, unsigned NumZero,
6309 SelectionDAG &DAG,
6310 const X86Subtarget &Subtarget) {
6311 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6312 return SDValue();
6314 // SSE4.1 - use PINSRB to insert each byte directly.
6315 if (Subtarget.hasSSE41())
6316 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
6317 Subtarget);
6319 SDLoc dl(Op);
6320 SDValue V;
6322 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6323 // If both 16-bit halves of the low 32 bits are non-zero, then convert to MOVD.
6324 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6325 !NonZeroMask.extractBits(2, 2).isZero()) {
6326 for (unsigned I = 0; I != 4; ++I) {
6327 if (!NonZeroMask[I])
6328 continue;
6329 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), dl, MVT::i32);
6330 if (I != 0)
6331 Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
6332 DAG.getConstant(I * 8, dl, MVT::i8));
6333 V = V ? DAG.getNode(ISD::OR, dl, MVT::i32, V, Elt) : Elt;
6335 assert(V && "Failed to fold v16i8 vector to zero");
6336 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6337 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6338 V = DAG.getBitcast(MVT::v8i16, V);
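// The merged low 32 bits are moved into lane 0 with the remaining lanes
// zeroed (VZEXT_MOVL), then reinterpreted as v8i16 so the remaining byte
// pairs can be inserted with PINSRW.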
6340 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6341 bool ThisIsNonZero = NonZeroMask[i];
6342 bool NextIsNonZero = NonZeroMask[i + 1];
6343 if (!ThisIsNonZero && !NextIsNonZero)
6344 continue;
6346 SDValue Elt;
6347 if (ThisIsNonZero) {
6348 if (NumZero || NextIsNonZero)
6349 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6350 else
6351 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6354 if (NextIsNonZero) {
6355 SDValue NextElt = Op.getOperand(i + 1);
6356 if (i == 0 && NumZero)
6357 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
6358 else
6359 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
6360 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
6361 DAG.getConstant(8, dl, MVT::i8));
6362 if (ThisIsNonZero)
6363 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
6364 else
6365 Elt = NextElt;
6368 // If our first insertion is not the first index or zeros are needed, then
6369 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6370 // elements undefined).
6371 if (!V) {
6372 if (i != 0 || NumZero)
6373 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6374 else {
6375 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
6376 V = DAG.getBitcast(MVT::v8i16, V);
6377 continue;
6380 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
6381 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
6382 DAG.getIntPtrConstant(i / 2, dl));
6385 return DAG.getBitcast(MVT::v16i8, V);
6388 /// Custom lower build_vector of v8i16.
6389 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
6390 unsigned NumNonZero, unsigned NumZero,
6391 SelectionDAG &DAG,
6392 const X86Subtarget &Subtarget) {
6393 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6394 return SDValue();
6396 // Use PINSRW to insert each word directly.
6397 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
6398 Subtarget);
6401 /// Custom lower build_vector of v4i32 or v4f32.
6402 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6403 const X86Subtarget &Subtarget) {
6404 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6405 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6406 // Because we're creating a less complicated build vector here, we may enable
6407 // further folding of the MOVDDUP via shuffle transforms.
6408 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6409 Op.getOperand(0) == Op.getOperand(2) &&
6410 Op.getOperand(1) == Op.getOperand(3) &&
6411 Op.getOperand(0) != Op.getOperand(1)) {
6412 SDLoc DL(Op);
6413 MVT VT = Op.getSimpleValueType();
6414 MVT EltVT = VT.getVectorElementType();
6415 // Create a new build vector with the first 2 elements followed by undef
6416 // padding, bitcast to v2f64, duplicate, and bitcast back.
6417 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6418 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6419 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6420 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6421 return DAG.getBitcast(VT, Dup);
6424 // Find all zeroable elements.
6425 std::bitset<4> Zeroable, Undefs;
6426 for (int i = 0; i < 4; ++i) {
6427 SDValue Elt = Op.getOperand(i);
6428 Undefs[i] = Elt.isUndef();
6429 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6431 assert(Zeroable.size() - Zeroable.count() > 1 &&
6432 "We expect at least two non-zero elements!");
6434 // We only know how to deal with build_vector nodes where elements are either
6435 // zeroable or extract_vector_elt with constant index.
6436 SDValue FirstNonZero;
6437 unsigned FirstNonZeroIdx;
6438 for (unsigned i = 0; i < 4; ++i) {
6439 if (Zeroable[i])
6440 continue;
6441 SDValue Elt = Op.getOperand(i);
6442 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6443 !isa<ConstantSDNode>(Elt.getOperand(1)))
6444 return SDValue();
6445 // Make sure that this node is extracting from a 128-bit vector.
6446 MVT VT = Elt.getOperand(0).getSimpleValueType();
6447 if (!VT.is128BitVector())
6448 return SDValue();
6449 if (!FirstNonZero.getNode()) {
6450 FirstNonZero = Elt;
6451 FirstNonZeroIdx = i;
6455 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6456 SDValue V1 = FirstNonZero.getOperand(0);
6457 MVT VT = V1.getSimpleValueType();
6459 // See if this build_vector can be lowered as a blend with zero.
6460 SDValue Elt;
6461 unsigned EltMaskIdx, EltIdx;
6462 int Mask[4];
6463 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6464 if (Zeroable[EltIdx]) {
6465 // The zero vector will be on the right hand side.
6466 Mask[EltIdx] = EltIdx + 4;
6467 continue;
6470 Elt = Op->getOperand(EltIdx);
6471 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6472 EltMaskIdx = Elt.getConstantOperandVal(1);
6473 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6474 break;
6475 Mask[EltIdx] = EltIdx;
6478 if (EltIdx == 4) {
6479 // Let the shuffle legalizer deal with blend operations.
6480 SDValue VZeroOrUndef = (Zeroable == Undefs)
6481 ? DAG.getUNDEF(VT)
6482 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6483 if (V1.getSimpleValueType() != VT)
6484 V1 = DAG.getBitcast(VT, V1);
6485 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6488 // See if we can lower this build_vector to an INSERTPS.
6489 if (!Subtarget.hasSSE41())
6490 return SDValue();
6492 SDValue V2 = Elt.getOperand(0);
6493 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6494 V1 = SDValue();
6496 bool CanFold = true;
6497 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6498 if (Zeroable[i])
6499 continue;
6501 SDValue Current = Op->getOperand(i);
6502 SDValue SrcVector = Current->getOperand(0);
6503 if (!V1.getNode())
6504 V1 = SrcVector;
6505 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6508 if (!CanFold)
6509 return SDValue();
6511 assert(V1.getNode() && "Expected at least two non-zero elements!");
6512 if (V1.getSimpleValueType() != MVT::v4f32)
6513 V1 = DAG.getBitcast(MVT::v4f32, V1);
6514 if (V2.getSimpleValueType() != MVT::v4f32)
6515 V2 = DAG.getBitcast(MVT::v4f32, V2);
6517 // Ok, we can emit an INSERTPS instruction.
6518 unsigned ZMask = Zeroable.to_ulong();
6520 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
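// INSERTPS immediate: bits [7:6] select the source element, bits [5:4] the
// destination element, and bits [3:0] zero the corresponding result lanes.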
6521 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6522 SDLoc DL(Op);
6523 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6524 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6525 return DAG.getBitcast(VT, Result);
6528 /// Return a vector logical shift node.
6529 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6530 SelectionDAG &DAG, const TargetLowering &TLI,
6531 const SDLoc &dl) {
6532 assert(VT.is128BitVector() && "Unknown type for VShift");
6533 MVT ShVT = MVT::v16i8;
6534 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6535 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6536 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6537 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6538 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6541 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6542 SelectionDAG &DAG) {
6544 // Check if the scalar load can be widened into a vector load. And if
6545 // the address is "base + cst", see if the cst can be "absorbed" into
6546 // the shuffle mask.
6547 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6548 SDValue Ptr = LD->getBasePtr();
6549 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6550 return SDValue();
6551 EVT PVT = LD->getValueType(0);
6552 if (PVT != MVT::i32 && PVT != MVT::f32)
6553 return SDValue();
6555 int FI = -1;
6556 int64_t Offset = 0;
6557 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6558 FI = FINode->getIndex();
6559 Offset = 0;
6560 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6561 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6562 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6563 Offset = Ptr.getConstantOperandVal(1);
6564 Ptr = Ptr.getOperand(0);
6565 } else {
6566 return SDValue();
6569 // FIXME: 256-bit vector instructions don't require strict alignment;
6570 // improve this code to support it better.
6571 Align RequiredAlign(VT.getSizeInBits() / 8);
6572 SDValue Chain = LD->getChain();
6573 // Make sure the stack object alignment is at least 16 or 32.
6574 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6575 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6576 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6577 if (MFI.isFixedObjectIndex(FI)) {
6578 // Can't change the alignment. FIXME: It's possible to compute
6579 // the exact stack offset and reference FI + adjust offset instead.
6580 // If someone *really* cares about this, that's the way to implement it.
6581 return SDValue();
6582 } else {
6583 MFI.setObjectAlignment(FI, RequiredAlign);
6587 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6588 // Ptr + (Offset & ~15).
6589 if (Offset < 0)
6590 return SDValue();
6591 if ((Offset % RequiredAlign.value()) & 3)
6592 return SDValue();
6593 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6594 if (StartOffset) {
6595 SDLoc DL(Ptr);
6596 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6597 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6600 int EltNo = (Offset - StartOffset) >> 2;
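// The original scalar lives (Offset - StartOffset) bytes into the widened
// load; with 4-byte elements that is the element index we splat below.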
6601 unsigned NumElems = VT.getVectorNumElements();
6603 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6604 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6605 LD->getPointerInfo().getWithOffset(StartOffset));
6607 SmallVector<int, 8> Mask(NumElems, EltNo);
6609 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6612 return SDValue();
6615 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6616 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6617 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6618 auto *BaseLd = cast<LoadSDNode>(Elt);
6619 if (!BaseLd->isSimple())
6620 return false;
6621 Ld = BaseLd;
6622 ByteOffset = 0;
6623 return true;
6626 switch (Elt.getOpcode()) {
6627 case ISD::BITCAST:
6628 case ISD::TRUNCATE:
6629 case ISD::SCALAR_TO_VECTOR:
6630 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6631 case ISD::SRL:
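// A logical right shift by a whole number of bytes just advances the load
// source offset.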
6632 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6633 uint64_t Amt = AmtC->getZExtValue();
6634 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6635 ByteOffset += Amt / 8;
6636 return true;
6639 break;
6640 case ISD::EXTRACT_VECTOR_ELT:
6641 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6642 SDValue Src = Elt.getOperand(0);
6643 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6644 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6645 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6646 findEltLoadSrc(Src, Ld, ByteOffset)) {
6647 uint64_t Idx = IdxC->getZExtValue();
6648 ByteOffset += Idx * (SrcSizeInBits / 8);
6649 return true;
6652 break;
6655 return false;
6658 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6659 /// elements can be replaced by a single large load which has the same value as
6660 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6662 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6663 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6664 const SDLoc &DL, SelectionDAG &DAG,
6665 const X86Subtarget &Subtarget,
6666 bool IsAfterLegalize) {
6667 if ((VT.getScalarSizeInBits() % 8) != 0)
6668 return SDValue();
6670 unsigned NumElems = Elts.size();
6672 int LastLoadedElt = -1;
6673 APInt LoadMask = APInt::getZero(NumElems);
6674 APInt ZeroMask = APInt::getZero(NumElems);
6675 APInt UndefMask = APInt::getZero(NumElems);
6677 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6678 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6680 // For each element in the initializer, see if we've found a load, zero or an
6681 // undef.
6682 for (unsigned i = 0; i < NumElems; ++i) {
6683 SDValue Elt = peekThroughBitcasts(Elts[i]);
6684 if (!Elt.getNode())
6685 return SDValue();
6686 if (Elt.isUndef()) {
6687 UndefMask.setBit(i);
6688 continue;
6690 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
6691 ZeroMask.setBit(i);
6692 continue;
6695 // Each loaded element must be the correct fractional portion of the
6696 // requested vector load.
6697 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6698 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6699 return SDValue();
6701 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6702 return SDValue();
6703 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6704 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6705 return SDValue();
6707 LoadMask.setBit(i);
6708 LastLoadedElt = i;
6710 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6711 NumElems &&
6712 "Incomplete element masks");
6714 // Handle Special Cases - all undef or undef/zero.
6715 if (UndefMask.popcount() == NumElems)
6716 return DAG.getUNDEF(VT);
6717 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6718 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6719 : DAG.getConstantFP(0.0, DL, VT);
6721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6722 int FirstLoadedElt = LoadMask.countr_zero();
6723 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6724 EVT EltBaseVT = EltBase.getValueType();
6725 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6726 "Register/Memory size mismatch");
6727 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6728 assert(LDBase && "Did not find base load for merging consecutive loads");
6729 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6730 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6731 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6732 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6733 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6735 // TODO: Support offsetting the base load.
6736 if (ByteOffsets[FirstLoadedElt] != 0)
6737 return SDValue();
6739 // Check to see if the element's load is consecutive to the base load
6740 // or offset from a previous (already checked) load.
6741 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6742 LoadSDNode *Ld = Loads[EltIdx];
6743 int64_t ByteOffset = ByteOffsets[EltIdx];
6744 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6745 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6746 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6747 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6749 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6750 EltIdx - FirstLoadedElt);
6753 // Consecutive loads can contain UNDEFs but not ZERO elements.
6754 // Consecutive loads with UNDEF and ZERO elements require an
6755 // additional shuffle stage to clear the ZERO elements.
6756 bool IsConsecutiveLoad = true;
6757 bool IsConsecutiveLoadWithZeros = true;
6758 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6759 if (LoadMask[i]) {
6760 if (!CheckConsecutiveLoad(LDBase, i)) {
6761 IsConsecutiveLoad = false;
6762 IsConsecutiveLoadWithZeros = false;
6763 break;
6765 } else if (ZeroMask[i]) {
6766 IsConsecutiveLoad = false;
6770 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6771 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6772 assert(LDBase->isSimple() &&
6773 "Cannot merge volatile or atomic loads.");
6774 SDValue NewLd =
6775 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6776 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6777 MMOFlags);
6778 for (auto *LD : Loads)
6779 if (LD)
6780 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6781 return NewLd;
6784 // Check if the base load is entirely dereferenceable.
6785 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
6786 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
6788 // LOAD - all consecutive load/undefs (must start/end with a load or be
6789 // entirely dereferenceable). If we have found an entire vector of loads and
6790 // undefs, then return a large load of the entire vector width starting at the
6791 // base pointer. If the vector contains zeros, then attempt to shuffle those
6792 // elements.
6793 if (FirstLoadedElt == 0 &&
6794 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
6795 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6796 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6797 return SDValue();
6799 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6800 // will lower to regular temporal loads and use the cache.
6801 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
6802 VT.is256BitVector() && !Subtarget.hasInt256())
6803 return SDValue();
6805 if (NumElems == 1)
6806 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
6808 if (!ZeroMask)
6809 return CreateLoad(VT, LDBase);
6811 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6812 // vector and a zero vector to clear out the zero elements.
6813 if (!IsAfterLegalize && VT.isVector()) {
6814 unsigned NumMaskElts = VT.getVectorNumElements();
6815 if ((NumMaskElts % NumElems) == 0) {
6816 unsigned Scale = NumMaskElts / NumElems;
6817 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
6818 for (unsigned i = 0; i < NumElems; ++i) {
6819 if (UndefMask[i])
6820 continue;
6821 int Offset = ZeroMask[i] ? NumMaskElts : 0;
6822 for (unsigned j = 0; j != Scale; ++j)
6823 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
6825 SDValue V = CreateLoad(VT, LDBase);
6826 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6827 : DAG.getConstantFP(0.0, DL, VT);
6828 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6833 // If the upper half of a ymm/zmm load is undef then just load the lower half.
6834 if (VT.is256BitVector() || VT.is512BitVector()) {
6835 unsigned HalfNumElems = NumElems / 2;
6836 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
6837 EVT HalfVT =
6838 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
6839 SDValue HalfLD =
6840 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
6841 DAG, Subtarget, IsAfterLegalize);
6842 if (HalfLD)
6843 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
6844 HalfLD, DAG.getIntPtrConstant(0, DL));
6848 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
6849 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6850 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
6851 LoadSizeInBits == 64) &&
6852 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6853 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
6854 : MVT::getIntegerVT(LoadSizeInBits);
6855 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
6856 // Allow v4f32 on SSE1 only targets.
6857 // FIXME: Add more isel patterns so we can just use VT directly.
6858 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
6859 VecVT = MVT::v4f32;
6860 if (TLI.isTypeLegal(VecVT)) {
6861 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6862 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6863 SDValue ResNode = DAG.getMemIntrinsicNode(
6864 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
6865 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
6866 for (auto *LD : Loads)
6867 if (LD)
6868 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6869 return DAG.getBitcast(VT, ResNode);
6873 // BROADCAST - match the smallest possible repetition pattern, load that
6874 // scalar/subvector element and then broadcast to the entire vector.
6875 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
6876 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
6877 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
6878 unsigned RepeatSize = SubElems * BaseSizeInBits;
6879 unsigned ScalarSize = std::min(RepeatSize, 64u);
6880 if (!Subtarget.hasAVX2() && ScalarSize < 32)
6881 continue;
6883 // Don't attempt a 1:N subvector broadcast - it should be caught by
6884 // combineConcatVectorOps, else it will cause infinite loops.
6885 if (RepeatSize > ScalarSize && SubElems == 1)
6886 continue;
6888 bool Match = true;
6889 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
6890 for (unsigned i = 0; i != NumElems && Match; ++i) {
6891 if (!LoadMask[i])
6892 continue;
6893 SDValue Elt = peekThroughBitcasts(Elts[i]);
6894 if (RepeatedLoads[i % SubElems].isUndef())
6895 RepeatedLoads[i % SubElems] = Elt;
6896 else
6897 Match &= (RepeatedLoads[i % SubElems] == Elt);
6900 // We must have loads at both ends of the repetition.
6901 Match &= !RepeatedLoads.front().isUndef();
6902 Match &= !RepeatedLoads.back().isUndef();
6903 if (!Match)
6904 continue;
6906 EVT RepeatVT =
6907 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
6908 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
6909 : EVT::getFloatingPointVT(ScalarSize);
6910 if (RepeatSize > ScalarSize)
6911 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
6912 RepeatSize / ScalarSize);
6913 EVT BroadcastVT =
6914 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
6915 VT.getSizeInBits() / ScalarSize);
6916 if (TLI.isTypeLegal(BroadcastVT)) {
6917 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
6918 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
6919 SDValue Broadcast = RepeatLoad;
6920 if (RepeatSize > ScalarSize) {
6921 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
6922 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
6923 } else {
6924 if (!Subtarget.hasAVX2() &&
6925 !X86::mayFoldLoadIntoBroadcastFromMem(
6926 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
6927 Subtarget,
6928 /*AssumeSingleUse=*/true))
6929 return SDValue();
6930 Broadcast =
6931 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
6933 return DAG.getBitcast(VT, Broadcast);
6939 return SDValue();
6942 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
6943 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
6944 // are consecutive, non-overlapping, and in the right order.
6945 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
6946 SelectionDAG &DAG,
6947 const X86Subtarget &Subtarget,
6948 bool IsAfterLegalize) {
6949 SmallVector<SDValue, 64> Elts;
6950 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
6951 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
6952 Elts.push_back(Elt);
6953 continue;
6955 return SDValue();
6957 assert(Elts.size() == VT.getVectorNumElements());
6958 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
6959 IsAfterLegalize);
6962 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6963 unsigned SplatBitSize, LLVMContext &C) {
6964 unsigned ScalarSize = VT.getScalarSizeInBits();
6966 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
6967 if (VT.isFloatingPoint()) {
6968 if (ScalarSize == 16)
6969 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
6970 if (ScalarSize == 32)
6971 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6972 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6973 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6975 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6978 if (ScalarSize == SplatBitSize)
6979 return getConstantScalar(SplatValue);
6981 unsigned NumElm = SplatBitSize / ScalarSize;
6982 SmallVector<Constant *, 32> ConstantVec;
6983 for (unsigned I = 0; I != NumElm; ++I) {
6984 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
6985 ConstantVec.push_back(getConstantScalar(Val));
6987 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6990 static bool isFoldableUseOfShuffle(SDNode *N) {
6991 for (auto *U : N->uses()) {
6992 unsigned Opc = U->getOpcode();
6993 // VPERMV/VPERMV3 shuffles can never fold their index operands.
6994 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
6995 return false;
6996 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
6997 return false;
6998 if (isTargetShuffle(Opc))
6999 return true;
7000 if (Opc == ISD::BITCAST) // Ignore bitcasts
7001 return isFoldableUseOfShuffle(U);
7002 if (N->hasOneUse()) {
7003 // TODO: there may be some general way to know if an SDNode can
7004 // be folded. We currently only know whether an MI is foldable.
7005 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7006 return false;
7007 return true;
7010 return false;
7013 /// Attempt to use the vbroadcast instruction to generate a splat value
7014 /// from a splat BUILD_VECTOR which uses:
7015 /// a. A single scalar load, or a constant.
7016 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7018 /// The VBROADCAST node is returned when a pattern is found,
7019 /// or SDValue() otherwise.
7020 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7021 const X86Subtarget &Subtarget,
7022 SelectionDAG &DAG) {
7023 // VBROADCAST requires AVX.
7024 // TODO: Splats could be generated for non-AVX CPUs using SSE
7025 // instructions, but there's less potential gain for only 128-bit vectors.
7026 if (!Subtarget.hasAVX())
7027 return SDValue();
7029 MVT VT = BVOp->getSimpleValueType(0);
7030 unsigned NumElts = VT.getVectorNumElements();
7031 SDLoc dl(BVOp);
7033 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7034 "Unsupported vector type for broadcast.");
7036 // See if the build vector is a repeating sequence of scalars (inc. splat).
7037 SDValue Ld;
7038 BitVector UndefElements;
7039 SmallVector<SDValue, 16> Sequence;
7040 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7041 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7042 if (Sequence.size() == 1)
7043 Ld = Sequence[0];
7046 // Attempt to use VBROADCASTM
7047 // From this pattern:
7048 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7049 // b. t1 = (build_vector t0 t0)
7051 // Create (VBROADCASTM v2i1 X)
7052 if (!Sequence.empty() && Subtarget.hasCDI()) {
7053 // If not a splat, are the upper sequence values zeroable?
7054 unsigned SeqLen = Sequence.size();
7055 bool UpperZeroOrUndef =
7056 SeqLen == 1 ||
7057 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7058 return !V || V.isUndef() || isNullConstant(V);
7060 SDValue Op0 = Sequence[0];
7061 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7062 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7063 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7064 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7065 ? Op0.getOperand(0)
7066 : Op0.getOperand(0).getOperand(0);
7067 MVT MaskVT = BOperand.getSimpleValueType();
7068 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7069 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7070 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7071 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7072 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7073 unsigned Scale = 512 / VT.getSizeInBits();
7074 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7076 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7077 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7078 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7079 return DAG.getBitcast(VT, Bcst);
7084 unsigned NumUndefElts = UndefElements.count();
7085 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7086 APInt SplatValue, Undef;
7087 unsigned SplatBitSize;
7088 bool HasUndef;
7089 // Check if this is a repeated constant pattern suitable for broadcasting.
7090 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7091 SplatBitSize > VT.getScalarSizeInBits() &&
7092 SplatBitSize < VT.getSizeInBits()) {
7093 // Avoid replacing with a broadcast when the value is used by a shuffle
7094 // instruction, to preserve the existing custom lowering of shuffles.
7095 if (isFoldableUseOfShuffle(BVOp))
7096 return SDValue();
7097 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
7098 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7099 LLVMContext *Ctx = DAG.getContext();
7100 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7101 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7102 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7103 // Load the constant scalar/subvector and broadcast it.
7104 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7105 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7106 SDValue CP = DAG.getConstantPool(C, PVT);
7107 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7109 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7110 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7111 SDValue Ops[] = {DAG.getEntryNode(), CP};
7112 MachinePointerInfo MPI =
7113 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7114 SDValue Brdcst =
7115 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7116 MPI, Alignment, MachineMemOperand::MOLoad);
7117 return DAG.getBitcast(VT, Brdcst);
7119 if (SplatBitSize > 64) {
7120 // Load the vector of constants and broadcast it.
7121 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7122 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7123 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7124 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7125 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7126 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7127 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7128 MachinePointerInfo MPI =
7129 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7130 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7131 Ops, VVT, MPI, Alignment,
7132 MachineMemOperand::MOLoad);
7136 // If we are moving a scalar into a vector (Ld must be set and all elements
7137 // but 1 are undef) and that operation is not obviously supported by
7138 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7139 // That's better than general shuffling and may eliminate a load to a GPR
7140 // and a move from a scalar to a vector register.
7141 if (!Ld || NumElts - NumUndefElts != 1)
7142 return SDValue();
7143 unsigned ScalarSize = Ld.getValueSizeInBits();
7144 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7145 return SDValue();
7148 bool ConstSplatVal =
7149 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7150 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7152 // TODO: Handle broadcasts of non-constant sequences.
7154 // Make sure that all of the users of a non-constant load are from the
7155 // BUILD_VECTOR node.
7156 // FIXME: Is the use count needed for non-constant, non-load case?
7157 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7158 return SDValue();
7160 unsigned ScalarSize = Ld.getValueSizeInBits();
7161 bool IsGE256 = (VT.getSizeInBits() >= 256);
7163 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7164 // instruction to save 8 or more bytes of constant pool data.
7165 // TODO: If multiple splats are generated to load the same constant,
7166 // it may be detrimental to overall size. There needs to be a way to detect
7167 // that condition to know if this is truly a size win.
7168 bool OptForSize = DAG.shouldOptForSize();
7170 // Handle broadcasting a single constant scalar from the constant pool
7171 // into a vector.
7172 // On Sandybridge (no AVX2), it is still better to load a constant vector
7173 // from the constant pool and not to broadcast it from a scalar.
7174 // But override that restriction when optimizing for size.
7175 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
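// For example, splatting a single f32 constant into a v8f32 keeps only 4
// bytes in the constant pool instead of a full 32-byte vector, at the cost
// of a slightly longer broadcast-load encoding.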
7176 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7177 EVT CVT = Ld.getValueType();
7178 assert(!CVT.isVector() && "Must not broadcast a vector type");
7180 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7181 // For size optimization, also splat v2f64 and v2i64, and for size opt
7182 // with AVX2, also splat i8 and i16.
7183 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7184 if (ScalarSize == 32 ||
7185 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7186 CVT == MVT::f16 ||
7187 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7188 const Constant *C = nullptr;
7189 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7190 C = CI->getConstantIntValue();
7191 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7192 C = CF->getConstantFPValue();
7194 assert(C && "Invalid constant type");
7196 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7197 SDValue CP =
7198 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7199 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7201 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7202 SDValue Ops[] = {DAG.getEntryNode(), CP};
7203 MachinePointerInfo MPI =
7204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7205 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7206 MPI, Alignment, MachineMemOperand::MOLoad);
7210 // Handle AVX2 in-register broadcasts.
7211 if (!IsLoad && Subtarget.hasInt256() &&
7212 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7213 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7215 // The scalar source must be a normal load.
7216 if (!IsLoad)
7217 return SDValue();
7219 // Make sure the non-chain result is only used by this build vector.
7220 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7221 return SDValue();
7223 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7224 (Subtarget.hasVLX() && ScalarSize == 64)) {
7225 auto *LN = cast<LoadSDNode>(Ld);
7226 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7227 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7228 SDValue BCast =
7229 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7230 LN->getMemoryVT(), LN->getMemOperand());
7231 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7232 return BCast;
7235 // The integer check is needed for the 64-bit element into a 128-bit vector
7236 // case so it doesn't match double, since there is no 'vbroadcastsd xmm'.
7237 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7238 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7239 auto *LN = cast<LoadSDNode>(Ld);
7240 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7241 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7242 SDValue BCast =
7243 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7244 LN->getMemoryVT(), LN->getMemOperand());
7245 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7246 return BCast;
7249 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7250 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7252 // Unsupported broadcast.
7253 return SDValue();
7256 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7257 /// underlying vector and index.
7259 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7260 /// index.
7261 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7262 SDValue ExtIdx) {
7263 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7264 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7265 return Idx;
7267 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7268 // lowered this:
7269 // (extract_vector_elt (v8f32 %1), Constant<6>)
7270 // to:
7271 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7272 // (extract_subvector (v8f32 %0), Constant<4>),
7273 // undef)
7274 // Constant<0>)
7275 // In this case the vector is the extract_subvector expression and the index
7276 // is 2, as specified by the shuffle.
7277 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7278 SDValue ShuffleVec = SVOp->getOperand(0);
7279 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7280 assert(ShuffleVecVT.getVectorElementType() ==
7281 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7283 int ShuffleIdx = SVOp->getMaskElt(Idx);
7284 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7285 ExtractedFromVec = ShuffleVec;
7286 return ShuffleIdx;
7288 return Idx;
7291 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7292 MVT VT = Op.getSimpleValueType();
7294 // Skip if insert_vec_elt is not supported.
7295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7296 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7297 return SDValue();
7299 SDLoc DL(Op);
7300 unsigned NumElems = Op.getNumOperands();
7302 SDValue VecIn1;
7303 SDValue VecIn2;
7304 SmallVector<unsigned, 4> InsertIndices;
7305 SmallVector<int, 8> Mask(NumElems, -1);
7307 for (unsigned i = 0; i != NumElems; ++i) {
7308 unsigned Opc = Op.getOperand(i).getOpcode();
7310 if (Opc == ISD::UNDEF)
7311 continue;
7313 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7314 // Quit if more than 1 element needs inserting.
7315 if (InsertIndices.size() > 1)
7316 return SDValue();
7318 InsertIndices.push_back(i);
7319 continue;
7322 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7323 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7325 // Quit if non-constant index.
7326 if (!isa<ConstantSDNode>(ExtIdx))
7327 return SDValue();
7328 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7330 // Quit if extracted from a vector of a different type.
7331 if (ExtractedFromVec.getValueType() != VT)
7332 return SDValue();
7334 if (!VecIn1.getNode())
7335 VecIn1 = ExtractedFromVec;
7336 else if (VecIn1 != ExtractedFromVec) {
7337 if (!VecIn2.getNode())
7338 VecIn2 = ExtractedFromVec;
7339 else if (VecIn2 != ExtractedFromVec)
7340 // Quit if more than 2 vectors to shuffle
7341 return SDValue();
7344 if (ExtractedFromVec == VecIn1)
7345 Mask[i] = Idx;
7346 else if (ExtractedFromVec == VecIn2)
7347 Mask[i] = Idx + NumElems;
7350 if (!VecIn1.getNode())
7351 return SDValue();
7353 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7354 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7356 for (unsigned Idx : InsertIndices)
7357 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7358 DAG.getIntPtrConstant(Idx, DL));
7360 return NV;
7363 // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7364 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7365 const X86Subtarget &Subtarget) {
7366 MVT VT = Op.getSimpleValueType();
7367 MVT IVT = VT.changeVectorElementTypeToInteger();
7368 SmallVector<SDValue, 16> NewOps;
7369 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7370 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
7371 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7372 return DAG.getBitcast(VT, Res);
7375 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7376 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7377 const X86Subtarget &Subtarget) {
7379 MVT VT = Op.getSimpleValueType();
7380 assert((VT.getVectorElementType() == MVT::i1) &&
7381 "Unexpected type in LowerBUILD_VECTORvXi1!");
7383 SDLoc dl(Op);
7384 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7385 ISD::isBuildVectorAllOnes(Op.getNode()))
7386 return Op;
7388 uint64_t Immediate = 0;
7389 SmallVector<unsigned, 16> NonConstIdx;
7390 bool IsSplat = true;
7391 bool HasConstElts = false;
7392 int SplatIdx = -1;
7393 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7394 SDValue In = Op.getOperand(idx);
7395 if (In.isUndef())
7396 continue;
7397 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7398 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7399 HasConstElts = true;
7400 } else {
7401 NonConstIdx.push_back(idx);
7403 if (SplatIdx < 0)
7404 SplatIdx = idx;
7405 else if (In != Op.getOperand(SplatIdx))
7406 IsSplat = false;
7409 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7410 if (IsSplat) {
7411 // The build_vector allows the scalar element to be larger than the vector
7412 // element type. We need to mask it to use it as a condition unless we know
7413 // the upper bits are zero.
7414 // FIXME: Use computeKnownBits instead of checking specific opcode?
7415 SDValue Cond = Op.getOperand(SplatIdx);
7416 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7417 if (Cond.getOpcode() != ISD::SETCC)
7418 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7419 DAG.getConstant(1, dl, MVT::i8));
7421 // Perform the select in the scalar domain so we can use cmov.
7422 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7423 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7424 DAG.getAllOnesConstant(dl, MVT::i32),
7425 DAG.getConstant(0, dl, MVT::i32));
7426 Select = DAG.getBitcast(MVT::v32i1, Select);
7427 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7428 } else {
7429 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7430 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7431 DAG.getAllOnesConstant(dl, ImmVT),
7432 DAG.getConstant(0, dl, ImmVT));
7433 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7434 Select = DAG.getBitcast(VecVT, Select);
7435 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7436 DAG.getIntPtrConstant(0, dl));
7440 // Insert the non-constant elements one by one.
7441 SDValue DstVec;
7442 if (HasConstElts) {
7443 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7444 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7445 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7446 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7447 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7448 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7449 } else {
7450 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7451 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7452 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7453 DstVec = DAG.getBitcast(VecVT, Imm);
7454 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7455 DAG.getIntPtrConstant(0, dl));
7457 } else
7458 DstVec = DAG.getUNDEF(VT);
7460 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7461 unsigned InsertIdx = NonConstIdx[i];
7462 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7463 Op.getOperand(InsertIdx),
7464 DAG.getIntPtrConstant(InsertIdx, dl));
7466 return DstVec;
7469 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7470 switch (Opcode) {
7471 case X86ISD::PACKSS:
7472 case X86ISD::PACKUS:
7473 case X86ISD::FHADD:
7474 case X86ISD::FHSUB:
7475 case X86ISD::HADD:
7476 case X86ISD::HSUB:
7477 return true;
7479 return false;
7482 /// This is a helper function of LowerToHorizontalOp().
7483 /// This function checks that the input build_vector \p N implements a
7484 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7485 /// may not match the layout of an x86 256-bit horizontal instruction.
7486 /// In other words, if this returns true, then some extraction/insertion will
7487 /// be required to produce a valid horizontal instruction.
7489 /// Parameter \p Opcode defines the kind of horizontal operation to match.
7490 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7491 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7492 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7493 /// arithmetic sub.
7495 /// This function only analyzes elements of \p N whose indices are
7496 /// in range [BaseIdx, LastIdx).
7498 /// TODO: This function was originally used to match both real and fake partial
7499 /// horizontal operations, but the index-matching logic is incorrect for that.
7500 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
7501 /// code because it is only used for partial h-op matching now?
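///
/// A sketch of one accepted pattern, assuming \p Opcode == ISD::ADD, a v8i32
/// build_vector and [BaseIdx, LastIdx) == [0, 4):
///   elt0 = (add (extract_vector_elt A, 0), (extract_vector_elt A, 1))
///   elt1 = (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
///   elt2 = (add (extract_vector_elt B, 0), (extract_vector_elt B, 1))
///   elt3 = (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// is reported with \p V0 = A and \p V1 = B; the upper half of the
/// build_vector is matched by a second call with [BaseIdx, LastIdx) == [4, 8).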
7502 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7503 SelectionDAG &DAG,
7504 unsigned BaseIdx, unsigned LastIdx,
7505 SDValue &V0, SDValue &V1) {
7506 EVT VT = N->getValueType(0);
7507 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7508 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7509 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7510 "Invalid Vector in input!");
7512 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7513 bool CanFold = true;
7514 unsigned ExpectedVExtractIdx = BaseIdx;
7515 unsigned NumElts = LastIdx - BaseIdx;
7516 V0 = DAG.getUNDEF(VT);
7517 V1 = DAG.getUNDEF(VT);
7519 // Check if N implements a horizontal binop.
7520 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7521 SDValue Op = N->getOperand(i + BaseIdx);
7523 // Skip UNDEFs.
7524 if (Op->isUndef()) {
7525 // Update the expected vector extract index.
7526 if (i * 2 == NumElts)
7527 ExpectedVExtractIdx = BaseIdx;
7528 ExpectedVExtractIdx += 2;
7529 continue;
7532 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7534 if (!CanFold)
7535 break;
7537 SDValue Op0 = Op.getOperand(0);
7538 SDValue Op1 = Op.getOperand(1);
7540 // Try to match the following pattern:
7541 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7542 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7543 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7544 Op0.getOperand(0) == Op1.getOperand(0) &&
7545 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7546 isa<ConstantSDNode>(Op1.getOperand(1)));
7547 if (!CanFold)
7548 break;
7550 unsigned I0 = Op0.getConstantOperandVal(1);
7551 unsigned I1 = Op1.getConstantOperandVal(1);
7553 if (i * 2 < NumElts) {
7554 if (V0.isUndef()) {
7555 V0 = Op0.getOperand(0);
7556 if (V0.getValueType() != VT)
7557 return false;
7559 } else {
7560 if (V1.isUndef()) {
7561 V1 = Op0.getOperand(0);
7562 if (V1.getValueType() != VT)
7563 return false;
7565 if (i * 2 == NumElts)
7566 ExpectedVExtractIdx = BaseIdx;
7569 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7570 if (I0 == ExpectedVExtractIdx)
7571 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7572 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7573 // Try to match the following dag sequence:
7574 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7575 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7576 } else
7577 CanFold = false;
7579 ExpectedVExtractIdx += 2;
7582 return CanFold;
7585 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7586 /// a concat_vector.
7588 /// This is a helper function of LowerToHorizontalOp().
7589 /// This function expects two 256-bit vectors called V0 and V1.
7590 /// At first, each vector is split into two separate 128-bit vectors.
7591 /// Then, the resulting 128-bit vectors are used to implement two
7592 /// horizontal binary operations.
7594 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7596 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input
7597 /// to the two new horizontal binops.
7598 /// When Mode is set, the first horizontal binop dag node takes as input the
7599 /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
7600 /// binop dag node takes as input the lower 128 bits of V1 and the upper
7601 /// 128 bits of V1.
7602 /// Example:
7603 /// HADD V0_LO, V0_HI
7604 /// HADD V1_LO, V1_HI
7606 /// Otherwise, the first horizontal binop dag node takes as input the lower
7607 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
7608 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
7609 /// Example:
7610 /// HADD V0_LO, V1_LO
7611 /// HADD V0_HI, V1_HI
7613 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7614 /// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7615 /// the upper 128 bits of the result.
7616 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7617 const SDLoc &DL, SelectionDAG &DAG,
7618 unsigned X86Opcode, bool Mode,
7619 bool isUndefLO, bool isUndefHI) {
7620 MVT VT = V0.getSimpleValueType();
7621 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7622 "Invalid nodes in input!");
7624 unsigned NumElts = VT.getVectorNumElements();
7625 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7626 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7627 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7628 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7629 MVT NewVT = V0_LO.getSimpleValueType();
7631 SDValue LO = DAG.getUNDEF(NewVT);
7632 SDValue HI = DAG.getUNDEF(NewVT);
7634 if (Mode) {
7635 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7636 if (!isUndefLO && !V0->isUndef())
7637 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7638 if (!isUndefHI && !V1->isUndef())
7639 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7640 } else {
7641 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7642 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7643 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7645 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7646 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7649 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7652 /// Returns true iff \p BV builds a vector with the result equivalent to
7653 /// the result of an ADDSUB/SUBADD operation.
7654 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7655 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7656 /// \p Opnd0 and \p Opnd1.
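///
/// A sketch of the ADDSUB form (even lanes subtract, odd lanes add) for a
/// v4f32 build_vector:
///   elt0 = (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0))
///   elt1 = (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1))
///   elt2 = (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2))
///   elt3 = (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// which yields \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd = false.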
7657 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7658 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7659 SDValue &Opnd0, SDValue &Opnd1,
7660 unsigned &NumExtracts,
7661 bool &IsSubAdd) {
7663 MVT VT = BV->getSimpleValueType(0);
7664 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7665 return false;
7667 unsigned NumElts = VT.getVectorNumElements();
7668 SDValue InVec0 = DAG.getUNDEF(VT);
7669 SDValue InVec1 = DAG.getUNDEF(VT);
7671 NumExtracts = 0;
7673 // Odd-numbered elements in the input build vector are obtained from
7674 // adding/subtracting two integer/float elements.
7675 // Even-numbered elements in the input build vector are obtained from
7676 // subtracting/adding two integer/float elements.
7677 unsigned Opc[2] = {0, 0};
7678 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7679 SDValue Op = BV->getOperand(i);
7681 // Skip 'undef' values.
7682 unsigned Opcode = Op.getOpcode();
7683 if (Opcode == ISD::UNDEF)
7684 continue;
7686 // Early exit if we found an unexpected opcode.
7687 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7688 return false;
7690 SDValue Op0 = Op.getOperand(0);
7691 SDValue Op1 = Op.getOperand(1);
7693 // Try to match the following pattern:
7694 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7695 // Early exit if we cannot match that sequence.
7696 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7697 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7698 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7699 Op0.getOperand(1) != Op1.getOperand(1))
7700 return false;
7702 unsigned I0 = Op0.getConstantOperandVal(1);
7703 if (I0 != i)
7704 return false;
7706 // We found a valid add/sub node; make sure it's the same opcode as previous
7707 // elements for this parity.
7708 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7709 return false;
7710 Opc[i % 2] = Opcode;
7712 // Update InVec0 and InVec1.
7713 if (InVec0.isUndef()) {
7714 InVec0 = Op0.getOperand(0);
7715 if (InVec0.getSimpleValueType() != VT)
7716 return false;
7718 if (InVec1.isUndef()) {
7719 InVec1 = Op1.getOperand(0);
7720 if (InVec1.getSimpleValueType() != VT)
7721 return false;
7724 // Make sure that the input operands to each add/sub node always
7725 // come from the same pair of vectors.
7726 if (InVec0 != Op0.getOperand(0)) {
7727 if (Opcode == ISD::FSUB)
7728 return false;
7730 // FADD is commutable. Try to commute the operands
7731 // and then test again.
7732 std::swap(Op0, Op1);
7733 if (InVec0 != Op0.getOperand(0))
7734 return false;
7737 if (InVec1 != Op1.getOperand(0))
7738 return false;
7740 // Increment the number of extractions done.
7741 ++NumExtracts;
7744 // Ensure we have found an opcode for both parities and that they are
7745 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7746 // inputs are undef.
7747 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7748 InVec0.isUndef() || InVec1.isUndef())
7749 return false;
7751 IsSubAdd = Opc[0] == ISD::FADD;
7753 Opnd0 = InVec0;
7754 Opnd1 = InVec1;
7755 return true;
7758 /// Returns true if it is possible to fold a MUL and an idiom that has already
7759 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7760 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7761 /// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0, \p Opnd1 and \p Opnd2.
7763 /// Prior to calling this function it should be known that there is some
7764 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7765 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7766 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7767 /// of \p Opnd0 uses is expected to be equal to 2.
7768 /// For example, this function may be called for the following IR:
7769 /// %AB = fmul fast <2 x double> %A, %B
7770 /// %Sub = fsub fast <2 x double> %AB, %C
7771 /// %Add = fadd fast <2 x double> %AB, %C
7772 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7773 /// <2 x i32> <i32 0, i32 3>
7774 /// There is a def for %Addsub here, which potentially can be replaced by
7775 /// X86ISD::ADDSUB operation:
7776 /// %Addsub = X86ISD::ADDSUB %AB, %C
7777 /// and such ADDSUB can further be replaced with FMADDSUB:
7778 /// %Addsub = FMADDSUB %A, %B, %C.
7780 /// The main reason why this method is called before the replacement of the
7781 /// recognized ADDSUB idiom with the ADDSUB operation is that such a
7782 /// replacement is sometimes illegal. E.g. 512-bit ADDSUB is not available,
7783 /// while 512-bit FMADDSUB is.
7784 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7785 SelectionDAG &DAG,
7786 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7787 unsigned ExpectedUses) {
7788 if (Opnd0.getOpcode() != ISD::FMUL ||
7789 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7790 return false;
7792 // FIXME: These checks must match the similar ones in
7793 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7794 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7795 // or MUL + ADDSUB to FMADDSUB.
7796 const TargetOptions &Options = DAG.getTarget().Options;
7797 bool AllowFusion =
7798 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7799 if (!AllowFusion)
7800 return false;
7802 Opnd2 = Opnd1;
7803 Opnd1 = Opnd0.getOperand(1);
7804 Opnd0 = Opnd0.getOperand(0);
7806 return true;
7809 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
7810 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
7811 /// X86ISD::FMSUBADD node accordingly.
7812 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7813 const X86Subtarget &Subtarget,
7814 SelectionDAG &DAG) {
7815 SDValue Opnd0, Opnd1;
7816 unsigned NumExtracts;
7817 bool IsSubAdd;
7818 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7819 IsSubAdd))
7820 return SDValue();
7822 MVT VT = BV->getSimpleValueType(0);
7823 SDLoc DL(BV);
7825 // Try to generate X86ISD::FMADDSUB node here.
7826 SDValue Opnd2;
7827 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7828 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7829 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7832 // We only support ADDSUB.
7833 if (IsSubAdd)
7834 return SDValue();
7836 // There are no known X86 targets with 512-bit ADDSUB instructions!
7837 // Convert to blend(fsub,fadd).
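// For example (a sketch), with VT == v8f64 this builds the shuffle mask
// <0, 9, 2, 11, 4, 13, 6, 15>: even result lanes come from the FSUB node
// and odd result lanes from the FADD node.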
7838 if (VT.is512BitVector()) {
7839 SmallVector<int> Mask;
7840 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
7841 Mask.push_back(I);
7842 Mask.push_back(I + E + 1);
7844 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
7845 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
7846 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
7849 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7852 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
7853 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
7854 // Initialize outputs to known values.
7855 MVT VT = BV->getSimpleValueType(0);
7856 HOpcode = ISD::DELETED_NODE;
7857 V0 = DAG.getUNDEF(VT);
7858 V1 = DAG.getUNDEF(VT);
7860 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
7861 // half of the result is calculated independently from the 128-bit halves of
7862 // the inputs, so that makes the index-checking logic below more complicated.
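// E.g. (a sketch) a v8i32 HADD on 256-bit sources A and B produces:
//   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
//     A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
// so the expected source and extract indices depend on which 64-bit half of
// each 128-bit chunk an element falls into.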
7863 unsigned NumElts = VT.getVectorNumElements();
7864 unsigned GenericOpcode = ISD::DELETED_NODE;
7865 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
7866 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
7867 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
7868 for (unsigned i = 0; i != Num128BitChunks; ++i) {
7869 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
7870 // Ignore undef elements.
7871 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
7872 if (Op.isUndef())
7873 continue;
7875 // If there's an opcode mismatch, we're done.
7876 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
7877 return false;
7879 // Initialize horizontal opcode.
7880 if (HOpcode == ISD::DELETED_NODE) {
7881 GenericOpcode = Op.getOpcode();
7882 switch (GenericOpcode) {
7883 case ISD::ADD: HOpcode = X86ISD::HADD; break;
7884 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
7885 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
7886 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
7887 default: return false;
7891 SDValue Op0 = Op.getOperand(0);
7892 SDValue Op1 = Op.getOperand(1);
7893 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7894 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7895 Op0.getOperand(0) != Op1.getOperand(0) ||
7896 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7897 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
7898 return false;
7900 // The source vector is chosen based on which 64-bit half of the
7901 // destination vector is being calculated.
7902 if (j < NumEltsIn64Bits) {
7903 if (V0.isUndef())
7904 V0 = Op0.getOperand(0);
7905 } else {
7906 if (V1.isUndef())
7907 V1 = Op0.getOperand(0);
7910 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
7911 if (SourceVec != Op0.getOperand(0))
7912 return false;
7914 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
7915 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
7916 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
7917 unsigned ExpectedIndex = i * NumEltsIn128Bits +
7918 (j % NumEltsIn64Bits) * 2;
7919 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
7920 continue;
7922 // If this is not a commutative op, this does not match.
7923 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
7924 return false;
7926 // Addition is commutative, so try swapping the extract indexes.
7927 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
7928 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
7929 continue;
7931 // Extract indexes do not match horizontal requirement.
7932 return false;
7935 // We matched. Opcode and operands are returned by reference as arguments.
7936 return true;
7939 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
7940 SelectionDAG &DAG, unsigned HOpcode,
7941 SDValue V0, SDValue V1) {
7942 // If either input vector is not the same size as the build vector,
7943 // extract/insert the low bits to the correct size.
7944 // This is free (examples: zmm --> xmm, xmm --> ymm).
7945 MVT VT = BV->getSimpleValueType(0);
7946 unsigned Width = VT.getSizeInBits();
7947 if (V0.getValueSizeInBits() > Width)
7948 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
7949 else if (V0.getValueSizeInBits() < Width)
7950 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
7952 if (V1.getValueSizeInBits() > Width)
7953 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
7954 else if (V1.getValueSizeInBits() < Width)
7955 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
7957 unsigned NumElts = VT.getVectorNumElements();
7958 APInt DemandedElts = APInt::getAllOnes(NumElts);
7959 for (unsigned i = 0; i != NumElts; ++i)
7960 if (BV->getOperand(i).isUndef())
7961 DemandedElts.clearBit(i);
7963 // If we don't need the upper xmm, then perform as an xmm hop.
7964 unsigned HalfNumElts = NumElts / 2;
7965 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
7966 MVT HalfVT = VT.getHalfNumVectorElementsVT();
7967 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
7968 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
7969 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
7970 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
7973 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
7976 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7977 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7978 const X86Subtarget &Subtarget,
7979 SelectionDAG &DAG) {
7980 // We need at least 2 non-undef elements to make this worthwhile by default.
7981 unsigned NumNonUndefs =
7982 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
7983 if (NumNonUndefs < 2)
7984 return SDValue();
7986 // There are 4 sets of horizontal math operations distinguished by type:
7987 // int/FP at 128-bit/256-bit. Each type was introduced with a different
7988 // subtarget feature. Try to match those "native" patterns first.
7989 MVT VT = BV->getSimpleValueType(0);
7990 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
7991 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
7992 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
7993 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
7994 unsigned HOpcode;
7995 SDValue V0, V1;
7996 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
7997 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8000 // Try harder to match 256-bit ops by using extract/concat.
8001 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8002 return SDValue();
8004 // Count the number of UNDEF operands in the input build_vector.
8005 unsigned NumElts = VT.getVectorNumElements();
8006 unsigned Half = NumElts / 2;
8007 unsigned NumUndefsLO = 0;
8008 unsigned NumUndefsHI = 0;
8009 for (unsigned i = 0, e = Half; i != e; ++i)
8010 if (BV->getOperand(i)->isUndef())
8011 NumUndefsLO++;
8013 for (unsigned i = Half, e = NumElts; i != e; ++i)
8014 if (BV->getOperand(i)->isUndef())
8015 NumUndefsHI++;
8017 SDLoc DL(BV);
8018 SDValue InVec0, InVec1;
8019 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8020 SDValue InVec2, InVec3;
8021 unsigned X86Opcode;
8022 bool CanFold = true;
8024 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
8025 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
8026 InVec3) &&
8027 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8028 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8029 X86Opcode = X86ISD::HADD;
8030 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
8031 InVec1) &&
8032 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
8033 InVec3) &&
8034 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8035 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8036 X86Opcode = X86ISD::HSUB;
8037 else
8038 CanFold = false;
8040 if (CanFold) {
8041 // Do not try to expand this build_vector into a pair of horizontal
8042 // add/sub if we can emit a pair of scalar add/sub.
8043 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8044 return SDValue();
8046 // Convert this build_vector into a pair of horizontal binops followed by
8047 // a concat vector. We must adjust the outputs from the partial horizontal
8048 // matching calls above to account for undefined vector halves.
8049 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8050 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8051 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8052 bool isUndefLO = NumUndefsLO == Half;
8053 bool isUndefHI = NumUndefsHI == Half;
8054 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8055 isUndefHI);
8059 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8060 VT == MVT::v16i16) {
8061 unsigned X86Opcode;
8062 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
8063 X86Opcode = X86ISD::HADD;
8064 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
8065 InVec1))
8066 X86Opcode = X86ISD::HSUB;
8067 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
8068 InVec1))
8069 X86Opcode = X86ISD::FHADD;
8070 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
8071 InVec1))
8072 X86Opcode = X86ISD::FHSUB;
8073 else
8074 return SDValue();
8076 // Don't try to expand this build_vector into a pair of horizontal add/sub
8077 // if we can simply emit a pair of scalar add/sub.
8078 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8079 return SDValue();
8081 // Convert this build_vector into two horizontal add/subs followed by
8082 // a concat vector.
8083 bool isUndefLO = NumUndefsLO == Half;
8084 bool isUndefHI = NumUndefsHI == Half;
8085 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8086 isUndefLO, isUndefHI);
8089 return SDValue();
8092 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8093 SelectionDAG &DAG);
8095 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
8096 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8097 /// just apply the bit operation to the vectors.
8098 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8099 /// from this, but enough scalar bit operations are created by the later
8100 /// legalization + scalarization stages to need basic support.
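///
/// A sketch of the rewrite for a v4i32 build_vector of shifts by a uniform
/// immediate:
///   (build_vector (shl a, 5), (shl b, 5), (shl c, 5), (shl d, 5))
///     --> (shl (build_vector a, b, c, d), (build_vector 5, 5, 5, 5))
/// with the resulting vector shift lowered immediately via LowerShift().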
8101 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
8102 const X86Subtarget &Subtarget,
8103 SelectionDAG &DAG) {
8104 SDLoc DL(Op);
8105 MVT VT = Op->getSimpleValueType(0);
8106 unsigned NumElems = VT.getVectorNumElements();
8107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8109 // Check that all elements have the same opcode.
8110 // TODO: Should we allow UNDEFS and if so how many?
8111 unsigned Opcode = Op->getOperand(0).getOpcode();
8112 for (unsigned i = 1; i < NumElems; ++i)
8113 if (Opcode != Op->getOperand(i).getOpcode())
8114 return SDValue();
8116 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8117 bool IsShift = false;
8118 switch (Opcode) {
8119 default:
8120 return SDValue();
8121 case ISD::SHL:
8122 case ISD::SRL:
8123 case ISD::SRA:
8124 IsShift = true;
8125 break;
8126 case ISD::AND:
8127 case ISD::XOR:
8128 case ISD::OR:
8129 // Don't do this if the buildvector is a splat - we'd replace one
8130 // constant with an entire vector.
8131 if (Op->getSplatValue())
8132 return SDValue();
8133 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8134 return SDValue();
8135 break;
8138 SmallVector<SDValue, 4> LHSElts, RHSElts;
8139 for (SDValue Elt : Op->ops()) {
8140 SDValue LHS = Elt.getOperand(0);
8141 SDValue RHS = Elt.getOperand(1);
8143 // We expect the canonicalized RHS operand to be the constant.
8144 if (!isa<ConstantSDNode>(RHS))
8145 return SDValue();
8147 // Extend shift amounts.
8148 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8149 if (!IsShift)
8150 return SDValue();
8151 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8154 LHSElts.push_back(LHS);
8155 RHSElts.push_back(RHS);
8158 // Limit to shifts by uniform immediates.
8159 // TODO: Only accept vXi8/vXi64 special cases?
8160 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8161 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8162 return SDValue();
8164 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8165 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8166 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8168 if (!IsShift)
8169 return Res;
8171 // Immediately lower the shift to ensure the constant build vector doesn't
8172 // get converted to a constant pool before the shift is lowered.
8173 return LowerShift(Res, Subtarget, DAG);
8176 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
8177 /// functionality to do this, so it's all zeros, all ones, or some derivation
8178 /// that is cheap to calculate.
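///
/// E.g. an all-zeros build_vector is returned unchanged (it is matched later
/// by pxor/xorps), while an all-ones v8i16 is materialized via getOnesVector().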
8179 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
8180 const X86Subtarget &Subtarget) {
8181 SDLoc DL(Op);
8182 MVT VT = Op.getSimpleValueType();
8184 // Vectors containing all zeros can be matched by pxor and xorps.
8185 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8186 return Op;
8188 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8189 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8190 // vpcmpeqd on 256-bit vectors.
8191 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8192 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8193 return Op;
8195 return getOnesVector(VT, DAG, DL);
8198 return SDValue();
8201 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8202 /// from a vector of source values and a vector of extraction indices.
8203 /// The vectors might be manipulated to match the type of the permute op.
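///
/// For instance (a sketch), a v4i32 variable permute on an SSSE3-only target
/// is performed as a v16i8 PSHUFB: each i32 index is scaled by 4 and expanded
/// to the byte indices <4*i+0, 4*i+1, 4*i+2, 4*i+3> by ScaleIndices below.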
8204 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8205 SDLoc &DL, SelectionDAG &DAG,
8206 const X86Subtarget &Subtarget) {
8207 MVT ShuffleVT = VT;
8208 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8209 unsigned NumElts = VT.getVectorNumElements();
8210 unsigned SizeInBits = VT.getSizeInBits();
8212 // Adjust IndicesVec to match VT size.
8213 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8214 "Illegal variable permute mask size");
8215 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8216 // Narrow/widen the indices vector to the correct size.
8217 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8218 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8219 NumElts * VT.getScalarSizeInBits());
8220 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8221 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8222 SDLoc(IndicesVec), SizeInBits);
8223 // Zero-extend the index elements within the vector.
8224 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8225 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8226 IndicesVT, IndicesVec);
8228 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8230 // Handle a SrcVec whose size doesn't match the VT size.
8231 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8232 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8233 // Handle larger SrcVec by treating it as a larger permute.
8234 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8235 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8236 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8237 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8238 Subtarget, DAG, SDLoc(IndicesVec));
8239 SDValue NewSrcVec =
8240 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8241 if (NewSrcVec)
8242 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8243 return SDValue();
8244 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8245 // Widen smaller SrcVec to match VT.
8246 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8247 } else
8248 return SDValue();
8251 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8252 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8253 EVT SrcVT = Idx.getValueType();
8254 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8255 uint64_t IndexScale = 0;
8256 uint64_t IndexOffset = 0;
8258 // If we're scaling a smaller permute op, then we need to repeat the
8259 // indices, scaling and offsetting them as well.
8260 // e.g. v4i32 -> v16i8 (Scale = 4)
8261 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8262 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8263 for (uint64_t i = 0; i != Scale; ++i) {
8264 IndexScale |= Scale << (i * NumDstBits);
8265 IndexOffset |= i << (i * NumDstBits);
8268 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8269 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8270 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8271 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8272 return Idx;
8275 unsigned Opcode = 0;
8276 switch (VT.SimpleTy) {
8277 default:
8278 break;
8279 case MVT::v16i8:
8280 if (Subtarget.hasSSSE3())
8281 Opcode = X86ISD::PSHUFB;
8282 break;
8283 case MVT::v8i16:
8284 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8285 Opcode = X86ISD::VPERMV;
8286 else if (Subtarget.hasSSSE3()) {
8287 Opcode = X86ISD::PSHUFB;
8288 ShuffleVT = MVT::v16i8;
8290 break;
8291 case MVT::v4f32:
8292 case MVT::v4i32:
8293 if (Subtarget.hasAVX()) {
8294 Opcode = X86ISD::VPERMILPV;
8295 ShuffleVT = MVT::v4f32;
8296 } else if (Subtarget.hasSSSE3()) {
8297 Opcode = X86ISD::PSHUFB;
8298 ShuffleVT = MVT::v16i8;
8300 break;
8301 case MVT::v2f64:
8302 case MVT::v2i64:
8303 if (Subtarget.hasAVX()) {
8304 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8305 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8306 Opcode = X86ISD::VPERMILPV;
8307 ShuffleVT = MVT::v2f64;
8308 } else if (Subtarget.hasSSE41()) {
8309 // SSE41 can compare v2i64 - select between indices 0 and 1.
8310 return DAG.getSelectCC(
8311 DL, IndicesVec,
8312 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8313 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8314 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8315 ISD::CondCode::SETEQ);
8317 break;
8318 case MVT::v32i8:
8319 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8320 Opcode = X86ISD::VPERMV;
8321 else if (Subtarget.hasXOP()) {
8322 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8323 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8324 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8325 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8326 return DAG.getNode(
8327 ISD::CONCAT_VECTORS, DL, VT,
8328 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8329 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8330 } else if (Subtarget.hasAVX()) {
8331 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8332 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8333 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8334 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8335 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8336 ArrayRef<SDValue> Ops) {
8337 // Permute Lo and Hi and then select based on index range.
8338 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8339 // care about bit[7] as it's just an index vector.
8340 SDValue Idx = Ops[2];
8341 EVT VT = Idx.getValueType();
8342 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8343 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8344 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8345 ISD::CondCode::SETGT);
8347 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8348 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8349 PSHUFBBuilder);
8351 break;
8352 case MVT::v16i16:
8353 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8354 Opcode = X86ISD::VPERMV;
8355 else if (Subtarget.hasAVX()) {
8356 // Scale to v32i8 and perform as v32i8.
8357 IndicesVec = ScaleIndices(IndicesVec, 2);
8358 return DAG.getBitcast(
8359 VT, createVariablePermute(
8360 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8361 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8363 break;
8364 case MVT::v8f32:
8365 case MVT::v8i32:
8366 if (Subtarget.hasAVX2())
8367 Opcode = X86ISD::VPERMV;
8368 else if (Subtarget.hasAVX()) {
8369 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8370 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8371 {0, 1, 2, 3, 0, 1, 2, 3});
8372 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8373 {4, 5, 6, 7, 4, 5, 6, 7});
8374 if (Subtarget.hasXOP())
8375 return DAG.getBitcast(
8376 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8377 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8378 // Permute Lo and Hi and then select based on index range.
8379 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8380 SDValue Res = DAG.getSelectCC(
8381 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8382 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8383 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8384 ISD::CondCode::SETGT);
8385 return DAG.getBitcast(VT, Res);
8387 break;
8388 case MVT::v4i64:
8389 case MVT::v4f64:
8390 if (Subtarget.hasAVX512()) {
8391 if (!Subtarget.hasVLX()) {
8392 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8393 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8394 SDLoc(SrcVec));
8395 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8396 DAG, SDLoc(IndicesVec));
8397 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8398 DAG, Subtarget);
8399 return extract256BitVector(Res, 0, DAG, DL);
8401 Opcode = X86ISD::VPERMV;
8402 } else if (Subtarget.hasAVX()) {
8403 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8404 SDValue LoLo =
8405 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8406 SDValue HiHi =
8407 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8408 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8409 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8410 if (Subtarget.hasXOP())
8411 return DAG.getBitcast(
8412 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8413 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8414 // Permute Lo and Hi and then select based on index range.
8415 // This works as VPERMILPD only uses index bit[1] to permute elements.
8416 SDValue Res = DAG.getSelectCC(
8417 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8418 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8419 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8420 ISD::CondCode::SETGT);
8421 return DAG.getBitcast(VT, Res);
8423 break;
8424 case MVT::v64i8:
8425 if (Subtarget.hasVBMI())
8426 Opcode = X86ISD::VPERMV;
8427 break;
8428 case MVT::v32i16:
8429 if (Subtarget.hasBWI())
8430 Opcode = X86ISD::VPERMV;
8431 break;
8432 case MVT::v16f32:
8433 case MVT::v16i32:
8434 case MVT::v8f64:
8435 case MVT::v8i64:
8436 if (Subtarget.hasAVX512())
8437 Opcode = X86ISD::VPERMV;
8438 break;
8440 if (!Opcode)
8441 return SDValue();
8443 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8444 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8445 "Illegal variable permute shuffle type");
8447 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8448 if (Scale > 1)
8449 IndicesVec = ScaleIndices(IndicesVec, Scale);
8451 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8452 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8454 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8455 SDValue Res = Opcode == X86ISD::VPERMV
8456 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8457 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8458 return DAG.getBitcast(VT, Res);
8461 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8462 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8463 // (build_vector (extract_elt V, (extract_elt I, 0)),
8464 // (extract_elt V, (extract_elt I, 1)),
8465 // ...
8466 // ->
8467 // (vpermv I, V)
8469 // TODO: Handle undefs
8470 // TODO: Utilize pshufb and zero mask blending to support more efficient
8471 // construction of vectors with constant-0 elements.
8472 static SDValue
8473 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8474 const X86Subtarget &Subtarget) {
8475 SDValue SrcVec, IndicesVec;
8476 // Check for a match of the permute source vector and permute index elements.
8477 // This is done by checking that the i-th build_vector operand is of the form:
8478 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8479 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8480 SDValue Op = V.getOperand(Idx);
8481 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8482 return SDValue();
8484 // If this is the first extract encountered in V, set the source vector,
8485 // otherwise verify the extract is from the previously defined source
8486 // vector.
8487 if (!SrcVec)
8488 SrcVec = Op.getOperand(0);
8489 else if (SrcVec != Op.getOperand(0))
8490 return SDValue();
8491 SDValue ExtractedIndex = Op->getOperand(1);
8492 // Peek through extends.
8493 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8494 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8495 ExtractedIndex = ExtractedIndex.getOperand(0);
8496 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8497 return SDValue();
8499 // If this is the first extract from the index vector candidate, set the
8500 // indices vector, otherwise verify the extract is from the previously
8501 // defined indices vector.
8502 if (!IndicesVec)
8503 IndicesVec = ExtractedIndex.getOperand(0);
8504 else if (IndicesVec != ExtractedIndex.getOperand(0))
8505 return SDValue();
8507 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8508 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8509 return SDValue();
8512 SDLoc DL(V);
8513 MVT VT = V.getSimpleValueType();
8514 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8517 SDValue
8518 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8519 SDLoc dl(Op);
8521 MVT VT = Op.getSimpleValueType();
8522 MVT EltVT = VT.getVectorElementType();
8523 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8524 unsigned NumElems = Op.getNumOperands();
8526 // Use dedicated lowering for vXi1 predicate vectors.
8527 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8528 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8530 if (VT.getVectorElementType() == MVT::bf16 &&
8531 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8532 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8534 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8535 return VectorConstant;
8537 unsigned EVTBits = EltVT.getSizeInBits();
8538 APInt UndefMask = APInt::getZero(NumElems);
8539 APInt FrozenUndefMask = APInt::getZero(NumElems);
8540 APInt ZeroMask = APInt::getZero(NumElems);
8541 APInt NonZeroMask = APInt::getZero(NumElems);
8542 bool IsAllConstants = true;
8543 bool OneUseFrozenUndefs = true;
8544 SmallSet<SDValue, 8> Values;
8545 unsigned NumConstants = NumElems;
8546 for (unsigned i = 0; i < NumElems; ++i) {
8547 SDValue Elt = Op.getOperand(i);
8548 if (Elt.isUndef()) {
8549 UndefMask.setBit(i);
8550 continue;
8552 if (ISD::isFreezeUndef(Elt.getNode())) {
8553 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8554 FrozenUndefMask.setBit(i);
8555 continue;
8557 Values.insert(Elt);
8558 if (!isIntOrFPConstant(Elt)) {
8559 IsAllConstants = false;
8560 NumConstants--;
8562 if (X86::isZeroNode(Elt)) {
8563 ZeroMask.setBit(i);
8564 } else {
8565 NonZeroMask.setBit(i);
8569 // All undef vector. Return an UNDEF.
8570 if (UndefMask.isAllOnes())
8571 return DAG.getUNDEF(VT);
8573 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8574 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8575 return DAG.getFreeze(DAG.getUNDEF(VT));
8577 // All undef/freeze(undef)/zero vector. Return a zero vector.
8578 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8579 return getZeroVector(VT, Subtarget, DAG, dl);
8581 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8582 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8583 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8584 // and blend the FREEZE-UNDEF operands back in.
8585 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
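// e.g. a v4i32 build_vector <x, freeze(undef), freeze(undef), y> becomes
// EltsBV = <x, undef, undef, y>, a freeze(undef) splat build_vector, and the
// blend mask <0, 5, 6, 3>, which selects the frozen lanes from the splat.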
8586 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8587 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8588 SmallVector<int, 16> BlendMask(NumElems, -1);
8589 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8590 for (unsigned i = 0; i < NumElems; ++i) {
8591 if (UndefMask[i]) {
8592 BlendMask[i] = -1;
8593 continue;
8595 BlendMask[i] = i;
8596 if (!FrozenUndefMask[i])
8597 Elts[i] = Op.getOperand(i);
8598 else
8599 BlendMask[i] += NumElems;
8601 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8602 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8603 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8604 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8607 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8609 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8610 // be better off lowering to a smaller build vector and padding with
8611 // undef/zero.
8612 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8613 !isFoldableUseOfShuffle(BV)) {
8614 unsigned UpperElems = NumElems / 2;
8615 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8616 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8617 if (NumUpperUndefsOrZeros >= UpperElems) {
8618 if (VT.is512BitVector() &&
8619 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8620 UpperElems = NumElems - (NumElems / 4);
8621 // If freeze(undef) is in any upper elements, force to zero.
8622 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8623 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8624 SDValue NewBV =
8625 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8626 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8630 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8631 return AddSub;
8632 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8633 return HorizontalOp;
8634 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8635 return Broadcast;
8636 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
8637 return BitOp;
8639 unsigned NumZero = ZeroMask.popcount();
8640 unsigned NumNonZero = NonZeroMask.popcount();
8642 // If we are inserting one variable into a vector of non-zero constants, try
8643 // to avoid loading each constant element as a scalar. Load the constants as a
8644 // vector and then insert the variable scalar element. If insertion is not
8645 // supported, fall back to a shuffle to get the scalar blended with the
8646 // constants. Insertion into a zero vector is handled as a special-case
8647 // somewhere below here.
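// e.g. building <1, 2, x, 4> becomes a constant-pool load of <1, 2, undef, 4>
// followed by an insertelement of the variable scalar x at index 2 (or, for
// the high elements of wider vectors, a shuffle as handled below).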
8648 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8649 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8650 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8651 // Create an all-constant vector. The variable element in the old
8652 // build vector is replaced by undef in the constant vector. Save the
8653 // variable scalar element and its index for use in the insertelement.
8654 LLVMContext &Context = *DAG.getContext();
8655 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8656 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8657 SDValue VarElt;
8658 SDValue InsIndex;
8659 for (unsigned i = 0; i != NumElems; ++i) {
8660 SDValue Elt = Op.getOperand(i);
8661 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8662 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8663 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8664 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8665 else if (!Elt.isUndef()) {
8666 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8667 "Expected one variable element in this vector");
8668 VarElt = Elt;
8669 InsIndex = DAG.getVectorIdxConstant(i, dl);
8672 Constant *CV = ConstantVector::get(ConstVecOps);
8673 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8675 // The constants we just created may not be legal (e.g., floating point). We
8676 // must lower the vector right here because we cannot guarantee that we'll
8677 // legalize it before loading it. This is also why we could not just create
8678 // a new build vector here. If the build vector contains illegal constants,
8679 // it could get split back up into a series of insert elements.
8680 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8681 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8682 MachineFunction &MF = DAG.getMachineFunction();
8683 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8684 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8685 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
8686 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8687 if (InsertC < NumEltsInLow128Bits)
8688 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8690 // There's no good way to insert into the high elements of a >128-bit
8691 // vector, so use shuffles to avoid an extract/insert sequence.
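// e.g. for v8i32 with InsertC == 5 the mask built below is
// <0, 1, 2, 3, 4, 8, 6, 7>, where element 8 selects the variable scalar from
// the SCALAR_TO_VECTOR node.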
8692 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8693 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8694 SmallVector<int, 8> ShuffleMask;
8695 unsigned NumElts = VT.getVectorNumElements();
8696 for (unsigned i = 0; i != NumElts; ++i)
8697 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8698 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8699 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8702 // Special case for a single non-zero, non-undef element.
8703 if (NumNonZero == 1) {
8704 unsigned Idx = NonZeroMask.countr_zero();
8705 SDValue Item = Op.getOperand(Idx);
8707 // If we have a constant or non-constant insertion into the low element of
8708 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8709 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8710 // depending on what the source datatype is.
8711 if (Idx == 0) {
8712 if (NumZero == 0)
8713 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8715 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8716 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8717 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8718 assert((VT.is128BitVector() || VT.is256BitVector() ||
8719 VT.is512BitVector()) &&
8720 "Expected an SSE value type!");
8721 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8722 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8723 // zero vector.
8724 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8727 // We can't directly insert an i8 or i16 into a vector, so zero extend
8728 // it to i32 first.
8729 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8730 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8731 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8732 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8733 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8734 return DAG.getBitcast(VT, Item);
8738 // Is it a vector logical left shift?
8739 if (NumElems == 2 && Idx == 1 &&
8740 X86::isZeroNode(Op.getOperand(0)) &&
8741 !X86::isZeroNode(Op.getOperand(1))) {
8742 unsigned NumBits = VT.getSizeInBits();
8743 return getVShift(true, VT,
8744 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8745 VT, Op.getOperand(1)),
8746 NumBits/2, DAG, *this, dl);
8749 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8750 return SDValue();
8752 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8753 // is a non-constant being inserted into an element other than the low one,
8754 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8755 // movd/movss) to move this into the low element, then shuffle it into
8756 // place.
8757 if (EVTBits == 32) {
8758 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8759 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8763 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8764 if (Values.size() == 1) {
8765 if (EVTBits == 32) {
8766 // Instead of a shuffle like this:
8767 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8768 // Check if it's possible to issue this instead.
8769 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8770 unsigned Idx = NonZeroMask.countr_zero();
8771 SDValue Item = Op.getOperand(Idx);
8772 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8773 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8775 return SDValue();
8778 // A vector full of immediates; various special cases are already
8779 // handled, so this is best done with a single constant-pool load.
8780 if (IsAllConstants)
8781 return SDValue();
8783 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8784 return V;
8786 // See if we can use a vector load to get all of the elements.
8788 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8789 if (SDValue LD =
8790 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8791 return LD;
8794 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8795 // build_vector and broadcast it.
8796 // TODO: We could probably generalize this more.
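// e.g. a v8i32 <a, b, a, b, a, b, a, b> becomes a v4i32 build_vector
// <a, b, u, u>, bitcast to v2i64, broadcast to v4i64, and bitcast back to
// v8i32.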
8797 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8798 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8799 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8800 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8801 // Make sure all the even/odd operands match.
8802 for (unsigned i = 2; i != NumElems; ++i)
8803 if (Ops[i % 2] != Op.getOperand(i))
8804 return false;
8805 return true;
8807 if (CanSplat(Op, NumElems, Ops)) {
8808 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8809 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8810 // Create a new build vector and cast to v2i64/v2f64.
8811 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8812 DAG.getBuildVector(NarrowVT, dl, Ops));
8813 // Broadcast from v2i64/v2f64 and cast to final VT.
8814 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
8815 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8816 NewBV));
8820 // For AVX-length vectors, build the individual 128-bit pieces and use
8821 // shuffles to put them in place.
8822 if (VT.getSizeInBits() > 128) {
8823 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
8825 // Build both the lower and upper subvector.
8826 SDValue Lower =
8827 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8828 SDValue Upper = DAG.getBuildVector(
8829 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
8831 // Recreate the wider vector with the lower and upper part.
8832 return concatSubVectors(Lower, Upper, DAG, dl);
8835 // Let legalizer expand 2-wide build_vectors.
8836 if (EVTBits == 64) {
8837 if (NumNonZero == 1) {
8838 // One half is zero or undef.
8839 unsigned Idx = NonZeroMask.countr_zero();
8840 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8841 Op.getOperand(Idx));
8842 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8844 return SDValue();
8847 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8848 if (EVTBits == 8 && NumElems == 16)
8849 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
8850 DAG, Subtarget))
8851 return V;
8853 if (EltVT == MVT::i16 && NumElems == 8)
8854 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
8855 DAG, Subtarget))
8856 return V;
8858 // If element VT is 32 bits and the vector has 4 elements, try an INSERTPS.
8859 if (EVTBits == 32 && NumElems == 4)
8860 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8861 return V;
8863 // If element VT is == 32 bits, turn it into a number of shuffles.
8864 if (NumElems == 4 && NumZero > 0) {
8865 SmallVector<SDValue, 8> Ops(NumElems);
8866 for (unsigned i = 0; i < 4; ++i) {
8867 bool isZero = !NonZeroMask[i];
8868 if (isZero)
8869 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8870 else
8871 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8874 for (unsigned i = 0; i < 2; ++i) {
8875 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
8876 default: llvm_unreachable("Unexpected NonZero count");
8877 case 0:
8878 Ops[i] = Ops[i*2]; // Must be a zero vector.
8879 break;
8880 case 1:
8881 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8882 break;
8883 case 2:
8884 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8885 break;
8886 case 3:
8887 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8888 break;
8892 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
8893 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
8894 int MaskVec[] = {
8895 Reverse1 ? 1 : 0,
8896 Reverse1 ? 0 : 1,
8897 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8898 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8900 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8903 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8905 // Check for a build vector from mostly shuffle plus few inserting.
8906 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8907 return Sh;
8909 // For SSE 4.1, use insertps to put the high elements into the low element.
8910 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
8911 SDValue Result;
8912 if (!Op.getOperand(0).isUndef())
8913 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8914 else
8915 Result = DAG.getUNDEF(VT);
8917 for (unsigned i = 1; i < NumElems; ++i) {
8918 if (Op.getOperand(i).isUndef()) continue;
8919 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8920 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8922 return Result;
8925 // Otherwise, expand into a number of unpckl*, start by extending each of
8926 // our (non-undef) elements to the full vector width with the element in the
8927 // bottom slot of the vector (which generates no code for SSE).
8928 SmallVector<SDValue, 8> Ops(NumElems);
8929 for (unsigned i = 0; i < NumElems; ++i) {
8930 if (!Op.getOperand(i).isUndef())
8931 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8932 else
8933 Ops[i] = DAG.getUNDEF(VT);
8936 // Next, we iteratively mix elements, e.g. for v4f32:
8937 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8938 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8939 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
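// e.g. for v4f32 this builds Mask = <0, 4, u, u> on the first iteration
// (Scale == 1) and Mask = <0, 1, 4, 5> on the second (Scale == 2).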
8940 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8941 // Generate scaled UNPCKL shuffle mask.
8942 SmallVector<int, 16> Mask;
8943 for (unsigned i = 0; i != Scale; ++i)
8944 Mask.push_back(i);
8945 for (unsigned i = 0; i != Scale; ++i)
8946 Mask.push_back(NumElems+i);
8947 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8949 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8950 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8952 return Ops[0];
8955 // 256-bit AVX can use the vinsertf128 instruction
8956 // to create 256-bit vectors from two other 128-bit ones.
8957 // TODO: Detect subvector broadcast here instead of DAG combine?
8958 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8959 const X86Subtarget &Subtarget) {
8960 SDLoc dl(Op);
8961 MVT ResVT = Op.getSimpleValueType();
8963 assert((ResVT.is256BitVector() ||
8964 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8966 unsigned NumOperands = Op.getNumOperands();
8967 unsigned NumFreezeUndef = 0;
8968 unsigned NumZero = 0;
8969 unsigned NumNonZero = 0;
8970 unsigned NonZeros = 0;
8971 for (unsigned i = 0; i != NumOperands; ++i) {
8972 SDValue SubVec = Op.getOperand(i);
8973 if (SubVec.isUndef())
8974 continue;
8975 if (ISD::isFreezeUndef(SubVec.getNode())) {
8976 // If the freeze(undef) has multiple uses then we must fold to zero.
8977 if (SubVec.hasOneUse())
8978 ++NumFreezeUndef;
8979 else
8980 ++NumZero;
8982 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8983 ++NumZero;
8984 else {
8985 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8986 NonZeros |= 1 << i;
8987 ++NumNonZero;
8991 // If we have more than 2 non-zeros, build each half separately.
8992 if (NumNonZero > 2) {
8993 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
8994 ArrayRef<SDUse> Ops = Op->ops();
8995 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8996 Ops.slice(0, NumOperands/2));
8997 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8998 Ops.slice(NumOperands/2));
8999 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9002 // Otherwise, build it up through insert_subvectors.
9003 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9004 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9005 : DAG.getUNDEF(ResVT));
9007 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9008 unsigned NumSubElems = SubVT.getVectorNumElements();
9009 for (unsigned i = 0; i != NumOperands; ++i) {
9010 if ((NonZeros & (1 << i)) == 0)
9011 continue;
9013 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9014 Op.getOperand(i),
9015 DAG.getIntPtrConstant(i * NumSubElems, dl));
9018 return Vec;
9021 // Returns true if the given node is a type promotion (by concatenating i1
9022 // zeros) of the result of a node that already zeros all upper bits of
9023 // k-register.
9024 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
9025 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9026 const X86Subtarget &Subtarget,
9027 SelectionDAG &DAG) {
9028 SDLoc dl(Op);
9029 MVT ResVT = Op.getSimpleValueType();
9030 unsigned NumOperands = Op.getNumOperands();
9032 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9033 "Unexpected number of operands in CONCAT_VECTORS");
9035 uint64_t Zeros = 0;
9036 uint64_t NonZeros = 0;
9037 for (unsigned i = 0; i != NumOperands; ++i) {
9038 SDValue SubVec = Op.getOperand(i);
9039 if (SubVec.isUndef())
9040 continue;
9041 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9042 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9043 Zeros |= (uint64_t)1 << i;
9044 else
9045 NonZeros |= (uint64_t)1 << i;
9048 unsigned NumElems = ResVT.getVectorNumElements();
9050 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9051 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9052 // insert_subvector would give us two kshifts.
9053 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9054 Log2_64(NonZeros) != NumOperands - 1) {
9055 unsigned Idx = Log2_64(NonZeros);
9056 SDValue SubVec = Op.getOperand(Idx);
9057 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9058 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9059 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9060 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9061 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9063 DAG.getIntPtrConstant(0, dl));
9066 // If there are zero or one non-zeros we can handle this very simply.
9067 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9068 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9069 if (!NonZeros)
9070 return Vec;
9071 unsigned Idx = Log2_64(NonZeros);
9072 SDValue SubVec = Op.getOperand(Idx);
9073 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9074 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9075 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9078 if (NumOperands > 2) {
9079 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9080 ArrayRef<SDUse> Ops = Op->ops();
9081 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9082 Ops.slice(0, NumOperands/2));
9083 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9084 Ops.slice(NumOperands/2));
9085 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9088 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9090 if (ResVT.getVectorNumElements() >= 16)
9091 return Op; // The operation is legal with KUNPCK
9093 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9094 DAG.getUNDEF(ResVT), Op.getOperand(0),
9095 DAG.getIntPtrConstant(0, dl));
9096 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9097 DAG.getIntPtrConstant(NumElems/2, dl));
9100 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9101 const X86Subtarget &Subtarget,
9102 SelectionDAG &DAG) {
9103 MVT VT = Op.getSimpleValueType();
9104 if (VT.getVectorElementType() == MVT::i1)
9105 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9107 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9108 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9109 Op.getNumOperands() == 4)));
9111 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9112 // from two other 128-bit ones.
9114 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9115 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9118 //===----------------------------------------------------------------------===//
9119 // Vector shuffle lowering
9121 // This is an experimental code path for lowering vector shuffles on x86. It is
9122 // designed to handle arbitrary vector shuffles and blends, gracefully
9123 // degrading performance as necessary. It works hard to recognize idiomatic
9124 // shuffles and lower them to optimal instruction patterns without leaving
9125 // a framework that allows reasonably efficient handling of all vector shuffle
9126 // patterns.
9127 //===----------------------------------------------------------------------===//
9129 /// Tiny helper function to identify a no-op mask.
9131 /// This is a somewhat boring predicate function. It checks whether the mask
9132 /// array input, which is assumed to be a single-input shuffle mask of the kind
9133 /// used by the X86 shuffle instructions (not a fully general
9134 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
9135 /// in-place shuffle are 'no-op's.
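/// e.g. <-1, 1, -1, 3> is a no-op mask, while <1, 0, 2, 3> is not.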
9136 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9137 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9138 assert(Mask[i] >= -1 && "Out of bound mask element!");
9139 if (Mask[i] >= 0 && Mask[i] != i)
9140 return false;
9142 return true;
9145 /// Test whether there are elements crossing LaneSizeInBits lanes in this
9146 /// shuffle mask.
9148 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9149 /// and we routinely test for these.
9150 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9151 unsigned ScalarSizeInBits,
9152 ArrayRef<int> Mask) {
9153 assert(LaneSizeInBits && ScalarSizeInBits &&
9154 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9155 "Illegal shuffle lane size");
9156 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9157 int Size = Mask.size();
9158 for (int i = 0; i < Size; ++i)
9159 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9160 return true;
9161 return false;
9164 /// Test whether there are elements crossing 128-bit lanes in this
9165 /// shuffle mask.
9166 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9167 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9170 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9171 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9172 /// better support 'repeated mask + lane permute' style shuffles.
9173 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9174 unsigned ScalarSizeInBits,
9175 ArrayRef<int> Mask) {
9176 assert(LaneSizeInBits && ScalarSizeInBits &&
9177 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9178 "Illegal shuffle lane size");
9179 int NumElts = Mask.size();
9180 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9181 int NumLanes = NumElts / NumEltsPerLane;
9182 if (NumLanes > 1) {
9183 for (int i = 0; i != NumLanes; ++i) {
9184 int SrcLane = -1;
9185 for (int j = 0; j != NumEltsPerLane; ++j) {
9186 int M = Mask[(i * NumEltsPerLane) + j];
9187 if (M < 0)
9188 continue;
9189 int Lane = (M % NumElts) / NumEltsPerLane;
9190 if (SrcLane >= 0 && SrcLane != Lane)
9191 return true;
9192 SrcLane = Lane;
9196 return false;
9199 /// Test whether a shuffle mask is equivalent within each sub-lane.
9201 /// This checks a shuffle mask to see if it is performing the same
9202 /// lane-relative shuffle in each sub-lane. This trivially implies
9203 /// that it is also not lane-crossing. It may however involve a blend from the
9204 /// same lane of a second vector.
9206 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9207 /// non-trivial to compute in the face of undef lanes. The representation is
9208 /// suitable for use with existing 128-bit shuffles as entries from the second
9209 /// vector have been remapped to [LaneSize, 2*LaneSize).
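/// e.g. for v8f32 the mask <0, 9, 2, 11, 4, 13, 6, 15> repeats in both 128-bit
/// lanes and yields RepeatedMask = <0, 5, 2, 7>.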
9210 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9211 ArrayRef<int> Mask,
9212 SmallVectorImpl<int> &RepeatedMask) {
9213 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9214 RepeatedMask.assign(LaneSize, -1);
9215 int Size = Mask.size();
9216 for (int i = 0; i < Size; ++i) {
9217 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9218 if (Mask[i] < 0)
9219 continue;
9220 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9221 // This entry crosses lanes, so there is no way to model this shuffle.
9222 return false;
9224 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9225 // Adjust second vector indices to start at LaneSize instead of Size.
9226 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9227 : Mask[i] % LaneSize + LaneSize;
9228 if (RepeatedMask[i % LaneSize] < 0)
9229 // This is the first non-undef entry in this slot of a 128-bit lane.
9230 RepeatedMask[i % LaneSize] = LocalM;
9231 else if (RepeatedMask[i % LaneSize] != LocalM)
9232 // Found a mismatch with the repeated mask.
9233 return false;
9235 return true;
9238 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9239 static bool
9240 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9241 SmallVectorImpl<int> &RepeatedMask) {
9242 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9245 static bool
9246 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9247 SmallVector<int, 32> RepeatedMask;
9248 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9251 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9252 static bool
9253 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9254 SmallVectorImpl<int> &RepeatedMask) {
9255 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9258 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9259 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9260 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9261 unsigned EltSizeInBits,
9262 ArrayRef<int> Mask,
9263 SmallVectorImpl<int> &RepeatedMask) {
9264 int LaneSize = LaneSizeInBits / EltSizeInBits;
9265 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9266 int Size = Mask.size();
9267 for (int i = 0; i < Size; ++i) {
9268 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9269 if (Mask[i] == SM_SentinelUndef)
9270 continue;
9271 if (Mask[i] == SM_SentinelZero) {
9272 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9273 return false;
9274 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9275 continue;
9277 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9278 // This entry crosses lanes, so there is no way to model this shuffle.
9279 return false;
9281 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9282 // later vector indices to start at multiples of LaneSize instead of Size.
9283 int LaneM = Mask[i] / Size;
9284 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9285 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9286 // This is the first non-undef entry in this slot of a 128-bit lane.
9287 RepeatedMask[i % LaneSize] = LocalM;
9288 else if (RepeatedMask[i % LaneSize] != LocalM)
9289 // Found a mismatch with the repeated mask.
9290 return false;
9292 return true;
9295 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9296 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9297 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9298 ArrayRef<int> Mask,
9299 SmallVectorImpl<int> &RepeatedMask) {
9300 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9301 Mask, RepeatedMask);
9304 /// Checks whether the vector elements referenced by two shuffle masks are
9305 /// equivalent.
9306 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9307 int Idx, int ExpectedIdx) {
9308 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9309 ExpectedIdx < MaskSize && "Out of range element index");
9310 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9311 return false;
9313 switch (Op.getOpcode()) {
9314 case ISD::BUILD_VECTOR:
9315 // If the values are build vectors, we can look through them to find
9316 // equivalent inputs that make the shuffles equivalent.
9317 // TODO: Handle MaskSize != Op.getNumOperands()?
9318 if (MaskSize == (int)Op.getNumOperands() &&
9319 MaskSize == (int)ExpectedOp.getNumOperands())
9320 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9321 break;
9322 case X86ISD::VBROADCAST:
9323 case X86ISD::VBROADCAST_LOAD:
9324 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9325 return (Op == ExpectedOp &&
9326 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9327 case X86ISD::HADD:
9328 case X86ISD::HSUB:
9329 case X86ISD::FHADD:
9330 case X86ISD::FHSUB:
9331 case X86ISD::PACKSS:
9332 case X86ISD::PACKUS:
9333 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9334 // TODO: Handle MaskSize != NumElts?
9335 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9336 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9337 MVT VT = Op.getSimpleValueType();
9338 int NumElts = VT.getVectorNumElements();
9339 if (MaskSize == NumElts) {
9340 int NumLanes = VT.getSizeInBits() / 128;
9341 int NumEltsPerLane = NumElts / NumLanes;
9342 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9343 bool SameLane =
9344 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9345 bool SameElt =
9346 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9347 return SameLane && SameElt;
9350 break;
9353 return false;
9356 /// Checks whether a shuffle mask is equivalent to an explicit list of
9357 /// arguments.
9359 /// This is a fast way to test a shuffle mask against a fixed pattern:
9361 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9363 /// It returns true if the mask is exactly as wide as the argument list, and
9364 /// each element of the mask is either -1 (signifying undef) or the value given
9365 /// in the argument.
9366 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9367 SDValue V1 = SDValue(),
9368 SDValue V2 = SDValue()) {
9369 int Size = Mask.size();
9370 if (Size != (int)ExpectedMask.size())
9371 return false;
9373 for (int i = 0; i < Size; ++i) {
9374 assert(Mask[i] >= -1 && "Out of bound mask element!");
9375 int MaskIdx = Mask[i];
9376 int ExpectedIdx = ExpectedMask[i];
9377 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9378 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9379 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9380 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9381 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9382 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9383 return false;
9386 return true;
9389 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9391 /// The masks must be exactly the same width.
9393 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9394 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9396 /// SM_SentinelZero is accepted as a valid negative index but must match in
9397 /// both, or via a known bits test.
9398 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9399 ArrayRef<int> ExpectedMask,
9400 const SelectionDAG &DAG,
9401 SDValue V1 = SDValue(),
9402 SDValue V2 = SDValue()) {
9403 int Size = Mask.size();
9404 if (Size != (int)ExpectedMask.size())
9405 return false;
9406 assert(llvm::all_of(ExpectedMask,
9407 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9408 "Illegal target shuffle mask");
9410 // Check for out-of-range target shuffle mask indices.
9411 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9412 return false;
9414 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9415 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9416 !V1.getValueType().isVector()))
9417 V1 = SDValue();
9418 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9419 !V2.getValueType().isVector()))
9420 V2 = SDValue();
9422 APInt ZeroV1 = APInt::getZero(Size);
9423 APInt ZeroV2 = APInt::getZero(Size);
9425 for (int i = 0; i < Size; ++i) {
9426 int MaskIdx = Mask[i];
9427 int ExpectedIdx = ExpectedMask[i];
9428 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9429 continue;
9430 if (MaskIdx == SM_SentinelZero) {
9431 // If we need this expected index to be a zero element, then update the
9432 // relevant zero mask and perform the known bits at the end to minimize
9433 // repeated computes.
9434 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9435 if (ExpectedV &&
9436 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9437 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9438 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9439 ZeroMask.setBit(BitIdx);
9440 continue;
9443 if (MaskIdx >= 0) {
9444 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9445 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9446 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9447 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9448 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9449 continue;
9451 return false;
9453 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9454 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9457 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9458 // instructions.
9459 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9460 const SelectionDAG &DAG) {
9461 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9462 return false;
9464 SmallVector<int, 8> Unpcklwd;
9465 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9466 /* Unary = */ false);
9467 SmallVector<int, 8> Unpckhwd;
9468 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9469 /* Unary = */ false);
9470 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9471 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9472 return IsUnpackwdMask;
9475 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9476 const SelectionDAG &DAG) {
9477 // Create 128-bit vector type based on mask size.
9478 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9479 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9481 // We can't assume a canonical shuffle mask, so try the commuted version too.
9482 SmallVector<int, 4> CommutedMask(Mask);
9483 ShuffleVectorSDNode::commuteMask(CommutedMask);
9485 // Match any of unary/binary or low/high.
9486 for (unsigned i = 0; i != 4; ++i) {
9487 SmallVector<int, 16> UnpackMask;
9488 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9489 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9490 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9491 return true;
9493 return false;
9496 /// Return true if a shuffle mask chooses elements identically in its top and
9497 /// bottom halves. For example, any splat mask has the same top and bottom
9498 /// halves. If an element is undefined in only one half of the mask, the halves
9499 /// are not considered identical.
9500 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9501 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9502 unsigned HalfSize = Mask.size() / 2;
9503 for (unsigned i = 0; i != HalfSize; ++i) {
9504 if (Mask[i] != Mask[i + HalfSize])
9505 return false;
9507 return true;
9510 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9512 /// This helper function produces an 8-bit shuffle immediate corresponding to
9513 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9514 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9515 /// example.
9517 /// NB: We rely heavily on "undef" masks preserving the input lane.
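/// e.g. Mask = <0, 1, 2, 3> encodes to (3 << 6) | (2 << 4) | (1 << 2) | 0 =
/// 0xE4 (the identity immediate), and <2, 3, 0, 1> encodes to 0x4E.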
9518 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9519 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9520 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9521 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9522 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9523 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9525 // If the mask only uses one non-undef element, then fully 'splat' it to
9526 // improve later broadcast matching.
9527 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9528 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9530 int FirstElt = Mask[FirstIndex];
9531 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9532 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9534 unsigned Imm = 0;
9535 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9536 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9537 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9538 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9539 return Imm;
9542 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9543 SelectionDAG &DAG) {
9544 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9547 // The shuffle result has the form:
9548 // 0* a[0] 0* a[1] ... 0* a[n], n >= 0, where 0* denotes zero or more zero
9549 // elements and the a[] elements appear in ascending order. Each Zeroable
9550 // element corresponds to a Mask element (see computeZeroableShuffleElements).
9552 // The function looks for a sub-mask whose non-zero elements are in
9553 // increasing order; if such a sub-mask exists, it returns true.
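// e.g. for v8i32 with Mask = <0, 0, 8, 9, 0, 10, 11, 12> and Zeroable set for
// elements 0, 1 and 4, the non-zero elements 8..12 appear in increasing order,
// so this returns true with IsZeroSideLeft == true.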
9554 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9555 ArrayRef<int> Mask, const EVT &VectorType,
9556 bool &IsZeroSideLeft) {
9557 int NextElement = -1;
9558 // Check if the Mask's nonzero elements are in increasing order.
9559 for (int i = 0, e = Mask.size(); i < e; i++) {
9560 // Checks if the mask's zeros elements are built from only zeros.
9561 assert(Mask[i] >= -1 && "Out of bound mask element!");
9562 if (Mask[i] < 0)
9563 return false;
9564 if (Zeroable[i])
9565 continue;
9566 // Find the lowest non zero element
9567 if (NextElement < 0) {
9568 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9569 IsZeroSideLeft = NextElement != 0;
9571 // Exit if the mask's non zero elements are not in increasing order.
9572 if (NextElement != Mask[i])
9573 return false;
9574 NextElement++;
9576 return true;
9579 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9580 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9581 ArrayRef<int> Mask, SDValue V1,
9582 SDValue V2, const APInt &Zeroable,
9583 const X86Subtarget &Subtarget,
9584 SelectionDAG &DAG) {
9585 int Size = Mask.size();
9586 int LaneSize = 128 / VT.getScalarSizeInBits();
9587 const int NumBytes = VT.getSizeInBits() / 8;
9588 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9590 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9591 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9592 (Subtarget.hasBWI() && VT.is512BitVector()));
9594 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9595 // Sign bit set in i8 mask means zero element.
9596 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9598 SDValue V;
9599 for (int i = 0; i < NumBytes; ++i) {
9600 int M = Mask[i / NumEltBytes];
9601 if (M < 0) {
9602 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9603 continue;
9605 if (Zeroable[i / NumEltBytes]) {
9606 PSHUFBMask[i] = ZeroMask;
9607 continue;
9610 // We can only use a single input of V1 or V2.
9611 SDValue SrcV = (M >= Size ? V2 : V1);
9612 if (V && V != SrcV)
9613 return SDValue();
9614 V = SrcV;
9615 M %= Size;
9617 // PSHUFB can't cross lanes, ensure this doesn't happen.
9618 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9619 return SDValue();
9621 M = M % LaneSize;
9622 M = M * NumEltBytes + (i % NumEltBytes);
9623 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9625 assert(V && "Failed to find a source input");
9627 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9628 return DAG.getBitcast(
9629 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9630 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9633 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9634 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9635 const SDLoc &dl);
9637 // X86 has dedicated shuffle that can be lowered to VEXPAND
9638 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9639 const APInt &Zeroable,
9640 ArrayRef<int> Mask, SDValue &V1,
9641 SDValue &V2, SelectionDAG &DAG,
9642 const X86Subtarget &Subtarget) {
9643 bool IsLeftZeroSide = true;
9644 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9645 IsLeftZeroSide))
9646 return SDValue();
9647 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9648 MVT IntegerType =
9649 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9650 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9651 unsigned NumElts = VT.getVectorNumElements();
9652 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9653 "Unexpected number of vector elements");
9654 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9655 Subtarget, DAG, DL);
9656 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9657 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9658 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9661 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9662 unsigned &UnpackOpcode, bool IsUnary,
9663 ArrayRef<int> TargetMask, const SDLoc &DL,
9664 SelectionDAG &DAG,
9665 const X86Subtarget &Subtarget) {
9666 int NumElts = VT.getVectorNumElements();
9668 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9669 for (int i = 0; i != NumElts; i += 2) {
9670 int M1 = TargetMask[i + 0];
9671 int M2 = TargetMask[i + 1];
9672 Undef1 &= (SM_SentinelUndef == M1);
9673 Undef2 &= (SM_SentinelUndef == M2);
9674 Zero1 &= isUndefOrZero(M1);
9675 Zero2 &= isUndefOrZero(M2);
9677 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9678 "Zeroable shuffle detected");
9680 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9681 SmallVector<int, 64> Unpckl, Unpckh;
9682 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9683 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9684 (IsUnary ? V1 : V2))) {
9685 UnpackOpcode = X86ISD::UNPCKL;
9686 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9687 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9688 return true;
9691 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9692 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9693 (IsUnary ? V1 : V2))) {
9694 UnpackOpcode = X86ISD::UNPCKH;
9695 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9696 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9697 return true;
9700 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9701 if (IsUnary && (Zero1 || Zero2)) {
9702 // Don't bother if we can blend instead.
9703 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9704 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9705 return false;
9707 bool MatchLo = true, MatchHi = true;
9708 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9709 int M = TargetMask[i];
9711 // Ignore if the input is known to be zero or the index is undef.
9712 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9713 (M == SM_SentinelUndef))
9714 continue;
9716 MatchLo &= (M == Unpckl[i]);
9717 MatchHi &= (M == Unpckh[i]);
9720 if (MatchLo || MatchHi) {
9721 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9722 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9723 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9724 return true;
9728 // If a binary shuffle, commute and try again.
9729 if (!IsUnary) {
9730 ShuffleVectorSDNode::commuteMask(Unpckl);
9731 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9732 UnpackOpcode = X86ISD::UNPCKL;
9733 std::swap(V1, V2);
9734 return true;
9737 ShuffleVectorSDNode::commuteMask(Unpckh);
9738 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9739 UnpackOpcode = X86ISD::UNPCKH;
9740 std::swap(V1, V2);
9741 return true;
9745 return false;
9748 // X86 has dedicated unpack instructions that can handle specific blend
9749 // operations: UNPCKH and UNPCKL.
9750 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9751 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9752 SelectionDAG &DAG) {
9753 SmallVector<int, 8> Unpckl;
9754 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9755 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9756 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9758 SmallVector<int, 8> Unpckh;
9759 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9760 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9761 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9763 // Commute and try again.
9764 ShuffleVectorSDNode::commuteMask(Unpckl);
9765 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9766 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9768 ShuffleVectorSDNode::commuteMask(Unpckh);
9769 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9770 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9772 return SDValue();
9775 /// Check if the mask can be lowered as a preliminary permute of 64-bit
9776 /// elements (VPERMQ/VPERMPD) followed by a 256-bit unpack.
9777 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
9778 ArrayRef<int> Mask, SDValue V1,
9779 SDValue V2, SelectionDAG &DAG) {
9780 SmallVector<int, 32> Unpckl, Unpckh;
9781 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
9782 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
9784 unsigned UnpackOpcode;
9785 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9786 UnpackOpcode = X86ISD::UNPCKL;
9787 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9788 UnpackOpcode = X86ISD::UNPCKH;
9789 else
9790 return SDValue();
9792 // This is a "natural" unpack operation (rather than the 128-bit sectored
9793 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
9794 // input in order to use the x86 instruction.
9795 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
9796 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
9797 V1 = DAG.getBitcast(VT, V1);
9798 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
9801 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
9802 // source into the lower elements and zeroing the upper elements.
9803 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
9804 ArrayRef<int> Mask, const APInt &Zeroable,
9805 const X86Subtarget &Subtarget) {
9806 if (!VT.is512BitVector() && !Subtarget.hasVLX())
9807 return false;
9809 unsigned NumElts = Mask.size();
9810 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9811 unsigned MaxScale = 64 / EltSizeInBits;
9813 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9814 unsigned SrcEltBits = EltSizeInBits * Scale;
9815 if (SrcEltBits < 32 && !Subtarget.hasBWI())
9816 continue;
9817 unsigned NumSrcElts = NumElts / Scale;
9818 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
9819 continue;
9820 unsigned UpperElts = NumElts - NumSrcElts;
9821 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9822 continue;
9823 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
9824 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
9825 DstVT = MVT::getIntegerVT(EltSizeInBits);
9826 if ((NumSrcElts * EltSizeInBits) >= 128) {
9827 // ISD::TRUNCATE
9828 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
9829 } else {
9830 // X86ISD::VTRUNC
9831 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
9833 return true;
9836 return false;
9839 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
9840 // element padding to the final DstVT.
9841 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
9842 const X86Subtarget &Subtarget,
9843 SelectionDAG &DAG, bool ZeroUppers) {
9844 MVT SrcVT = Src.getSimpleValueType();
9845 MVT DstSVT = DstVT.getScalarType();
9846 unsigned NumDstElts = DstVT.getVectorNumElements();
9847 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9848 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
9850 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
9851 return SDValue();
9853 // Perform a direct ISD::TRUNCATE if possible.
9854 if (NumSrcElts == NumDstElts)
9855 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
9857 if (NumSrcElts > NumDstElts) {
9858 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
9859 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
9860 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
9863 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
9864 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
9865 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
9866 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
9867 DstVT.getSizeInBits());
9870 // Non-VLX targets must truncate from a 512-bit type, so we need to
9871 // widen, truncate and then possibly extract the original subvector.
9872 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
9873 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
9874 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
9877 // Fallback to a X86ISD::VTRUNC, padding if necessary.
9878 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
9879 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
9880 if (DstVT != TruncVT)
9881 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
9882 DstVT.getSizeInBits());
9883 return Trunc;
9886 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9888 // An example is the following:
9890 // t0: ch = EntryToken
9891 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9892 // t25: v4i32 = truncate t2
9893 // t41: v8i16 = bitcast t25
9894 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9895 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9896 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9897 // t18: v2i64 = bitcast t51
9899 // A single vpmovdw instruction suffices; without AVX512VL we need to use the
9900 // zmm variant and extract the lower subvector, padding with zeroes.
9901 // TODO: Merge with lowerShuffleAsVTRUNC.
9902 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
9903 SDValue V2, ArrayRef<int> Mask,
9904 const APInt &Zeroable,
9905 const X86Subtarget &Subtarget,
9906 SelectionDAG &DAG) {
9907 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
9908 if (!Subtarget.hasAVX512())
9909 return SDValue();
9911 unsigned NumElts = VT.getVectorNumElements();
9912 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9913 unsigned MaxScale = 64 / EltSizeInBits;
9914 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9915 unsigned SrcEltBits = EltSizeInBits * Scale;
9916 unsigned NumSrcElts = NumElts / Scale;
9917 unsigned UpperElts = NumElts - NumSrcElts;
9918 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
9919 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9920 continue;
9922 // Attempt to find a matching source truncation, but as a fall back VLX
9923 // cases can use the VPMOV directly.
9924 SDValue Src = peekThroughBitcasts(V1);
9925 if (Src.getOpcode() == ISD::TRUNCATE &&
9926 Src.getScalarValueSizeInBits() == SrcEltBits) {
9927 Src = Src.getOperand(0);
9928 } else if (Subtarget.hasVLX()) {
9929 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
9930 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
9931 Src = DAG.getBitcast(SrcVT, Src);
9932 // Don't do this if PACKSS/PACKUS could perform it cheaper.
9933 if (Scale == 2 &&
9934 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
9935 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
9936 return SDValue();
9937 } else
9938 return SDValue();
9940 // VPMOVWB is only available with avx512bw.
9941 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
9942 return SDValue();
9944 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
9945 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
9948 return SDValue();
9951 // Attempt to match binary shuffle patterns as a truncate.
9952 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
9953 SDValue V2, ArrayRef<int> Mask,
9954 const APInt &Zeroable,
9955 const X86Subtarget &Subtarget,
9956 SelectionDAG &DAG) {
9957 assert((VT.is128BitVector() || VT.is256BitVector()) &&
9958 "Unexpected VTRUNC type");
9959 if (!Subtarget.hasAVX512())
9960 return SDValue();
9962 unsigned NumElts = VT.getVectorNumElements();
9963 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9964 unsigned MaxScale = 64 / EltSizeInBits;
9965 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9966 // TODO: Support non-BWI VPMOVWB truncations?
9967 unsigned SrcEltBits = EltSizeInBits * Scale;
9968 if (SrcEltBits < 32 && !Subtarget.hasBWI())
9969 continue;
9971 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
9972 // Bail if the V2 elements are undef.
9973 unsigned NumHalfSrcElts = NumElts / Scale;
9974 unsigned NumSrcElts = 2 * NumHalfSrcElts;
9975 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
9976 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
9977 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
9978 continue;
9980 // The elements beyond the truncation must be undef/zero.
9981 unsigned UpperElts = NumElts - NumSrcElts;
9982 if (UpperElts > 0 &&
9983 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9984 continue;
9985 bool UndefUppers =
9986 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
9988 // For offset truncations, ensure that the concat is cheap.
9989 if (Offset) {
9990 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
9991 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
9992 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
9993 return Lo.getOperand(0) == Hi.getOperand(0);
9994 if (ISD::isNormalLoad(Lo.getNode()) &&
9995 ISD::isNormalLoad(Hi.getNode())) {
9996 auto *LDLo = cast<LoadSDNode>(Lo);
9997 auto *LDHi = cast<LoadSDNode>(Hi);
9998 return DAG.areNonVolatileConsecutiveLoads(
9999 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10001 return false;
10003 if (!IsCheapConcat(V1, V2))
10004 continue;
10007       // As we're using both sources, we need to concat them together
10008       // and truncate from the double-sized source.
10009 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10010 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10012 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10013 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10014 Src = DAG.getBitcast(SrcVT, Src);
10016 // Shift the offset'd elements into place for the truncation.
10017 // TODO: Use getTargetVShiftByConstNode.
10018 if (Offset)
10019 Src = DAG.getNode(
10020 X86ISD::VSRLI, DL, SrcVT, Src,
10021 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10023 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10027 return SDValue();
10030 /// Check whether a compaction lowering can be done by dropping even/odd
10031 /// elements and compute how many times even/odd elements must be dropped.
10033 /// This handles shuffles which take every Nth element where N is a power of
10034 /// two. Example shuffle masks:
10036 /// (even)
10037 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10038 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10039 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10040 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10041 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10042 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10044 /// (odd)
10045 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10046 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10048 /// Any of these lanes can of course be undef.
10050 /// This routine only supports N <= 3.
10051 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10052 /// for larger N.
10054 /// \returns N above, or the number of times even/odd elements must be dropped
10055 /// if there is such a number. Otherwise returns zero.
10056 static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10057 bool IsSingleInput) {
10058 // The modulus for the shuffle vector entries is based on whether this is
10059 // a single input or not.
10060 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10061 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10062 "We should only be called with masks with a power-of-2 size!");
10064 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10065 int Offset = MatchEven ? 0 : 1;
10067 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10068 // and 2^3 simultaneously. This is because we may have ambiguity with
10069 // partially undef inputs.
10070 bool ViableForN[3] = {true, true, true};
10072 for (int i = 0, e = Mask.size(); i < e; ++i) {
10073     // Ignore undef lanes; we'll optimistically collapse them to the pattern we
10074 // want.
10075 if (Mask[i] < 0)
10076 continue;
10078 bool IsAnyViable = false;
10079 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10080 if (ViableForN[j]) {
10081 uint64_t N = j + 1;
10083 // The shuffle mask must be equal to (i * 2^N) % M.
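// e.g. for the even (Offset == 0), single-input, 16-element case the N == 2
// pattern above is exactly (i * 4) % 16: 0, 4, 8, 12, 0, 4, 8, 12, ...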
10084 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10085 IsAnyViable = true;
10086 else
10087 ViableForN[j] = false;
10089 // Early exit if we exhaust the possible powers of two.
10090 if (!IsAnyViable)
10091 break;
10094 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10095 if (ViableForN[j])
10096 return j + 1;
10098 // Return 0 as there is no viable power of two.
10099 return 0;
10102 // X86 has dedicated pack instructions that can handle specific truncation
10103 // operations: PACKSS and PACKUS.
10104 // Checks for compaction shuffle masks if MaxStages > 1.
10105 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
10106 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10107 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10108 const SelectionDAG &DAG,
10109 const X86Subtarget &Subtarget,
10110 unsigned MaxStages = 1) {
10111 unsigned NumElts = VT.getVectorNumElements();
10112 unsigned BitSize = VT.getScalarSizeInBits();
10113 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10114 "Illegal maximum compaction");
10116 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10117 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10118 unsigned NumPackedBits = NumSrcBits - BitSize;
10119 N1 = peekThroughBitcasts(N1);
10120 N2 = peekThroughBitcasts(N2);
10121 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10122 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10123 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10124 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10125 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10126 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10127 return false;
10128 if (Subtarget.hasSSE41() || BitSize == 8) {
10129 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10130 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10131 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10132 V1 = N1;
10133 V2 = N2;
10134 SrcVT = PackVT;
10135 PackOpcode = X86ISD::PACKUS;
10136 return true;
10139 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10140 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10141 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10142 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10143 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10144 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10145 V1 = N1;
10146 V2 = N2;
10147 SrcVT = PackVT;
10148 PackOpcode = X86ISD::PACKSS;
10149 return true;
10151 return false;
10154 // Attempt to match against wider and wider compaction patterns.
10155 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10156 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10157 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10159 // Try binary shuffle.
10160 SmallVector<int, 32> BinaryMask;
10161 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10162 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10163 if (MatchPACK(V1, V2, PackVT))
10164 return true;
10166 // Try unary shuffle.
10167 SmallVector<int, 32> UnaryMask;
10168 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10169 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10170 if (MatchPACK(V1, V1, PackVT))
10171 return true;
10174 return false;
10177 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10178 SDValue V1, SDValue V2, SelectionDAG &DAG,
10179 const X86Subtarget &Subtarget) {
10180 MVT PackVT;
10181 unsigned PackOpcode;
10182 unsigned SizeBits = VT.getSizeInBits();
10183 unsigned EltBits = VT.getScalarSizeInBits();
10184 unsigned MaxStages = Log2_32(64 / EltBits);
10185 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10186 Subtarget, MaxStages))
10187 return SDValue();
10189 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10190 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10192 // Don't lower multi-stage packs on AVX512, truncation is better.
10193 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10194 return SDValue();
10196 // Pack to the largest type possible:
10197 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10198 unsigned MaxPackBits = 16;
10199 if (CurrentEltBits > 16 &&
10200 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10201 MaxPackBits = 32;
10203 // Repeatedly pack down to the target size.
10204 SDValue Res;
10205 for (unsigned i = 0; i != NumStages; ++i) {
10206 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10207 unsigned NumSrcElts = SizeBits / SrcEltBits;
10208 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10209 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10210 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10211 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10212 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10213 DAG.getBitcast(SrcVT, V2));
10214 V1 = V2 = Res;
10215 CurrentEltBits /= 2;
10217 assert(Res && Res.getValueType() == VT &&
10218 "Failed to lower compaction shuffle");
10219 return Res;
10222 /// Try to emit a bitmask instruction for a shuffle.
10224 /// This handles cases where we can model a blend exactly as a bitmask due to
10225 /// one of the inputs being zeroable.
10226 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10227 SDValue V2, ArrayRef<int> Mask,
10228 const APInt &Zeroable,
10229 const X86Subtarget &Subtarget,
10230 SelectionDAG &DAG) {
10231 MVT MaskVT = VT;
10232 MVT EltVT = VT.getVectorElementType();
10233 SDValue Zero, AllOnes;
10234 // Use f64 if i64 isn't legal.
10235 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10236 EltVT = MVT::f64;
10237 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10240 MVT LogicVT = VT;
10241 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10242 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10243 APFloat AllOnesValue =
10244 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10245 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10246 LogicVT =
10247 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10248 } else {
10249 Zero = DAG.getConstant(0, DL, EltVT);
10250 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10253 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10254 SDValue V;
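// e.g. for a v4i32 shuffle <0,1,z,3> (element 2 zeroable) this selects V1 and
// builds the mask <-1,-1,0,-1>, so the blend becomes a single AND.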
10255 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10256 if (Zeroable[i])
10257 continue;
10258 if (Mask[i] % Size != i)
10259 return SDValue(); // Not a blend.
10260 if (!V)
10261 V = Mask[i] < Size ? V1 : V2;
10262 else if (V != (Mask[i] < Size ? V1 : V2))
10263 return SDValue(); // Can only let one input through the mask.
10265 VMaskOps[i] = AllOnes;
10267 if (!V)
10268 return SDValue(); // No non-zeroable elements!
10270 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10271 VMask = DAG.getBitcast(LogicVT, VMask);
10272 V = DAG.getBitcast(LogicVT, V);
10273 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10274 return DAG.getBitcast(VT, And);
10277 /// Try to emit a blend instruction for a shuffle using bit math.
10279 /// This is used as a fallback approach when first class blend instructions are
10280 /// unavailable. Currently it is only suitable for integer vectors, but could
10281 /// be generalized for floating point vectors if desirable.
10282 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10283 SDValue V2, ArrayRef<int> Mask,
10284 SelectionDAG &DAG) {
10285 assert(VT.isInteger() && "Only supports integer vector types!");
10286 MVT EltVT = VT.getVectorElementType();
10287 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10288 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
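// e.g. a v8i16 shuffle <0,9,2,11,4,13,6,15> builds the mask
// <-1,0,-1,0,-1,0,-1,0>, selecting V1 where the mask is all-ones and V2 where
// it is zero.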
10289 SmallVector<SDValue, 16> MaskOps;
10290 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10291 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10292 return SDValue(); // Shuffled input!
10293 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10296 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10297 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10300 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10301 SDValue PreservedSrc,
10302 const X86Subtarget &Subtarget,
10303 SelectionDAG &DAG);
10305 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10306 MutableArrayRef<int> Mask,
10307 const APInt &Zeroable, bool &ForceV1Zero,
10308 bool &ForceV2Zero, uint64_t &BlendMask) {
10309 bool V1IsZeroOrUndef =
10310 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10311 bool V2IsZeroOrUndef =
10312 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10314 BlendMask = 0;
10315 ForceV1Zero = false, ForceV2Zero = false;
10316 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10318 int NumElts = Mask.size();
10319 int NumLanes = VT.getSizeInBits() / 128;
10320 int NumEltsPerLane = NumElts / NumLanes;
10321 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10323 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10324 // then ensure the blend mask part for that lane just references that input.
10325 bool ForceWholeLaneMasks =
10326 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10328 // Attempt to generate the binary blend mask. If an input is zero then
10329 // we can use any lane.
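// e.g. for a v8i32 shuffle <0,9,2,11,4,13,6,15> every odd element comes from
// V2, so the loop below builds BlendMask == 0b10101010.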
10330 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10331 // Keep track of the inputs used per lane.
10332 bool LaneV1InUse = false;
10333 bool LaneV2InUse = false;
10334 uint64_t LaneBlendMask = 0;
10335 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10336 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10337 int M = Mask[Elt];
10338 if (M == SM_SentinelUndef)
10339 continue;
10340 if (M == Elt || (0 <= M && M < NumElts &&
10341 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10342 Mask[Elt] = Elt;
10343 LaneV1InUse = true;
10344 continue;
10346 if (M == (Elt + NumElts) ||
10347 (NumElts <= M &&
10348 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10349 LaneBlendMask |= 1ull << LaneElt;
10350 Mask[Elt] = Elt + NumElts;
10351 LaneV2InUse = true;
10352 continue;
10354 if (Zeroable[Elt]) {
10355 if (V1IsZeroOrUndef) {
10356 ForceV1Zero = true;
10357 Mask[Elt] = Elt;
10358 LaneV1InUse = true;
10359 continue;
10361 if (V2IsZeroOrUndef) {
10362 ForceV2Zero = true;
10363 LaneBlendMask |= 1ull << LaneElt;
10364 Mask[Elt] = Elt + NumElts;
10365 LaneV2InUse = true;
10366 continue;
10369 return false;
10372 // If we only used V2 then splat the lane blend mask to avoid any demanded
10373 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10374 // blend mask bit).
10375 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10376 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10378 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10380 return true;
10383 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
10384 int Scale) {
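// Expands each blend-mask bit into Scale consecutive bits, e.g. scaling the
// v8i16 blend mask 0b00001111 by 2 for a v16i8 blend gives 0b0000000011111111.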
10385 uint64_t ScaledMask = 0;
10386 for (int i = 0; i != Size; ++i)
10387 if (BlendMask & (1ull << i))
10388 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
10389 return ScaledMask;
10392 /// Try to emit a blend instruction for a shuffle.
10394 /// This doesn't do any checks for the availability of instructions for blending
10395 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10396 /// be matched in the backend with the type given. What it does check for is
10397 /// that the shuffle mask is a blend, or convertible into a blend with zero.
10398 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10399 SDValue V2, ArrayRef<int> Original,
10400 const APInt &Zeroable,
10401 const X86Subtarget &Subtarget,
10402 SelectionDAG &DAG) {
10403 uint64_t BlendMask = 0;
10404 bool ForceV1Zero = false, ForceV2Zero = false;
10405 SmallVector<int, 64> Mask(Original);
10406 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10407 BlendMask))
10408 return SDValue();
10410 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10411 if (ForceV1Zero)
10412 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10413 if (ForceV2Zero)
10414 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10416 unsigned NumElts = VT.getVectorNumElements();
10418 switch (VT.SimpleTy) {
10419 case MVT::v4i64:
10420 case MVT::v8i32:
10421 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10422 [[fallthrough]];
10423 case MVT::v4f64:
10424 case MVT::v8f32:
10425 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10426 [[fallthrough]];
10427 case MVT::v2f64:
10428 case MVT::v2i64:
10429 case MVT::v4f32:
10430 case MVT::v4i32:
10431 case MVT::v8i16:
10432 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10433 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10434 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10435 case MVT::v16i16: {
10436 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10437 SmallVector<int, 8> RepeatedMask;
10438 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10439 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10440 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10441 BlendMask = 0;
10442 for (int i = 0; i < 8; ++i)
10443 if (RepeatedMask[i] >= 8)
10444 BlendMask |= 1ull << i;
10445 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10446 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10448 // Use PBLENDW for lower/upper lanes and then blend lanes.
10449 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10450 // merge to VSELECT where useful.
10451 uint64_t LoMask = BlendMask & 0xFF;
10452 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10453 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10454 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10455 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10456 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10457 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10458 return DAG.getVectorShuffle(
10459 MVT::v16i16, DL, Lo, Hi,
10460 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10462 [[fallthrough]];
10464 case MVT::v32i8:
10465 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10466 [[fallthrough]];
10467 case MVT::v16i8: {
10468 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10470 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10471 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10472 Subtarget, DAG))
10473 return Masked;
10475 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10476 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10477 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10478 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10481 // If we have VPTERNLOG, we can use that as a bit blend.
10482 if (Subtarget.hasVLX())
10483 if (SDValue BitBlend =
10484 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10485 return BitBlend;
10487 // Scale the blend by the number of bytes per element.
10488 int Scale = VT.getScalarSizeInBits() / 8;
10490 // This form of blend is always done on bytes. Compute the byte vector
10491 // type.
10492 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10494 // x86 allows load folding with blendvb from the 2nd source operand. But
10495 // we are still using LLVM select here (see comment below), so that's V1.
10496 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10497 // allow that load-folding possibility.
10498 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10499 ShuffleVectorSDNode::commuteMask(Mask);
10500 std::swap(V1, V2);
10503 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10504 // mix of LLVM's code generator and the x86 backend. We tell the code
10505 // generator that boolean values in the elements of an x86 vector register
10506 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10507 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10508 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10509 // of the element (the remaining are ignored) and 0 in that high bit would
10510 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10511 // the LLVM model for boolean values in vector elements gets the relevant
10512     // bit set, it is set backwards and over-constrained relative to x86's
10513 // actual model.
10514 SmallVector<SDValue, 32> VSELECTMask;
10515 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10516 for (int j = 0; j < Scale; ++j)
10517 VSELECTMask.push_back(
10518 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10519 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10520 MVT::i8));
10522 V1 = DAG.getBitcast(BlendVT, V1);
10523 V2 = DAG.getBitcast(BlendVT, V2);
10524 return DAG.getBitcast(
10526 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10527 V1, V2));
10529 case MVT::v16f32:
10530 case MVT::v8f64:
10531 case MVT::v8i64:
10532 case MVT::v16i32:
10533 case MVT::v32i16:
10534 case MVT::v64i8: {
10535 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10536 bool OptForSize = DAG.shouldOptForSize();
10537 if (!OptForSize) {
10538 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10539 Subtarget, DAG))
10540 return Masked;
10543 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10544 // masked move.
10545 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10546 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10547 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10549 default:
10550 llvm_unreachable("Not a supported integer vector type!");
10554 /// Try to lower as a blend of elements from two inputs followed by
10555 /// a single-input permutation.
10557 /// This matches the pattern where we can blend elements from two inputs and
10558 /// then reduce the shuffle to a single-input permutation.
10559 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10560 SDValue V1, SDValue V2,
10561 ArrayRef<int> Mask,
10562 SelectionDAG &DAG,
10563 bool ImmBlends = false) {
10564 // We build up the blend mask while checking whether a blend is a viable way
10565 // to reduce the shuffle.
10566 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10567 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
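// e.g. the v4i32 mask <5,0,7,2> is matched as the blend <0,5,2,7> followed by
// the single-input permute <1,0,3,2>.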
10569 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10570 if (Mask[i] < 0)
10571 continue;
10573 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10575 if (BlendMask[Mask[i] % Size] < 0)
10576 BlendMask[Mask[i] % Size] = Mask[i];
10577 else if (BlendMask[Mask[i] % Size] != Mask[i])
10578 return SDValue(); // Can't blend in the needed input!
10580 PermuteMask[i] = Mask[i] % Size;
10583   // If we're limited to immediate blends, bail if the blend mask can't be
10584   // widened to i16.
10585 unsigned EltSize = VT.getScalarSizeInBits();
10586 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10587 return SDValue();
10589 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10590 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10593 /// Try to lower as an unpack of elements from two inputs followed by
10594 /// a single-input permutation.
10596 /// This matches the pattern where we can unpack elements from two inputs and
10597 /// then reduce the shuffle to a single-input (wider) permutation.
10598 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10599 SDValue V1, SDValue V2,
10600 ArrayRef<int> Mask,
10601 SelectionDAG &DAG) {
10602 int NumElts = Mask.size();
10603 int NumLanes = VT.getSizeInBits() / 128;
10604 int NumLaneElts = NumElts / NumLanes;
10605 int NumHalfLaneElts = NumLaneElts / 2;
10607 bool MatchLo = true, MatchHi = true;
10608 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10610 // Determine UNPCKL/UNPCKH type and operand order.
10611 for (int Elt = 0; Elt != NumElts; ++Elt) {
10612 int M = Mask[Elt];
10613 if (M < 0)
10614 continue;
10616 // Normalize the mask value depending on whether it's V1 or V2.
10617 int NormM = M;
10618 SDValue &Op = Ops[Elt & 1];
10619 if (M < NumElts && (Op.isUndef() || Op == V1))
10620 Op = V1;
10621 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10622 Op = V2;
10623 NormM -= NumElts;
10624 } else
10625 return SDValue();
10627 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10628 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10629 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10630 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10631 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10632 if (MatchLoAnyLane || MatchHiAnyLane) {
10633 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10634 "Failed to match UNPCKLO/UNPCKHI");
10635 break;
10638 MatchLo &= MatchLoAnyLane;
10639 MatchHi &= MatchHiAnyLane;
10640 if (!MatchLo && !MatchHi)
10641 return SDValue();
10643 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10645   // Element indices have changed after unpacking. Calculate a permute mask
10646   // so that they are put back into the positions dictated by the
10647   // original shuffle mask indices.
10648 SmallVector<int, 32> PermuteMask(NumElts, -1);
10649 for (int Elt = 0; Elt != NumElts; ++Elt) {
10650 int M = Mask[Elt];
10651 if (M < 0)
10652 continue;
10653 int NormM = M;
10654 if (NumElts <= M)
10655 NormM -= NumElts;
10656 bool IsFirstOp = M < NumElts;
10657 int BaseMaskElt =
10658 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10659 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10660 PermuteMask[Elt] = BaseMaskElt;
10661 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10662 PermuteMask[Elt] = BaseMaskElt + 1;
10663 assert(PermuteMask[Elt] != -1 &&
10664 "Input mask element is defined but failed to assign permute mask");
10667 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10668 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10669 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10672 /// Try to lower a shuffle as a permute of the inputs followed by an
10673 /// UNPCK instruction.
10675 /// This specifically targets cases where we end up alternating between
10676 /// the two inputs, and so can permute them into something that feeds a single
10677 /// UNPCK instruction. Note that this routine only targets integer vectors
10678 /// because for floating point vectors we have a generalized SHUFPS lowering
10679 /// strategy that handles everything that doesn't *exactly* match an unpack,
10680 /// making this clever lowering unnecessary.
10681 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10682 SDValue V1, SDValue V2,
10683 ArrayRef<int> Mask,
10684 const X86Subtarget &Subtarget,
10685 SelectionDAG &DAG) {
10686 int Size = Mask.size();
10687 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10689 // This routine only supports 128-bit integer dual input vectors.
10690 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10691 return SDValue();
10693 int NumLoInputs =
10694 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10695 int NumHiInputs =
10696 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10698 bool UnpackLo = NumLoInputs >= NumHiInputs;
10700 auto TryUnpack = [&](int ScalarSize, int Scale) {
10701 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10702 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10704 for (int i = 0; i < Size; ++i) {
10705 if (Mask[i] < 0)
10706 continue;
10708 // Each element of the unpack contains Scale elements from this mask.
10709 int UnpackIdx = i / Scale;
10711 // We only handle the case where V1 feeds the first slots of the unpack.
10712 // We rely on canonicalization to ensure this is the case.
10713 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10714 return SDValue();
10716       // Set up the mask for this input. The indexing is tricky as we have to
10717 // handle the unpack stride.
10718 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10719 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10720 Mask[i] % Size;
10723 // If we will have to shuffle both inputs to use the unpack, check whether
10724 // we can just unpack first and shuffle the result. If so, skip this unpack.
10725 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10726 !isNoopShuffleMask(V2Mask))
10727 return SDValue();
10729 // Shuffle the inputs into place.
10730 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10731 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10733 // Cast the inputs to the type we will use to unpack them.
10734 MVT UnpackVT =
10735 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10736 V1 = DAG.getBitcast(UnpackVT, V1);
10737 V2 = DAG.getBitcast(UnpackVT, V2);
10739 // Unpack the inputs and cast the result back to the desired type.
10740 return DAG.getBitcast(
10741 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10742 UnpackVT, V1, V2));
10745 // We try each unpack from the largest to the smallest to try and find one
10746 // that fits this mask.
10747 int OrigScalarSize = VT.getScalarSizeInBits();
10748 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10749 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10750 return Unpack;
10752 // If we're shuffling with a zero vector then we're better off not doing
10753 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10754 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10755 ISD::isBuildVectorAllZeros(V2.getNode()))
10756 return SDValue();
10758 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10759 // initial unpack.
10760 if (NumLoInputs == 0 || NumHiInputs == 0) {
10761 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10762 "We have to have *some* inputs!");
10763 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10765 // FIXME: We could consider the total complexity of the permute of each
10766 // possible unpacking. Or at the least we should consider how many
10767 // half-crossings are created.
10768 // FIXME: We could consider commuting the unpacks.
10770 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10771 for (int i = 0; i < Size; ++i) {
10772 if (Mask[i] < 0)
10773 continue;
10775 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10777 PermMask[i] =
10778 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10780 return DAG.getVectorShuffle(
10781 VT, DL,
10782 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
10783 V1, V2),
10784 DAG.getUNDEF(VT), PermMask);
10787 return SDValue();
10790 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
10791 /// permuting the elements of the result in place.
10792 static SDValue lowerShuffleAsByteRotateAndPermute(
10793 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10794 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10795 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
10796 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
10797 (VT.is512BitVector() && !Subtarget.hasBWI()))
10798 return SDValue();
10800 // We don't currently support lane crossing permutes.
10801 if (is128BitLaneCrossingShuffleMask(VT, Mask))
10802 return SDValue();
10804 int Scale = VT.getScalarSizeInBits() / 8;
10805 int NumLanes = VT.getSizeInBits() / 128;
10806 int NumElts = VT.getVectorNumElements();
10807 int NumEltsPerLane = NumElts / NumLanes;
10809 // Determine range of mask elts.
10810 bool Blend1 = true;
10811 bool Blend2 = true;
10812 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
10813 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
10814 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10815 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10816 int M = Mask[Lane + Elt];
10817 if (M < 0)
10818 continue;
10819 if (M < NumElts) {
10820 Blend1 &= (M == (Lane + Elt));
10821 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10822 M = M % NumEltsPerLane;
10823 Range1.first = std::min(Range1.first, M);
10824 Range1.second = std::max(Range1.second, M);
10825 } else {
10826 M -= NumElts;
10827 Blend2 &= (M == (Lane + Elt));
10828 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10829 M = M % NumEltsPerLane;
10830 Range2.first = std::min(Range2.first, M);
10831 Range2.second = std::max(Range2.second, M);
10836 // Bail if we don't need both elements.
10837 // TODO - it might be worth doing this for unary shuffles if the permute
10838 // can be widened.
10839 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
10840 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
10841 return SDValue();
10843 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
10844 return SDValue();
10846 // Rotate the 2 ops so we can access both ranges, then permute the result.
10847 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
10848 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10849 SDValue Rotate = DAG.getBitcast(
10850 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
10851 DAG.getBitcast(ByteVT, Lo),
10852 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
10853 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
10854 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10855 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10856 int M = Mask[Lane + Elt];
10857 if (M < 0)
10858 continue;
10859 if (M < NumElts)
10860 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
10861 else
10862 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
10865 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
10868 // Check if the ranges are small enough to rotate from either direction.
10869 if (Range2.second < Range1.first)
10870 return RotateAndPermute(V1, V2, Range1.first, 0);
10871 if (Range1.second < Range2.first)
10872 return RotateAndPermute(V2, V1, Range2.first, NumElts);
10873 return SDValue();
10876 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
10877 return isUndefOrEqual(Mask, 0);
10880 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
10881 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
10884 /// Check if the Mask consists of the same element repeated multiple times.
10885 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
10886 size_t NumUndefs = 0;
10887 std::optional<int> UniqueElt;
10888 for (int Elt : Mask) {
10889 if (Elt == SM_SentinelUndef) {
10890 NumUndefs++;
10891 continue;
10893 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
10894 return false;
10895 UniqueElt = Elt;
10897 // Make sure the element is repeated enough times by checking the number of
10898 // undefs is small.
10899 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
10902 /// Generic routine to decompose a shuffle and blend into independent
10903 /// blends and permutes.
10905 /// This matches the extremely common pattern for handling combined
10906 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
10907 /// operations. It will try to pick the best arrangement of shuffles and
10908 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
10909 static SDValue lowerShuffleAsDecomposedShuffleMerge(
10910 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10911 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10912 int NumElts = Mask.size();
10913 int NumLanes = VT.getSizeInBits() / 128;
10914 int NumEltsPerLane = NumElts / NumLanes;
10916 // Shuffle the input elements into the desired positions in V1 and V2 and
10917 // unpack/blend them together.
10918 bool IsAlternating = true;
10919 SmallVector<int, 32> V1Mask(NumElts, -1);
10920 SmallVector<int, 32> V2Mask(NumElts, -1);
10921 SmallVector<int, 32> FinalMask(NumElts, -1);
10922 for (int i = 0; i < NumElts; ++i) {
10923 int M = Mask[i];
10924 if (M >= 0 && M < NumElts) {
10925 V1Mask[i] = M;
10926 FinalMask[i] = i;
10927 IsAlternating &= (i & 1) == 0;
10928 } else if (M >= NumElts) {
10929 V2Mask[i] = M - NumElts;
10930 FinalMask[i] = i + NumElts;
10931 IsAlternating &= (i & 1) == 1;
10935   // If we effectively demand only the 0'th element of \p Input (though not
10936   // necessarily only in the 0'th position), then broadcast said input
10937   // and change \p InputMask to be a no-op (identity) mask.
10938 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
10939 &DAG](SDValue &Input,
10940 MutableArrayRef<int> InputMask) {
10941 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
10942 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
10943 !X86::mayFoldLoad(Input, Subtarget)))
10944 return;
10945 if (isNoopShuffleMask(InputMask))
10946 return;
10947 assert(isBroadcastShuffleMask(InputMask) &&
10948 "Expected to demand only the 0'th element.");
10949 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
10950 for (auto I : enumerate(InputMask)) {
10951 int &InputMaskElt = I.value();
10952 if (InputMaskElt >= 0)
10953 InputMaskElt = I.index();
10957 // Currently, we may need to produce one shuffle per input, and blend results.
10958 // It is possible that the shuffle for one of the inputs is already a no-op.
10959 // See if we can simplify non-no-op shuffles into broadcasts,
10960 // which we consider to be strictly better than an arbitrary shuffle.
10961 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
10962 isNoopOrBroadcastShuffleMask(V2Mask)) {
10963 canonicalizeBroadcastableInput(V1, V1Mask);
10964 canonicalizeBroadcastableInput(V2, V2Mask);
10967 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
10968 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
10969 // the shuffle may be able to fold with a load or other benefit. However, when
10970 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
10971 // pre-shuffle first is a better strategy.
10972 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
10973 // Only prefer immediate blends to unpack/rotate.
10974 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
10975 DAG, true))
10976 return BlendPerm;
10977 // If either input vector provides only a single element which is repeated
10978 // multiple times, unpacking from both input vectors would generate worse
10979 // code. e.g. for
10980 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
10981 // it is better to process t4 first to create a vector of t4[0], then unpack
10982 // that vector with t2.
10983 if (!isSingleElementRepeatedMask(V1Mask) &&
10984 !isSingleElementRepeatedMask(V2Mask))
10985 if (SDValue UnpackPerm =
10986 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
10987 return UnpackPerm;
10988 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
10989 DL, VT, V1, V2, Mask, Subtarget, DAG))
10990 return RotatePerm;
10991 // Unpack/rotate failed - try again with variable blends.
10992 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
10993 DAG))
10994 return BlendPerm;
10995 if (VT.getScalarSizeInBits() >= 32)
10996 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
10997 DL, VT, V1, V2, Mask, Subtarget, DAG))
10998 return PermUnpack;
11001 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11002 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11003 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11004 // than half the elements coming from each source.
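// e.g. the v16i8 mask <0,17,2,19,...,14,31> becomes shuffles compacting the
// even bytes of V1 and the odd bytes of V2 into the low half of each lane,
// with FinalMask rewritten to the interleave <0,16,1,17,...> that will lower
// to UNPCKLBW.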
11005 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11006 V1Mask.assign(NumElts, -1);
11007 V2Mask.assign(NumElts, -1);
11008 FinalMask.assign(NumElts, -1);
11009 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11010 for (int j = 0; j != NumEltsPerLane; ++j) {
11011 int M = Mask[i + j];
11012 if (M >= 0 && M < NumElts) {
11013 V1Mask[i + (j / 2)] = M;
11014 FinalMask[i + j] = i + (j / 2);
11015 } else if (M >= NumElts) {
11016 V2Mask[i + (j / 2)] = M - NumElts;
11017 FinalMask[i + j] = i + (j / 2) + NumElts;
11022 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11023 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11024 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11027 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11028 const X86Subtarget &Subtarget,
11029 ArrayRef<int> Mask) {
11030 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11031 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11033 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11034 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11035 int MaxSubElts = 64 / EltSizeInBits;
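// e.g. with XOP, the v16i8 mask <1,0,3,2,5,4,...> matches a v8i16 bit-rotate
// by 8 (RotateVT == v8i16, returned RotateAmt == 8); plain AVX512 would need
// at least an i32-sized sub-group.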
11036 unsigned RotateAmt, NumSubElts;
11037 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11038 MaxSubElts, NumSubElts, RotateAmt))
11039 return -1;
11040 unsigned NumElts = Mask.size();
11041 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11042 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11043 return RotateAmt;
11046 /// Lower shuffle using X86ISD::VROTLI rotations.
11047 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11048 ArrayRef<int> Mask,
11049 const X86Subtarget &Subtarget,
11050 SelectionDAG &DAG) {
11051 // Only XOP + AVX512 targets have bit rotation instructions.
11052 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11053 bool IsLegal =
11054 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11055 if (!IsLegal && Subtarget.hasSSE3())
11056 return SDValue();
11058 MVT RotateVT;
11059 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11060 Subtarget, Mask);
11061 if (RotateAmt < 0)
11062 return SDValue();
11064 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11065 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11066   // widen to vXi16 or more then the existing lowering will be better.
11067 if (!IsLegal) {
11068 if ((RotateAmt % 16) == 0)
11069 return SDValue();
11070 // TODO: Use getTargetVShiftByConstNode.
11071 unsigned ShlAmt = RotateAmt;
11072 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11073 V1 = DAG.getBitcast(RotateVT, V1);
11074 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11075 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11076 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11077 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11078 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11079 return DAG.getBitcast(VT, Rot);
11082 SDValue Rot =
11083 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11084 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11085 return DAG.getBitcast(VT, Rot);
11088 /// Try to match a vector shuffle as an element rotation.
11090 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11091 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11092 ArrayRef<int> Mask) {
11093 int NumElts = Mask.size();
11095 // We need to detect various ways of spelling a rotation:
11096 // [11, 12, 13, 14, 15, 0, 1, 2]
11097 // [-1, 12, 13, 14, -1, -1, 1, -1]
11098 // [-1, -1, -1, -1, -1, -1, 1, 2]
11099 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11100 // [-1, 4, 5, 6, -1, -1, 9, -1]
11101 // [-1, 4, 5, 6, -1, -1, -1, -1]
11102 int Rotation = 0;
11103 SDValue Lo, Hi;
11104 for (int i = 0; i < NumElts; ++i) {
11105 int M = Mask[i];
11106 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11107 "Unexpected mask index.");
11108 if (M < 0)
11109 continue;
11111 // Determine where a rotated vector would have started.
11112 int StartIdx = i - (M % NumElts);
11113 if (StartIdx == 0)
11114 // The identity rotation isn't interesting, stop.
11115 return -1;
11117 // If we found the tail of a vector the rotation must be the missing
11118 // front. If we found the head of a vector, it must be how much of the
11119 // head.
11120 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
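// e.g. for the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2]: element 0 (M == 11)
// gives StartIdx == -3, and element 5 (M == 0) gives StartIdx == 5; both agree
// on a rotation of 3.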
11122 if (Rotation == 0)
11123 Rotation = CandidateRotation;
11124 else if (Rotation != CandidateRotation)
11125 // The rotations don't match, so we can't match this mask.
11126 return -1;
11128 // Compute which value this mask is pointing at.
11129 SDValue MaskV = M < NumElts ? V1 : V2;
11131 // Compute which of the two target values this index should be assigned
11132 // to. This reflects whether the high elements are remaining or the low
11133 // elements are remaining.
11134 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11136 // Either set up this value if we've not encountered it before, or check
11137 // that it remains consistent.
11138 if (!TargetV)
11139 TargetV = MaskV;
11140 else if (TargetV != MaskV)
11141 // This may be a rotation, but it pulls from the inputs in some
11142 // unsupported interleaving.
11143 return -1;
11146 // Check that we successfully analyzed the mask, and normalize the results.
11147 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11148 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11149 if (!Lo)
11150 Lo = Hi;
11151 else if (!Hi)
11152 Hi = Lo;
11154 V1 = Lo;
11155 V2 = Hi;
11157 return Rotation;
11160 /// Try to lower a vector shuffle as a byte rotation.
11162 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11163 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11164 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11165 /// try to generically lower a vector shuffle through such a pattern. It
11166 /// does not check for the profitability of lowering either as PALIGNR or
11167 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11168 /// This matches shuffle vectors that look like:
11170 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11172 /// Essentially it concatenates V1 and V2, shifts right by some number of
11173 /// elements, and takes the low elements as the result. Note that while this is
11174 /// specified as a *right shift* because x86 is little-endian, it is a *left
11175 /// rotate* of the vector lanes.
11176 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11177 ArrayRef<int> Mask) {
11178 // Don't accept any shuffles with zero elements.
11179 if (isAnyZero(Mask))
11180 return -1;
11182 // PALIGNR works on 128-bit lanes.
11183 SmallVector<int, 16> RepeatedMask;
11184 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11185 return -1;
11187 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11188 if (Rotation <= 0)
11189 return -1;
11191 // PALIGNR rotates bytes, so we need to scale the
11192 // rotation based on how many bytes are in the vector lane.
11193 int NumElts = RepeatedMask.size();
11194 int Scale = 16 / NumElts;
11195 return Rotation * Scale;
11198 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11199 SDValue V2, ArrayRef<int> Mask,
11200 const X86Subtarget &Subtarget,
11201 SelectionDAG &DAG) {
11202 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11204 SDValue Lo = V1, Hi = V2;
11205 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11206 if (ByteRotation <= 0)
11207 return SDValue();
11209 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11210 // PSLLDQ/PSRLDQ.
11211 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11212 Lo = DAG.getBitcast(ByteVT, Lo);
11213 Hi = DAG.getBitcast(ByteVT, Hi);
11215 // SSSE3 targets can use the palignr instruction.
11216 if (Subtarget.hasSSSE3()) {
11217 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11218 "512-bit PALIGNR requires BWI instructions");
11219 return DAG.getBitcast(
11220 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11221 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11224 assert(VT.is128BitVector() &&
11225 "Rotate-based lowering only supports 128-bit lowering!");
11226 assert(Mask.size() <= 16 &&
11227 "Can shuffle at most 16 bytes in a 128-bit vector!");
11228 assert(ByteVT == MVT::v16i8 &&
11229 "SSE2 rotate lowering only needed for v16i8!");
11231 // Default SSE2 implementation
11232 int LoByteShift = 16 - ByteRotation;
11233 int HiByteShift = ByteRotation;
11235 SDValue LoShift =
11236 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11237 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11238 SDValue HiShift =
11239 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11240 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11241 return DAG.getBitcast(VT,
11242 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11245 /// Try to lower a vector shuffle as a dword/qword rotation.
11247 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11248 /// rotation of the concatenation of two vectors; This routine will
11249 /// try to generically lower a vector shuffle through such a pattern.
11251 /// Essentially it concatenates V1 and V2, shifts right by some number of
11252 /// elements, and takes the low elements as the result. Note that while this is
11253 /// specified as a *right shift* because x86 is little-endian, it is a *left
11254 /// rotate* of the vector lanes.
11255 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11256 SDValue V2, ArrayRef<int> Mask,
11257 const APInt &Zeroable,
11258 const X86Subtarget &Subtarget,
11259 SelectionDAG &DAG) {
11260 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11261 "Only 32-bit and 64-bit elements are supported!");
11263 // 128/256-bit vectors are only supported with VLX.
11264 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11265 && "VLX required for 128/256-bit vectors");
11267 SDValue Lo = V1, Hi = V2;
11268 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11269 if (0 < Rotation)
11270 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11271 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11273 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11274 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11275 // TODO: We can probably make this more aggressive and use shift-pairs like
11276 // lowerShuffleAsByteShiftMask.
11277 unsigned NumElts = Mask.size();
11278 unsigned ZeroLo = Zeroable.countr_one();
11279 unsigned ZeroHi = Zeroable.countl_one();
11280 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11281 if (!ZeroLo && !ZeroHi)
11282 return SDValue();
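// e.g. with ZeroLo == 2 and the remaining v8i32 elements forming the
// sequential run <0,1,2,3,4,5> from V1, the code below emits
// VALIGN(V1, zero-vector) with an immediate of NumElts - ZeroLo == 6.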
11284 if (ZeroLo) {
11285 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11286 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11287 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11288 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11289 getZeroVector(VT, Subtarget, DAG, DL),
11290 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11293 if (ZeroHi) {
11294 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11295 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11296 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11297 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11298 getZeroVector(VT, Subtarget, DAG, DL), Src,
11299 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11302 return SDValue();
11305 /// Try to lower a vector shuffle as a byte shift sequence.
11306 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11307 SDValue V2, ArrayRef<int> Mask,
11308 const APInt &Zeroable,
11309 const X86Subtarget &Subtarget,
11310 SelectionDAG &DAG) {
11311 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11312 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11314 // We need a shuffle that has zeros at one/both ends and a sequential
11315 // shuffle from one source within.
11316 unsigned ZeroLo = Zeroable.countr_one();
11317 unsigned ZeroHi = Zeroable.countl_one();
11318 if (!ZeroLo && !ZeroHi)
11319 return SDValue();
11321 unsigned NumElts = Mask.size();
11322 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11323 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11324 return SDValue();
11326 unsigned Scale = VT.getScalarSizeInBits() / 8;
11327 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11328 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11329 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11330 return SDValue();
11332 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11333 Res = DAG.getBitcast(MVT::v16i8, Res);
11335 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11336 // inner sequential set of elements, possibly offset:
11337 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11338 // 01234567 --> 4567zzzz --> zzzzz456
11339 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11340 if (ZeroLo == 0) {
11341 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11342 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11343 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11344 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11345 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11346 } else if (ZeroHi == 0) {
11347 unsigned Shift = Mask[ZeroLo] % NumElts;
11348 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11349 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11350 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11351 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11352 } else if (!Subtarget.hasSSSE3()) {
11353     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11354 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11355 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11356 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11357 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11358 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11359 Shift += Mask[ZeroLo] % NumElts;
11360 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11361 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11362 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11363 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11364 } else
11365 return SDValue();
11367 return DAG.getBitcast(VT, Res);
11370 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11372 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11373 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11374 /// matches elements from one of the input vectors shuffled to the left or
11375 /// right with zeroable elements 'shifted in'. It handles both the strictly
11376 /// bit-wise element shifts and the byte shift across an entire 128-bit double
11377 /// quad word lane.
11379 /// PSHL : (little-endian) left bit shift.
11380 /// [ zz, 0, zz, 2 ]
11381 /// [ -1, 4, zz, -1 ]
11382 /// PSRL : (little-endian) right bit shift.
11383 /// [ 1, zz, 3, zz]
11384 /// [ -1, -1, 7, zz]
11385 /// PSLLDQ : (little-endian) left byte shift
11386 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
11387 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
11388 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
11389 /// PSRLDQ : (little-endian) right byte shift
11390 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
11391 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
11392 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
11393 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11394 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11395 int MaskOffset, const APInt &Zeroable,
11396 const X86Subtarget &Subtarget) {
11397 int Size = Mask.size();
11398 unsigned SizeInBits = Size * ScalarSizeInBits;
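// For illustration: the v8i16 mask [zz, 0, 1, 2, 3, 4, 5, 6] matches below
// with Scale == 8 and Shift == 1 as a whole-lane byte shift, i.e. a VSHLDQ by
// 2 bytes.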
11400 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11401 for (int i = 0; i < Size; i += Scale)
11402 for (int j = 0; j < Shift; ++j)
11403 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11404 return false;
11406 return true;
11409 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11410 for (int i = 0; i != Size; i += Scale) {
11411 unsigned Pos = Left ? i + Shift : i;
11412 unsigned Low = Left ? i : i + Shift;
11413 unsigned Len = Scale - Shift;
11414 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11415 return -1;
11418 int ShiftEltBits = ScalarSizeInBits * Scale;
11419 bool ByteShift = ShiftEltBits > 64;
11420 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11421 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11422 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11424 // Normalize the scale for byte shifts to still produce an i64 element
11425 // type.
11426 Scale = ByteShift ? Scale / 2 : Scale;
11428 // We need to round trip through the appropriate type for the shift.
11429 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11430 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11431 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11432 return (int)ShiftAmt;
11435 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11436 // keep doubling the size of the integer elements up to that. We can
11437 // then shift the elements of the integer vector by whole multiples of
11438 // their width within the elements of the larger integer vector. Test each
11439 // multiple to see if we can find a match with the moved element indices
11440 // and that the shifted in elements are all zeroable.
11441 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11442 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11443 for (int Shift = 1; Shift != Scale; ++Shift)
11444 for (bool Left : {true, false})
11445 if (CheckZeros(Shift, Scale, Left)) {
11446 int ShiftAmt = MatchShift(Shift, Scale, Left);
11447 if (0 < ShiftAmt)
11448 return ShiftAmt;
11451 // no match
11452 return -1;
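// Illustrative sketch (not part of the original source; helper name is made
// up): once a (Shift, Scale) pair is found, the returned immediate is the
// shift amount in bits for VSHLI/VSRLI, or in bytes once the widened element
// exceeds 64 bits and the match falls back to VSHLDQ/VSRLDQ. For the v4i32
// mask [zz, 0, zz, 2] documented above the match is Scale = 2, Shift = 1,
// giving VSHLI on v2i64 by 32 bits.
static unsigned exampleShiftImmediate(unsigned Shift, unsigned Scale,
                                      unsigned ScalarSizeInBits) {
  bool ByteShift = ScalarSizeInBits * Scale > 64; // Byte shifts work in bytes.
  return Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
}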
11455 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11456 SDValue V2, ArrayRef<int> Mask,
11457 const APInt &Zeroable,
11458 const X86Subtarget &Subtarget,
11459 SelectionDAG &DAG, bool BitwiseOnly) {
11460 int Size = Mask.size();
11461 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11463 MVT ShiftVT;
11464 SDValue V = V1;
11465 unsigned Opcode;
11467 // Try to match shuffle against V1 shift.
11468 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11469 Mask, 0, Zeroable, Subtarget);
11471 // If V1 failed, try to match shuffle against V2 shift.
11472 if (ShiftAmt < 0) {
11473 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11474 Mask, Size, Zeroable, Subtarget);
11475 V = V2;
11478 if (ShiftAmt < 0)
11479 return SDValue();
11481 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11482 return SDValue();
11484 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11485 "Illegal integer vector type");
11486 V = DAG.getBitcast(ShiftVT, V);
11487 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11488 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11489 return DAG.getBitcast(VT, V);
11492 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11493 // Remainder of lower half result is zero and upper half is all undef.
11494 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11495 ArrayRef<int> Mask, uint64_t &BitLen,
11496 uint64_t &BitIdx, const APInt &Zeroable) {
11497 int Size = Mask.size();
11498 int HalfSize = Size / 2;
11499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11500 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11502 // Upper half must be undefined.
11503 if (!isUndefUpperHalf(Mask))
11504 return false;
11506 // Determine the extraction length from the part of the
11507 // lower half that isn't zeroable.
11508 int Len = HalfSize;
11509 for (; Len > 0; --Len)
11510 if (!Zeroable[Len - 1])
11511 break;
11512 assert(Len > 0 && "Zeroable shuffle mask");
11514 // Attempt to match first Len sequential elements from the lower half.
11515 SDValue Src;
11516 int Idx = -1;
11517 for (int i = 0; i != Len; ++i) {
11518 int M = Mask[i];
11519 if (M == SM_SentinelUndef)
11520 continue;
11521 SDValue &V = (M < Size ? V1 : V2);
11522 M = M % Size;
11524 // The extracted elements must start at a valid index and all mask
11525 // elements must be in the lower half.
11526 if (i > M || M >= HalfSize)
11527 return false;
11529 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11530 Src = V;
11531 Idx = M - i;
11532 continue;
11534 return false;
11537 if (!Src || Idx < 0)
11538 return false;
11540 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11541 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11542 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11543 V1 = Src;
11544 return true;
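// Illustrative sketch (not part of the original source; helper name is made
// up): EXTRQ/INSERTQ take the length and start position as 6-bit bit counts,
// which is all the masking by 0x3f above does. For a v8i16 mask
// [2, 3, zz, zz, -1, -1, -1, -1] the match is Len = 2, Idx = 2, so
// BitLen = 32 and BitIdx = 32.
static void exampleExtrqEncoding(unsigned Len, unsigned Idx,
                                 unsigned ScalarSizeInBits, uint64_t &BitLen,
                                 uint64_t &BitIdx) {
  BitLen = (Len * ScalarSizeInBits) & 0x3f; // 6-bit length field.
  BitIdx = (Idx * ScalarSizeInBits) & 0x3f; // 6-bit index field.
}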
11547 // INSERTQ: Extract lowest Len elements from lower half of second source and
11548 // insert over first source, starting at Idx.
11549 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11550 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11551 ArrayRef<int> Mask, uint64_t &BitLen,
11552 uint64_t &BitIdx) {
11553 int Size = Mask.size();
11554 int HalfSize = Size / 2;
11555 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11557 // Upper half must be undefined.
11558 if (!isUndefUpperHalf(Mask))
11559 return false;
11561 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11562 SDValue Base;
11564 // Attempt to match first source from mask before insertion point.
11565 if (isUndefInRange(Mask, 0, Idx)) {
11566 /* EMPTY */
11567 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11568 Base = V1;
11569 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11570 Base = V2;
11571 } else {
11572 continue;
11575 // Extend the extraction length looking to match both the insertion of
11576 // the second source and the remaining elements of the first.
11577 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11578 SDValue Insert;
11579 int Len = Hi - Idx;
11581 // Match insertion.
11582 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11583 Insert = V1;
11584 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11585 Insert = V2;
11586 } else {
11587 continue;
11590 // Match the remaining elements of the lower half.
11591 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11592 /* EMPTY */
11593 } else if ((!Base || (Base == V1)) &&
11594 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11595 Base = V1;
11596 } else if ((!Base || (Base == V2)) &&
11597 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11598 Size + Hi)) {
11599 Base = V2;
11600 } else {
11601 continue;
11604 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11605 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11606 V1 = Base;
11607 V2 = Insert;
11608 return true;
11612 return false;
11615 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11616 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11617 SDValue V2, ArrayRef<int> Mask,
11618 const APInt &Zeroable, SelectionDAG &DAG) {
11619 uint64_t BitLen, BitIdx;
11620 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11621 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11622 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11623 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11625 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11626 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11627 V2 ? V2 : DAG.getUNDEF(VT),
11628 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11629 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11631 return SDValue();
11634 /// Lower a vector shuffle as a zero or any extension.
11636 /// Given a specific number of elements, element bit width, and extension
11637 /// stride, produce either a zero or any extension based on the available
11638 /// features of the subtarget. The extended elements are consecutive and
11639 /// can begin at an offset element index in the input; to avoid excess
11640 /// shuffling, the offset must either be in the bottom lane or at the
11641 /// start of a higher lane. All extended elements must be from
11642 /// the same lane.
11643 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11644 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11645 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11646 assert(Scale > 1 && "Need a scale to extend.");
11647 int EltBits = VT.getScalarSizeInBits();
11648 int NumElements = VT.getVectorNumElements();
11649 int NumEltsPerLane = 128 / EltBits;
11650 int OffsetLane = Offset / NumEltsPerLane;
11651 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11652 "Only 8, 16, and 32 bit elements can be extended.");
11653 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11654 assert(0 <= Offset && "Extension offset must be positive.");
11655 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11656 "Extension offset must be in the first lane or start an upper lane.");
11658 // Check that an index is in same lane as the base offset.
11659 auto SafeOffset = [&](int Idx) {
11660 return OffsetLane == (Idx / NumEltsPerLane);
11663 // Shift along an input so that the offset base moves to the first element.
11664 auto ShuffleOffset = [&](SDValue V) {
11665 if (!Offset)
11666 return V;
11668 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11669 for (int i = 0; i * Scale < NumElements; ++i) {
11670 int SrcIdx = i + Offset;
11671 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11673 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11676 // Found a valid a/zext mask! Try various lowering strategies based on the
11677 // input type and available ISA extensions.
11678 if (Subtarget.hasSSE41()) {
11679 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11680 // PUNPCK will catch this in a later shuffle match.
11681 if (Offset && Scale == 2 && VT.is128BitVector())
11682 return SDValue();
11683 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11684 NumElements / Scale);
11685 InputV = DAG.getBitcast(VT, InputV);
11686 InputV = ShuffleOffset(InputV);
11687 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11688 DL, ExtVT, InputV, DAG);
11689 return DAG.getBitcast(VT, InputV);
11692 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11693 InputV = DAG.getBitcast(VT, InputV);
11695 // For any-extends we can cheat for larger element sizes and use shuffle
11696 // instructions that can fold with a load and/or copy.
11697 if (AnyExt && EltBits == 32) {
11698 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11699 -1};
11700 return DAG.getBitcast(
11701 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11702 DAG.getBitcast(MVT::v4i32, InputV),
11703 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11705 if (AnyExt && EltBits == 16 && Scale > 2) {
11706 int PSHUFDMask[4] = {Offset / 2, -1,
11707 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11708 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11709 DAG.getBitcast(MVT::v4i32, InputV),
11710 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11711 int PSHUFWMask[4] = {1, -1, -1, -1};
11712 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11713 return DAG.getBitcast(
11714 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11715 DAG.getBitcast(MVT::v8i16, InputV),
11716 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11719 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11720 // to 64-bits.
11721 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11722 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11723 assert(VT.is128BitVector() && "Unexpected vector width!");
11725 int LoIdx = Offset * EltBits;
11726 SDValue Lo = DAG.getBitcast(
11727 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11728 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11729 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11731 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11732 return DAG.getBitcast(VT, Lo);
11734 int HiIdx = (Offset + 1) * EltBits;
11735 SDValue Hi = DAG.getBitcast(
11736 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11737 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11738 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11739 return DAG.getBitcast(VT,
11740 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11743 // If this would require more than 2 unpack instructions to expand, use
11744 // pshufb when available. We can only use more than 2 unpack instructions
11745 // when zero extending i8 elements which also makes it easier to use pshufb.
11746 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11747 assert(NumElements == 16 && "Unexpected byte vector width!");
11748 SDValue PSHUFBMask[16];
11749 for (int i = 0; i < 16; ++i) {
11750 int Idx = Offset + (i / Scale);
11751 if ((i % Scale == 0 && SafeOffset(Idx))) {
11752 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11753 continue;
11755 PSHUFBMask[i] =
11756 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11758 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11759 return DAG.getBitcast(
11760 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11761 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11764 // If we are extending from an offset, ensure we start on a boundary that
11765 // we can unpack from.
11766 int AlignToUnpack = Offset % (NumElements / Scale);
11767 if (AlignToUnpack) {
11768 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11769 for (int i = AlignToUnpack; i < NumElements; ++i)
11770 ShMask[i - AlignToUnpack] = i;
11771 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11772 Offset -= AlignToUnpack;
11775 // Otherwise emit a sequence of unpacks.
11776 do {
11777 unsigned UnpackLoHi = X86ISD::UNPCKL;
11778 if (Offset >= (NumElements / 2)) {
11779 UnpackLoHi = X86ISD::UNPCKH;
11780 Offset -= (NumElements / 2);
11783 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
11784 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
11785 : getZeroVector(InputVT, Subtarget, DAG, DL);
11786 InputV = DAG.getBitcast(InputVT, InputV);
11787 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
11788 Scale /= 2;
11789 EltBits *= 2;
11790 NumElements /= 2;
11791 } while (Scale > 1);
11792 return DAG.getBitcast(VT, InputV);
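// Illustrative sketch (not part of the original source; helper name is made
// up): the do/while loop above issues one UNPCKL/UNPCKH against zero (or
// undef for any-extends) per doubling of the element width, i.e. log2(Scale)
// steps. Zero-extending the low bytes of a v16i8 to v4i32 (Scale = 4) takes
// two unpacks: bytes against zero, then words against zero.
static unsigned exampleNumUnpackSteps(unsigned Scale) {
  unsigned Steps = 0;
  for (; Scale > 1; Scale /= 2) // Mirrors Scale /= 2 in the loop above.
    ++Steps;
  return Steps;
}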
11795 /// Try to lower a vector shuffle as a zero extension on any microarch.
11797 /// This routine will try to do everything in its power to cleverly lower
11798 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
11799 /// check for the profitability of this lowering; it tries to aggressively
11800 /// match this pattern. It will use all of the micro-architectural details it
11801 /// can to emit an efficient lowering. It handles both blends with all-zero
11802 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
11803 /// masking out later).
11805 /// The reason we have dedicated lowering for zext-style shuffles is that they
11806 /// are both incredibly common and often quite performance sensitive.
11807 static SDValue lowerShuffleAsZeroOrAnyExtend(
11808 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11809 const APInt &Zeroable, const X86Subtarget &Subtarget,
11810 SelectionDAG &DAG) {
11811 int Bits = VT.getSizeInBits();
11812 int NumLanes = Bits / 128;
11813 int NumElements = VT.getVectorNumElements();
11814 int NumEltsPerLane = NumElements / NumLanes;
11815 assert(VT.getScalarSizeInBits() <= 32 &&
11816 "Exceeds 32-bit integer zero extension limit");
11817 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
11819 // Define a helper function to check a particular ext-scale and lower to it if
11820 // valid.
11821 auto Lower = [&](int Scale) -> SDValue {
11822 SDValue InputV;
11823 bool AnyExt = true;
11824 int Offset = 0;
11825 int Matches = 0;
11826 for (int i = 0; i < NumElements; ++i) {
11827 int M = Mask[i];
11828 if (M < 0)
11829 continue; // Valid anywhere but doesn't tell us anything.
11830 if (i % Scale != 0) {
11831 // Each of the extended elements needs to be zeroable.
11832 if (!Zeroable[i])
11833 return SDValue();
11835 // We no longer are in the anyext case.
11836 AnyExt = false;
11837 continue;
11840 // The base elements need to be consecutive indices into the
11841 // same input vector.
11842 SDValue V = M < NumElements ? V1 : V2;
11843 M = M % NumElements;
11844 if (!InputV) {
11845 InputV = V;
11846 Offset = M - (i / Scale);
11847 } else if (InputV != V)
11848 return SDValue(); // Flip-flopping inputs.
11850 // Offset must start in the lowest 128-bit lane or at the start of an
11851 // upper lane.
11852 // FIXME: Is it ever worth allowing a negative base offset?
11853 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
11854 (Offset % NumEltsPerLane) == 0))
11855 return SDValue();
11857 // If we are offsetting, all referenced entries must come from the same
11858 // lane.
11859 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
11860 return SDValue();
11862 if ((M % NumElements) != (Offset + (i / Scale)))
11863 return SDValue(); // Non-consecutive strided elements.
11864 Matches++;
11867 // If we fail to find an input, we have a zero-shuffle which should always
11868 // have already been handled.
11869 // FIXME: Maybe handle this here in case during blending we end up with one?
11870 if (!InputV)
11871 return SDValue();
11873 // If we are offsetting, don't extend if we only match a single input; we
11874 // can always do better by using a basic PSHUF or PUNPCK.
11875 if (Offset != 0 && Matches < 2)
11876 return SDValue();
11878 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
11879 InputV, Mask, Subtarget, DAG);
11882 // The widest scale possible for extending is to a 64-bit integer.
11883 assert(Bits % 64 == 0 &&
11884 "The number of bits in a vector must be divisible by 64 on x86!");
11885 int NumExtElements = Bits / 64;
11887 // Each iteration, try extending the elements half as much, but into twice as
11888 // many elements.
11889 for (; NumExtElements < NumElements; NumExtElements *= 2) {
11890 assert(NumElements % NumExtElements == 0 &&
11891 "The input vector size must be divisible by the extended size.");
11892 if (SDValue V = Lower(NumElements / NumExtElements))
11893 return V;
11896 // General extends failed, but 128-bit vectors may be able to use MOVQ.
11897 if (Bits != 128)
11898 return SDValue();
11900 // Returns one of the source operands if the shuffle can be reduced to a
11901 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
11902 auto CanZExtLowHalf = [&]() {
11903 for (int i = NumElements / 2; i != NumElements; ++i)
11904 if (!Zeroable[i])
11905 return SDValue();
11906 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
11907 return V1;
11908 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
11909 return V2;
11910 return SDValue();
11913 if (SDValue V = CanZExtLowHalf()) {
11914 V = DAG.getBitcast(MVT::v2i64, V);
11915 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
11916 return DAG.getBitcast(VT, V);
11919 // No viable ext lowering found.
11920 return SDValue();
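// Illustrative sketch (not part of the original source; helper name is made
// up): the extension scales attempted by the loop above, widest first. For a
// 128-bit v16i8 shuffle this produces {8, 4, 2}, i.e. i8->i64, i8->i32 and
// i8->i16 candidate extensions.
static void exampleCandidateExtScales(int Bits, int NumElements,
                                      SmallVectorImpl<int> &Scales) {
  for (int NumExt = Bits / 64; NumExt < NumElements; NumExt *= 2)
    Scales.push_back(NumElements / NumExt);
}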
11923 /// Try to get a scalar value for a specific element of a vector.
11925 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
11926 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
11927 SelectionDAG &DAG) {
11928 MVT VT = V.getSimpleValueType();
11929 MVT EltVT = VT.getVectorElementType();
11930 V = peekThroughBitcasts(V);
11932 // If the bitcasts shift the element size, we can't extract an equivalent
11933 // element from it.
11934 MVT NewVT = V.getSimpleValueType();
11935 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
11936 return SDValue();
11938 if (V.getOpcode() == ISD::BUILD_VECTOR ||
11939 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
11940 // Ensure the scalar operand is the same size as the destination.
11941 // FIXME: Add support for scalar truncation where possible.
11942 SDValue S = V.getOperand(Idx);
11943 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
11944 return DAG.getBitcast(EltVT, S);
11947 return SDValue();
11950 /// Helper to test for a load that can be folded with x86 shuffles.
11952 /// This is particularly important because the set of instructions varies
11953 /// significantly based on whether the operand is a load or not.
11954 static bool isShuffleFoldableLoad(SDValue V) {
11955 return V->hasOneUse() &&
11956 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
11959 template<typename T>
11960 static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
11961 T EltVT = VT.getScalarType();
11962 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
11965 /// Try to lower insertion of a single element into a zero vector.
11967 /// This is a common pattern for which we have especially efficient lowerings
11968 /// across all subtarget feature sets.
11969 static SDValue lowerShuffleAsElementInsertion(
11970 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11971 const APInt &Zeroable, const X86Subtarget &Subtarget,
11972 SelectionDAG &DAG) {
11973 MVT ExtVT = VT;
11974 MVT EltVT = VT.getVectorElementType();
11975 unsigned NumElts = VT.getVectorNumElements();
11976 unsigned EltBits = VT.getScalarSizeInBits();
11978 if (isSoftF16(EltVT, Subtarget))
11979 return SDValue();
11981 int V2Index =
11982 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
11983 Mask.begin();
11984 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
11985 bool IsV1Zeroable = true;
11986 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11987 if (i != V2Index && !Zeroable[i]) {
11988 IsV1Zeroable = false;
11989 break;
11992 // Bail if a non-zero V1 isn't used in place.
11993 if (!IsV1Zeroable) {
11994 SmallVector<int, 8> V1Mask(Mask);
11995 V1Mask[V2Index] = -1;
11996 if (!isNoopShuffleMask(V1Mask))
11997 return SDValue();
12000 // Check for a single input from a SCALAR_TO_VECTOR node.
12001 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12002 // all the smarts here sunk into that routine. However, the current
12003 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12004 // vector shuffle lowering is dead.
12005 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12006 DAG);
12007 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12008 // We need to zext the scalar if it is smaller than an i32.
12009 V2S = DAG.getBitcast(EltVT, V2S);
12010 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12011 // Using zext to expand a narrow element won't work for non-zero
12012 // insertions. But we can use a masked constant vector if we're
12013 // inserting V2 into the bottom of V1.
12014 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12015 return SDValue();
12017 // Zero-extend directly to i32.
12018 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12019 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12021 // If we're inserting into a constant, mask off the inserted index
12022 // and OR with the zero-extended scalar.
12023 if (!IsV1Zeroable) {
12024 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12025 Bits[V2Index] = APInt::getZero(EltBits);
12026 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12027 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12028 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12029 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12030 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12033 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12034 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12035 EltVT == MVT::i16) {
12036 // Either not inserting from the low element of the input or the input
12037 // element size is too small to use VZEXT_MOVL to clear the high bits.
12038 return SDValue();
12041 if (!IsV1Zeroable) {
12042 // If V1 can't be treated as a zero vector we have fewer options to lower
12043 // this. We can't support integer vectors or non-zero targets cheaply.
12044 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12045 if (!VT.isFloatingPoint() || V2Index != 0)
12046 return SDValue();
12047 if (!VT.is128BitVector())
12048 return SDValue();
12050 // Otherwise, use MOVSD, MOVSS or MOVSH.
12051 unsigned MovOpc = 0;
12052 if (EltVT == MVT::f16)
12053 MovOpc = X86ISD::MOVSH;
12054 else if (EltVT == MVT::f32)
12055 MovOpc = X86ISD::MOVSS;
12056 else if (EltVT == MVT::f64)
12057 MovOpc = X86ISD::MOVSD;
12058 else
12059 llvm_unreachable("Unsupported floating point element type to handle!");
12060 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12063 // This lowering only works for the low element with floating point vectors.
12064 if (VT.isFloatingPoint() && V2Index != 0)
12065 return SDValue();
12067 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12068 if (ExtVT != VT)
12069 V2 = DAG.getBitcast(VT, V2);
12071 if (V2Index != 0) {
12072 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12073 // the desired position. Otherwise it is more efficient to do a vector
12074 // shift left. We know that we can do a vector shift left because all
12075 // the inputs are zero.
12076 if (VT.isFloatingPoint() || NumElts <= 4) {
12077 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12078 V2Shuffle[V2Index] = 0;
12079 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12080 } else {
12081 V2 = DAG.getBitcast(MVT::v16i8, V2);
12082 V2 = DAG.getNode(
12083 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12084 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12085 V2 = DAG.getBitcast(VT, V2);
12088 return V2;
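// Illustrative sketch (not part of the original source; helper name is made
// up): when the inserted element must land above index 0 and the vector has
// more than four integer lanes, the code above moves it with a whole-vector
// byte shift; the amount is just the element index scaled to bytes (e.g.
// element 5 of a v8i16 needs a 10-byte VSHLDQ).
static unsigned exampleInsertionByteShift(unsigned V2Index, unsigned EltBits) {
  return V2Index * EltBits / 8;
}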
12091 /// Try to lower broadcast of a single - truncated - integer element,
12092 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12094 /// This assumes we have AVX2.
12095 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12096 int BroadcastIdx,
12097 const X86Subtarget &Subtarget,
12098 SelectionDAG &DAG) {
12099 assert(Subtarget.hasAVX2() &&
12100 "We can only lower integer broadcasts with AVX2!");
12102 MVT EltVT = VT.getVectorElementType();
12103 MVT V0VT = V0.getSimpleValueType();
12105 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12106 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12108 MVT V0EltVT = V0VT.getVectorElementType();
12109 if (!V0EltVT.isInteger())
12110 return SDValue();
12112 const unsigned EltSize = EltVT.getSizeInBits();
12113 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12115 // This is only a truncation if the original element type is larger.
12116 if (V0EltSize <= EltSize)
12117 return SDValue();
12119 assert(((V0EltSize % EltSize) == 0) &&
12120 "Scalar type sizes must all be powers of 2 on x86!");
12122 const unsigned V0Opc = V0.getOpcode();
12123 const unsigned Scale = V0EltSize / EltSize;
12124 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12126 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12127 V0Opc != ISD::BUILD_VECTOR)
12128 return SDValue();
12130 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12132 // If we're extracting non-least-significant bits, shift so we can truncate.
12133 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12134 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12135 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12136 if (const int OffsetIdx = BroadcastIdx % Scale)
12137 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12138 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12140 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12141 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
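// Illustrative sketch (not part of the original source; helper name is made
// up): how the truncated-broadcast index maps back onto the wider source
// scalar. Broadcasting i8 element 5 out of a v4i32 build_vector gives
// Scale = 4, so the scalar is operand 1 and it must be shifted right by
// 8 bits before the truncate.
static void exampleTruncBroadcastIdx(unsigned BroadcastIdx, unsigned EltSize,
                                     unsigned V0EltSize, unsigned &V0Idx,
                                     unsigned &ShiftBits) {
  unsigned Scale = V0EltSize / EltSize; // Small elements per source scalar.
  V0Idx = BroadcastIdx / Scale;         // Which source operand to use.
  ShiftBits = (BroadcastIdx % Scale) * EltSize; // SRL amount, if any.
}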
12144 /// Test whether this can be lowered with a single SHUFPS instruction.
12146 /// This is used to disable more specialized lowerings when the shufps lowering
12147 /// will happen to be efficient.
12148 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12149 // This routine only handles 128-bit shufps.
12150 assert(Mask.size() == 4 && "Unsupported mask size!");
12151 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12152 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12153 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12154 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12156 // To lower with a single SHUFPS we need to have the low half and high half
12157 // each requiring a single input.
12158 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12159 return false;
12160 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12161 return false;
12163 return true;
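// Illustrative examples (not part of the original source): [0, 1, 4, 5] and
// [2, 2, 7, 4] are single-SHUFPS masks because each half of the result reads
// from only one input, while [0, 4, 1, 5] is not since its low half mixes
// both inputs.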
12166 /// Test whether the specified input (0 or 1) is in-place blended by the
12167 /// given mask.
12169 /// This returns true if the elements from a particular input are already in the
12170 /// slot required by the given mask and require no permutation.
12171 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12172 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12173 int Size = Mask.size();
12174 for (int i = 0; i < Size; ++i)
12175 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12176 return false;
12178 return true;
12181 /// If we are extracting two 128-bit halves of a vector and shuffling the
12182 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12183 /// multi-shuffle lowering.
12184 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12185 SDValue N1, ArrayRef<int> Mask,
12186 SelectionDAG &DAG) {
12187 MVT VT = N0.getSimpleValueType();
12188 assert((VT.is128BitVector() &&
12189 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12190 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12192 // Check that both sources are extracts of the same source vector.
12193 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12194 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12195 N0.getOperand(0) != N1.getOperand(0) ||
12196 !N0.hasOneUse() || !N1.hasOneUse())
12197 return SDValue();
12199 SDValue WideVec = N0.getOperand(0);
12200 MVT WideVT = WideVec.getSimpleValueType();
12201 if (!WideVT.is256BitVector())
12202 return SDValue();
12204 // Match extracts of each half of the wide source vector. Commute the shuffle
12205 // if the extract of the low half is N1.
12206 unsigned NumElts = VT.getVectorNumElements();
12207 SmallVector<int, 4> NewMask(Mask);
12208 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12209 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12210 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12211 ShuffleVectorSDNode::commuteMask(NewMask);
12212 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12213 return SDValue();
12215 // Final bailout: if the mask is simple, we are better off using an extract
12216 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12217 // because that avoids a constant load from memory.
12218 if (NumElts == 4 &&
12219 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12220 return SDValue();
12222 // Extend the shuffle mask with undef elements.
12223 NewMask.append(NumElts, -1);
12225 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12226 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12227 NewMask);
12228 // This is free: ymm -> xmm.
12229 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12230 DAG.getIntPtrConstant(0, DL));
12233 /// Try to lower broadcast of a single element.
12235 /// For convenience, this code also bundles all of the subtarget feature set
12236 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12237 /// a convenient way to factor it out.
12238 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12239 SDValue V2, ArrayRef<int> Mask,
12240 const X86Subtarget &Subtarget,
12241 SelectionDAG &DAG) {
12242 MVT EltVT = VT.getVectorElementType();
12243 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12244 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12245 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12246 return SDValue();
12248 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12249 // we can only broadcast from a register with AVX2.
12250 unsigned NumEltBits = VT.getScalarSizeInBits();
12251 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12252 ? X86ISD::MOVDDUP
12253 : X86ISD::VBROADCAST;
12254 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12256 // Check that the mask is a broadcast.
12257 int BroadcastIdx = getSplatIndex(Mask);
12258 if (BroadcastIdx < 0)
12259 return SDValue();
12260 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12261 "a sorted mask where the broadcast "
12262 "comes from V1.");
12264 // Go up the chain of (vector) values to find a scalar load that we can
12265 // combine with the broadcast.
12266 // TODO: Combine this logic with findEltLoadSrc() used by
12267 // EltsFromConsecutiveLoads().
12268 int BitOffset = BroadcastIdx * NumEltBits;
12269 SDValue V = V1;
12270 for (;;) {
12271 switch (V.getOpcode()) {
12272 case ISD::BITCAST: {
12273 V = V.getOperand(0);
12274 continue;
12276 case ISD::CONCAT_VECTORS: {
12277 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12278 int OpIdx = BitOffset / OpBitWidth;
12279 V = V.getOperand(OpIdx);
12280 BitOffset %= OpBitWidth;
12281 continue;
12283 case ISD::EXTRACT_SUBVECTOR: {
12284 // The extraction index adds to the existing offset.
12285 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12286 unsigned Idx = V.getConstantOperandVal(1);
12287 unsigned BeginOffset = Idx * EltBitWidth;
12288 BitOffset += BeginOffset;
12289 V = V.getOperand(0);
12290 continue;
12292 case ISD::INSERT_SUBVECTOR: {
12293 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12294 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12295 int Idx = (int)V.getConstantOperandVal(2);
12296 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12297 int BeginOffset = Idx * EltBitWidth;
12298 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12299 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12300 BitOffset -= BeginOffset;
12301 V = VInner;
12302 } else {
12303 V = VOuter;
12305 continue;
12308 break;
12310 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12311 BroadcastIdx = BitOffset / NumEltBits;
12313 // Do we need to bitcast the source to retrieve the original broadcast index?
12314 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12316 // Check if this is a broadcast of a scalar. We special case lowering
12317 // for scalars so that we can more effectively fold with loads.
12318 // If the original value has a larger element type than the shuffle, the
12319 // broadcast element is in essence truncated. Make that explicit to ease
12320 // folding.
12321 if (BitCastSrc && VT.isInteger())
12322 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12323 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12324 return TruncBroadcast;
12326 // Also check the simpler case, where we can directly reuse the scalar.
12327 if (!BitCastSrc &&
12328 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12329 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12330 V = V.getOperand(BroadcastIdx);
12332 // If we can't broadcast from a register, check that the input is a load.
12333 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12334 return SDValue();
12335 } else if (ISD::isNormalLoad(V.getNode()) &&
12336 cast<LoadSDNode>(V)->isSimple()) {
12337 // We do not check for one-use of the vector load because a broadcast load
12338 // is expected to be a win for code size, register pressure, and possibly
12339 // uops even if the original vector load is not eliminated.
12341 // Reduce the vector load and shuffle to a broadcasted scalar load.
12342 LoadSDNode *Ld = cast<LoadSDNode>(V);
12343 SDValue BaseAddr = Ld->getOperand(1);
12344 MVT SVT = VT.getScalarType();
12345 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12346 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12347 SDValue NewAddr =
12348 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
12350 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12351 // than MOVDDUP.
12352 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12353 if (Opcode == X86ISD::VBROADCAST) {
12354 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12355 SDValue Ops[] = {Ld->getChain(), NewAddr};
12356 V = DAG.getMemIntrinsicNode(
12357 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12358 DAG.getMachineFunction().getMachineMemOperand(
12359 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12360 DAG.makeEquivalentMemoryOrdering(Ld, V);
12361 return DAG.getBitcast(VT, V);
12363 assert(SVT == MVT::f64 && "Unexpected VT!");
12364 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12365 DAG.getMachineFunction().getMachineMemOperand(
12366 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12367 DAG.makeEquivalentMemoryOrdering(Ld, V);
12368 } else if (!BroadcastFromReg) {
12369 // We can't broadcast from a vector register.
12370 return SDValue();
12371 } else if (BitOffset != 0) {
12372 // We can only broadcast from the zero-element of a vector register,
12373 // but it can be advantageous to broadcast from the zero-element of a
12374 // subvector.
12375 if (!VT.is256BitVector() && !VT.is512BitVector())
12376 return SDValue();
12378 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12379 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12380 return SDValue();
12382 // Only broadcast the zero-element of a 128-bit subvector.
12383 if ((BitOffset % 128) != 0)
12384 return SDValue();
12386 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12387 "Unexpected bit-offset");
12388 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12389 "Unexpected vector size");
12390 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12391 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12394 // On AVX we can use VBROADCAST directly for scalar sources.
12395 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12396 V = DAG.getBitcast(MVT::f64, V);
12397 if (Subtarget.hasAVX()) {
12398 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12399 return DAG.getBitcast(VT, V);
12401 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12404 // If this is a scalar, do the broadcast on this type and bitcast.
12405 if (!V.getValueType().isVector()) {
12406 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12407 "Unexpected scalar size");
12408 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12409 VT.getVectorNumElements());
12410 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12413 // We only support broadcasting from 128-bit vectors to minimize the
12414 // number of patterns we need to deal with in isel. So extract down to
12415 // 128-bits, removing as many bitcasts as possible.
12416 if (V.getValueSizeInBits() > 128)
12417 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12419 // Otherwise cast V to a vector with the same element type as VT, but
12420 // possibly narrower than VT. Then perform the broadcast.
12421 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12422 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12423 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
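// Illustrative sketch (not part of the original source; helper name is made
// up): how the bit-offset walk above re-indexes the broadcast element when
// peeking through a CONCAT_VECTORS. Broadcasting element 5 of a v8i32 built
// from two v4i32 halves gives BitOffset = 160, so the search continues in
// operand 1 at element 1.
static void exampleConcatBroadcastIdx(int BroadcastIdx, int NumEltBits,
                                      int OpBitWidth, int &OpIdx,
                                      int &NewIdx) {
  int BitOffset = BroadcastIdx * NumEltBits;
  OpIdx = BitOffset / OpBitWidth;                 // Concat operand to follow.
  NewIdx = (BitOffset % OpBitWidth) / NumEltBits; // Element index inside it.
}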
12426 // Check for whether we can use INSERTPS to perform the shuffle. We only use
12427 // INSERTPS when the V1 elements are already in the correct locations
12428 // because otherwise we can just always use two SHUFPS instructions which
12429 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12430 // perform INSERTPS if a single V1 element is out of place and all V2
12431 // elements are zeroable.
12432 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12433 unsigned &InsertPSMask,
12434 const APInt &Zeroable,
12435 ArrayRef<int> Mask, SelectionDAG &DAG) {
12436 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12437 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12438 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12440 // Attempt to match INSERTPS with one element from VA or VB being
12441 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12442 // are updated.
12443 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12444 ArrayRef<int> CandidateMask) {
12445 unsigned ZMask = 0;
12446 int VADstIndex = -1;
12447 int VBDstIndex = -1;
12448 bool VAUsedInPlace = false;
12450 for (int i = 0; i < 4; ++i) {
12451 // Synthesize a zero mask from the zeroable elements (includes undefs).
12452 if (Zeroable[i]) {
12453 ZMask |= 1 << i;
12454 continue;
12457 // Flag if we use any VA inputs in place.
12458 if (i == CandidateMask[i]) {
12459 VAUsedInPlace = true;
12460 continue;
12463 // We can only insert a single non-zeroable element.
12464 if (VADstIndex >= 0 || VBDstIndex >= 0)
12465 return false;
12467 if (CandidateMask[i] < 4) {
12468 // VA input out of place for insertion.
12469 VADstIndex = i;
12470 } else {
12471 // VB input for insertion.
12472 VBDstIndex = i;
12476 // Don't bother if we have no (non-zeroable) element for insertion.
12477 if (VADstIndex < 0 && VBDstIndex < 0)
12478 return false;
12480 // Determine element insertion src/dst indices. The src index is from the
12481 // start of the inserted vector, not the start of the concatenated vector.
12482 unsigned VBSrcIndex = 0;
12483 if (VADstIndex >= 0) {
12484 // If we have a VA input out of place, we use VA as the V2 element
12485 // insertion and don't use the original V2 at all.
12486 VBSrcIndex = CandidateMask[VADstIndex];
12487 VBDstIndex = VADstIndex;
12488 VB = VA;
12489 } else {
12490 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12493 // If no V1 inputs are used in place, then the result is created only from
12494 // the zero mask and the V2 insertion - so remove V1 dependency.
12495 if (!VAUsedInPlace)
12496 VA = DAG.getUNDEF(MVT::v4f32);
12498 // Update V1, V2 and InsertPSMask accordingly.
12499 V1 = VA;
12500 V2 = VB;
12502 // Insert the V2 element into the desired position.
12503 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12504 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12505 return true;
12508 if (matchAsInsertPS(V1, V2, Mask))
12509 return true;
12511 // Commute and try again.
12512 SmallVector<int, 4> CommutedMask(Mask);
12513 ShuffleVectorSDNode::commuteMask(CommutedMask);
12514 if (matchAsInsertPS(V2, V1, CommutedMask))
12515 return true;
12517 return false;
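// Illustrative sketch (not part of the original source; helper name is made
// up): the INSERTPS immediate layout produced above - bits [7:6] select the
// source element of V2, bits [5:4] the destination slot in V1, and bits
// [3:0] force result elements to zero. Inserting V2[2] into slot 1 while
// zeroing slot 3 encodes as 0x98.
static unsigned exampleInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                   unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xf);
}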
12520 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12521 ArrayRef<int> Mask, const APInt &Zeroable,
12522 SelectionDAG &DAG) {
12523 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12524 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12526 // Attempt to match the insertps pattern.
12527 unsigned InsertPSMask = 0;
12528 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12529 return SDValue();
12531 // Insert the V2 element into the desired position.
12532 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12533 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12536 /// Handle lowering of 2-lane 64-bit floating point shuffles.
12538 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
12539 /// support for floating point shuffles but not integer shuffles. These
12540 /// instructions will incur a domain crossing penalty on some chips though so
12541 /// it is better to avoid lowering through this for integer vectors where
12542 /// possible.
12543 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12544 const APInt &Zeroable, SDValue V1, SDValue V2,
12545 const X86Subtarget &Subtarget,
12546 SelectionDAG &DAG) {
12547 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12548 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12549 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12551 if (V2.isUndef()) {
12552 // Check for being able to broadcast a single element.
12553 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12554 Mask, Subtarget, DAG))
12555 return Broadcast;
12557 // Straight shuffle of a single input vector. Simulate this by using the
12558 // single input as both of the "inputs" to this instruction.
12559 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12561 if (Subtarget.hasAVX()) {
12562 // If we have AVX, we can use VPERMILPS which will allow folding a load
12563 // into the shuffle.
12564 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12565 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12568 return DAG.getNode(
12569 X86ISD::SHUFP, DL, MVT::v2f64,
12570 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12571 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12572 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12574 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12575 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12576 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12577 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12579 if (Subtarget.hasAVX2())
12580 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12581 return Extract;
12583 // When loading a scalar and then shuffling it into a vector we can often do
12584 // the insertion cheaply.
12585 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12586 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12587 return Insertion;
12588 // Try inverting the insertion since for v2 masks it is easy to do and we
12589 // can't reliably sort the mask one way or the other.
12590 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12591 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12592 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12593 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12594 return Insertion;
12596 // Try to use one of the special instruction patterns to handle two common
12597 // blend patterns if a zero-blend above didn't work.
12598 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12599 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12600 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12601 // We can either use a special instruction to load over the low double or
12602 // to move just the low double.
12603 return DAG.getNode(
12604 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12605 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12607 if (Subtarget.hasSSE41())
12608 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12609 Zeroable, Subtarget, DAG))
12610 return Blend;
12612 // Use dedicated unpack instructions for masks that match their pattern.
12613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12614 return V;
12616 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12617 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12618 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
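// Illustrative sketch (not part of the original source; helper name is made
// up): the two-input SHUFPD immediate built above. Bit 0 selects which
// element of V1 becomes result element 0, bit 1 selects which element of V2
// becomes result element 1 (V2's elements appear as mask values 2 and 3), so
// the mask [1, 3] encodes as 3 and [0, 2] as 0.
static unsigned exampleShufpdImm(int M0, int M1) {
  return (unsigned)(M0 == 1) | ((unsigned)(M1 == 3) << 1);
}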
12621 /// Handle lowering of 2-lane 64-bit integer shuffles.
12623 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12624 /// the integer unit to minimize domain crossing penalties. However, for blends
12625 /// it falls back to the floating point shuffle operation with appropriate bit
12626 /// casting.
12627 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12628 const APInt &Zeroable, SDValue V1, SDValue V2,
12629 const X86Subtarget &Subtarget,
12630 SelectionDAG &DAG) {
12631 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12632 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12633 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12635 if (V2.isUndef()) {
12636 // Check for being able to broadcast a single element.
12637 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12638 Mask, Subtarget, DAG))
12639 return Broadcast;
12641 // Straight shuffle of a single input vector. For everything from SSE2
12642 // onward this has a single fast instruction with no scary immediates.
12643 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12644 V1 = DAG.getBitcast(MVT::v4i32, V1);
12645 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12646 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12647 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12648 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12649 return DAG.getBitcast(
12650 MVT::v2i64,
12651 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12652 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12654 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12655 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12656 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12657 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12659 if (Subtarget.hasAVX2())
12660 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12661 return Extract;
12663 // Try to use shift instructions.
12664 if (SDValue Shift =
12665 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12666 DAG, /*BitwiseOnly*/ false))
12667 return Shift;
12669 // When loading a scalar and then shuffling it into a vector we can often do
12670 // the insertion cheaply.
12671 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12672 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12673 return Insertion;
12674 // Try inverting the insertion since for v2 masks it is easy to do and we
12675 // can't reliably sort the mask one way or the other.
12676 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12677 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12678 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12679 return Insertion;
12681 // We have different paths for blend lowering, but they all must use the
12682 // *exact* same predicate.
12683 bool IsBlendSupported = Subtarget.hasSSE41();
12684 if (IsBlendSupported)
12685 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12686 Zeroable, Subtarget, DAG))
12687 return Blend;
12689 // Use dedicated unpack instructions for masks that match their pattern.
12690 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12691 return V;
12693 // Try to use byte rotation instructions.
12694 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12695 if (Subtarget.hasSSSE3()) {
12696 if (Subtarget.hasVLX())
12697 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12698 Zeroable, Subtarget, DAG))
12699 return Rotate;
12701 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12702 Subtarget, DAG))
12703 return Rotate;
12706 // If we have direct support for blends, we should lower by decomposing into
12707 // a permute. That will be faster than the domain cross.
12708 if (IsBlendSupported)
12709 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12710 Subtarget, DAG);
12712 // We implement this with SHUFPD which is pretty lame because it will likely
12713 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12714 // However, all the alternatives are still more cycles and newer chips don't
12715 // have this problem. It would be really nice if x86 had better shuffles here.
12716 V1 = DAG.getBitcast(MVT::v2f64, V1);
12717 V2 = DAG.getBitcast(MVT::v2f64, V2);
12718 return DAG.getBitcast(MVT::v2i64,
12719 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
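// Illustrative sketch (not part of the original source; helper name is made
// up): the single-input v2i64 path above rewrites the mask for a v4i32
// PSHUFD by splitting each 64-bit element into its two 32-bit halves, so
// [1, 0] becomes [2, 3, 0, 1] and undef lanes stay undef.
static void exampleWidenV2MaskToV4(const int Mask[2], int Widened[4]) {
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}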
12722 /// Lower a vector shuffle using the SHUFPS instruction.
12724 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12725 /// It makes no assumptions about whether this is the *best* lowering; it simply
12726 /// uses it.
12727 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12728 ArrayRef<int> Mask, SDValue V1,
12729 SDValue V2, SelectionDAG &DAG) {
12730 SDValue LowV = V1, HighV = V2;
12731 SmallVector<int, 4> NewMask(Mask);
12732 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12734 if (NumV2Elements == 1) {
12735 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12737 // Compute the index adjacent to V2Index and in the same half by toggling
12738 // the low bit.
12739 int V2AdjIndex = V2Index ^ 1;
12741 if (Mask[V2AdjIndex] < 0) {
12742 // Handles all the cases where we have a single V2 element and an undef.
12743 // This will only ever happen in the high lanes because we commute the
12744 // vector otherwise.
12745 if (V2Index < 2)
12746 std::swap(LowV, HighV);
12747 NewMask[V2Index] -= 4;
12748 } else {
12749 // Handle the case where the V2 element ends up adjacent to a V1 element.
12750 // To make this work, blend them together as the first step.
12751 int V1Index = V2AdjIndex;
12752 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12753 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12754 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12756 // Now proceed to reconstruct the final blend as we have the necessary
12757 // high or low half formed.
12758 if (V2Index < 2) {
12759 LowV = V2;
12760 HighV = V1;
12761 } else {
12762 HighV = V2;
12764 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12765 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12767 } else if (NumV2Elements == 2) {
12768 if (Mask[0] < 4 && Mask[1] < 4) {
12769 // Handle the easy case where we have V1 in the low lanes and V2 in the
12770 // high lanes.
12771 NewMask[2] -= 4;
12772 NewMask[3] -= 4;
12773 } else if (Mask[2] < 4 && Mask[3] < 4) {
12774 // We also handle the reversed case because this utility may get called
12775 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12776 // arrange things in the right direction.
12777 NewMask[0] -= 4;
12778 NewMask[1] -= 4;
12779 HighV = V1;
12780 LowV = V2;
12781 } else {
12782 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
12783 // trying to place elements directly, just blend them and set up the final
12784 // shuffle to place them.
12786 // The first two blend mask elements are for V1, the second two are for
12787 // V2.
12788 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
12789 Mask[2] < 4 ? Mask[2] : Mask[3],
12790 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
12791 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
12792 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12793 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12795 // Now we do a normal shuffle of V1 by giving V1 as both operands to
12796 // a blend.
12797 LowV = HighV = V1;
12798 NewMask[0] = Mask[0] < 4 ? 0 : 2;
12799 NewMask[1] = Mask[0] < 4 ? 2 : 0;
12800 NewMask[2] = Mask[2] < 4 ? 1 : 3;
12801 NewMask[3] = Mask[2] < 4 ? 3 : 1;
12803 } else if (NumV2Elements == 3) {
12804 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
12805 // we can get here due to other paths (e.g. repeated mask matching) that we
12806 // don't want to do another round of lowerVECTOR_SHUFFLE.
12807 ShuffleVectorSDNode::commuteMask(NewMask);
12808 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
12810 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
12811 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
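// Illustrative sketch (not part of the original source; helper name is made
// up): a simplified version of the 4-element immediate used by SHUFPS and
// PSHUFD - two bits per result element, element 0 in the low bits. Undef
// lanes are mapped to 0 here, whereas the real getV4X86ShuffleImm8ForMask
// helper may choose undef replacements differently.
static unsigned exampleV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? 0 : (Mask[i] & 0x3)) << (2 * i);
  return Imm;
}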
12814 /// Lower 4-lane 32-bit floating point shuffles.
12816 /// Uses instructions exclusively from the floating point unit to minimize
12817 /// domain crossing penalties, as these are sufficient to implement all v4f32
12818 /// shuffles.
12819 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12820 const APInt &Zeroable, SDValue V1, SDValue V2,
12821 const X86Subtarget &Subtarget,
12822 SelectionDAG &DAG) {
12823 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12824 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12825 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12827 if (Subtarget.hasSSE41())
12828 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
12829 Zeroable, Subtarget, DAG))
12830 return Blend;
12832 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12834 if (NumV2Elements == 0) {
12835 // Check for being able to broadcast a single element.
12836 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
12837 Mask, Subtarget, DAG))
12838 return Broadcast;
12840 // Use even/odd duplicate instructions for masks that match their pattern.
12841 if (Subtarget.hasSSE3()) {
12842 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
12843 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
12844 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
12845 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
12848 if (Subtarget.hasAVX()) {
12849 // If we have AVX, we can use VPERMILPS which will allow folding a load
12850 // into the shuffle.
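// For instance, a reversal mask {3, 2, 1, 0} encodes as immediate 0x1B, so a
// single 'vpermilps $0x1b, (%rdi), %xmm0' suffices when V1 comes straight
// from memory (the address register here is purely illustrative).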
12851 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
12852 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12855 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
12856 // in SSE1 because otherwise they are widened to v2f64 and never get here.
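// For example, with V1 = [a, b, c, d]:
//   MOVLHPS(V1, V1) -> [a, b, a, b]   (mask {0, 1, 0, 1})
//   MOVHLPS(V1, V1) -> [c, d, c, d]   (mask {2, 3, 2, 3})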
12857 if (!Subtarget.hasSSE2()) {
12858 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
12859 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
12860 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
12861 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
12864 // Otherwise, use a straight shuffle of a single input vector. We pass the
12865 // input vector to both operands to simulate this with a SHUFPS.
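// For example (illustrative only): Mask = {3, 1, 0, 2} encodes as
// imm8 = 3 | (1 << 2) | (0 << 4) | (2 << 6) = 0x87, and SHUFPS of V1 with
// itself under that immediate produces [V1[3], V1[1], V1[0], V1[2]].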
12866 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
12867 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12870 if (Subtarget.hasSSE2())
12871 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
12872 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
12873 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
12874 return ZExt;
12877 if (Subtarget.hasAVX2())
12878 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12879 return Extract;
12881 // There are special ways we can lower some single-element blends. However, we
12882 // have custom lowerings for more complex single-element blends below that we
12883 // defer to if both this and BLENDPS fail to match, so restrict this to when
12884 // the V2 input is targeting element 0 of the mask -- that is the fast case
12885 // here.
12886 if (NumV2Elements == 1 && Mask[0] >= 4)
12887 if (SDValue V = lowerShuffleAsElementInsertion(
12888 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12889 return V;
12891 if (Subtarget.hasSSE41()) {
12892 // Use INSERTPS if we can complete the shuffle efficiently.
12893 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
12894 return V;
12896 if (!isSingleSHUFPSMask(Mask))
12897 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
12898 V2, Mask, DAG))
12899 return BlendPerm;
12902 // Use low/high mov instructions. These are only valid in SSE1 because
12903 // otherwise they are widened to v2f64 and never get here.
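// For example, with V1 = [a, b, c, d] and V2 = [e, f, g, h]:
//   MOVLHPS(V1, V2) -> [a, b, e, f]   (mask {0, 1, 4, 5})
//   MOVHLPS(V2, V1) -> [c, d, g, h]   (mask {2, 3, 6, 7})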
12904 if (!Subtarget.hasSSE2()) {
12905 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
12906 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
12907 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
12908 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
12911 // Use dedicated unpack instructions for masks that match their pattern.
12912 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
12913 return V;
12915 // Otherwise fall back to a SHUFPS lowering strategy.
12916 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
12919 /// Lower 4-lane i32 vector shuffles.
12921 /// We try to handle these with integer-domain shuffles where we can, but for
12922 /// blends we use the floating point domain blend instructions.
12923 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12924 const APInt &Zeroable, SDValue V1, SDValue V2,
12925 const X86Subtarget &Subtarget,
12926 SelectionDAG &DAG) {
12927 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12928 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12929 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12931 // Whenever we can lower this as a zext, that instruction is strictly faster
12932 // than any alternative. It also allows us to fold memory operands into the
12933 // shuffle in many cases.
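// For example, a mask like {0, Z, 1, Z} (where Z is a zeroable element) is
// effectively a PMOVZXDQ of the low two elements of V1.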
12934 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
12935 Zeroable, Subtarget, DAG))
12936 return ZExt;
12938 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12940 // Try to use shift instructions if fast.
12941 if (Subtarget.preferLowerShuffleAsShift()) {
12942 if (SDValue Shift =
12943 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
12944 Subtarget, DAG, /*BitwiseOnly*/ true))
12945 return Shift;
12946 if (NumV2Elements == 0)
12947 if (SDValue Rotate =
12948 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
12949 return Rotate;
12952 if (NumV2Elements == 0) {
12953 // Try to use broadcast unless the mask only has one non-undef element.
12954 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
12955 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
12956 Mask, Subtarget, DAG))
12957 return Broadcast;
12960 // Straight shuffle of a single input vector. For everything from SSE2
12961 // onward this has a single fast instruction with no scary immediates.
12962 // We coerce the shuffle pattern to be compatible with UNPCK instructions
12963 // but we aren't actually going to use the UNPCK instruction because doing
12964 // so prevents folding a load into this instruction or making a copy.
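// For example, PSHUFD with immediate 0x50 gives [V1[0], V1[0], V1[1], V1[1]],
// the same result as UNPCKLDQ(V1, V1), but with a single use of V1 so a load
// of V1 can still be folded.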
12965 const int UnpackLoMask[] = {0, 0, 1, 1};
12966 const int UnpackHiMask[] = {2, 2, 3, 3};
12967 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
12968 Mask = UnpackLoMask;
12969 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
12970 Mask = UnpackHiMask;
12972 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12973 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12976 if (Subtarget.hasAVX2())
12977 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12978 return Extract;
12980 // Try to use shift instructions.
12981 if (SDValue Shift =
12982 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
12983 DAG, /*BitwiseOnly*/ false))
12984 return Shift;
12986 // There are special ways we can lower some single-element blends.
12987 if (NumV2Elements == 1)
12988 if (SDValue V = lowerShuffleAsElementInsertion(
12989 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12990 return V;
12992 // We have different paths for blend lowering, but they all must use the
12993 // *exact* same predicate.
12994 bool IsBlendSupported = Subtarget.hasSSE41();
12995 if (IsBlendSupported)
12996 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
12997 Zeroable, Subtarget, DAG))
12998 return Blend;
13000 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13001 Zeroable, Subtarget, DAG))
13002 return Masked;
13004 // Use dedicated unpack instructions for masks that match their pattern.
13005 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13006 return V;
13008 // Try to use byte rotation instructions.
13009 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13010 if (Subtarget.hasSSSE3()) {
13011 if (Subtarget.hasVLX())
13012 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13013 Zeroable, Subtarget, DAG))
13014 return Rotate;
13016 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13017 Subtarget, DAG))
13018 return Rotate;
13021 // Assume that a single SHUFPS is faster than an alternative sequence of
13022 // multiple instructions (even if the CPU has a domain penalty).
13023 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13024 if (!isSingleSHUFPSMask(Mask)) {
13025 // If we have direct support for blends, we should lower by decomposing into
13026 // a permute. That will be faster than the domain cross.
13027 if (IsBlendSupported)
13028 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13029 Subtarget, DAG);
13031 // Try to lower by permuting the inputs into an unpack instruction.
13032 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13033 Mask, Subtarget, DAG))
13034 return Unpack;
13037 // We implement this with SHUFPS because it can blend from two vectors.
13038 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13039 // up the inputs, bypassing domain shift penalties that we would incur if we
13040 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13041 // relevant.
13042 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13043 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13044 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13045 return DAG.getBitcast(MVT::v4i32, ShufPS);
13048 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13049 /// shuffle lowering, and the most complex part.
13051 /// The lowering strategy is to try to form pairs of input lanes which are
13052 /// targeted at the same half of the final vector, and then use a dword shuffle
13053 /// to place them onto the right half, and finally unpack the paired lanes into
13054 /// their final position.
13056 /// The exact breakdown of how to form these dword pairs and align them on the
13057 /// correct sides is really tricky. See the comments within the function for
13058 /// more of the details.
13060 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13061 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13062 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13063 /// vector, form the analogous 128-bit 8-element Mask.
13064 static SDValue lowerV8I16GeneralSingleInputShuffle(
13065 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13066 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13067 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13068 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13070 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13071 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13072 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13074 // Attempt to directly match PSHUFLW or PSHUFHW.
13075 if (isUndefOrInRange(LoMask, 0, 4) &&
13076 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13077 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13078 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13080 if (isUndefOrInRange(HiMask, 4, 8) &&
13081 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13082 for (int i = 0; i != 4; ++i)
13083 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13084 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13085 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13088 SmallVector<int, 4> LoInputs;
13089 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13090 array_pod_sort(LoInputs.begin(), LoInputs.end());
13091 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13092 SmallVector<int, 4> HiInputs;
13093 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13094 array_pod_sort(HiInputs.begin(), HiInputs.end());
13095 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13096 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13097 int NumHToL = LoInputs.size() - NumLToL;
13098 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13099 int NumHToH = HiInputs.size() - NumLToH;
13100 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13101 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13102 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13103 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13105 // If we are shuffling values from one half, check how many different DWORD
13106 // pairs we need to create. If there are only 1 or 2, we can perform this as a
13107 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
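// For illustration: Mask = {1, 0, 1, 0, 3, 2, 3, 2} only needs the dword
// pairs (1, 0) and (3, 2), so PSHUFLW {1, 0, 3, 2} followed by
// PSHUFD {0, 0, 1, 1} reproduces it.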
13108 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13109 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13110 V = DAG.getNode(ShufWOp, DL, VT, V,
13111 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13112 V = DAG.getBitcast(PSHUFDVT, V);
13113 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13114 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13115 return DAG.getBitcast(VT, V);
13118 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13119 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13120 SmallVector<std::pair<int, int>, 4> DWordPairs;
13121 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13123 // Collect the different DWORD pairs.
13124 for (int DWord = 0; DWord != 4; ++DWord) {
13125 int M0 = Mask[2 * DWord + 0];
13126 int M1 = Mask[2 * DWord + 1];
13127 M0 = (M0 >= 0 ? M0 % 4 : M0);
13128 M1 = (M1 >= 0 ? M1 % 4 : M1);
13129 if (M0 < 0 && M1 < 0)
13130 continue;
13132 bool Match = false;
13133 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13134 auto &DWordPair = DWordPairs[j];
13135 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13136 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13137 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13138 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13139 PSHUFDMask[DWord] = DOffset + j;
13140 Match = true;
13141 break;
13144 if (!Match) {
13145 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13146 DWordPairs.push_back(std::make_pair(M0, M1));
13150 if (DWordPairs.size() <= 2) {
13151 DWordPairs.resize(2, std::make_pair(-1, -1));
13152 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13153 DWordPairs[1].first, DWordPairs[1].second};
13154 if ((NumHToL + NumHToH) == 0)
13155 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13156 if ((NumLToL + NumLToH) == 0)
13157 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13161 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13162 // such inputs we can swap two of the dwords across the half mark and end up
13163 // with <=2 inputs to each half in each half. Once there, we can fall through
13164 // to the generic code below. For example:
13166 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13167 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13169 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13170 // and an existing 2-into-2 on the other half. In this case we may have to
13171 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13172 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13173 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13174 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13175 // half than the one we target for fixing) will be fixed when we re-enter this
13176 // path. We will also combine any resulting sequence of PSHUFD instructions
13177 // into a single instruction. Here is an example of the tricky case:
13179 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13180 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13182 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13184 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13185 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13187 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13188 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13190 // The result is fine to be handled by the generic logic.
13191 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13192 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13193 int AOffset, int BOffset) {
13194 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13195 "Must call this with A having 3 or 1 inputs from the A half.");
13196 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13197 "Must call this with B having 1 or 3 inputs from the B half.");
13198 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13199 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13201 bool ThreeAInputs = AToAInputs.size() == 3;
13203 // Compute the index of dword with only one word among the three inputs in
13204 // a half by taking the sum of the half with three inputs and subtracting
13205 // the sum of the actual three inputs. The difference is the remaining
13206 // slot.
13207 int ADWord = 0, BDWord = 0;
13208 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13209 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13210 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13211 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13212 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13213 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13214 int TripleNonInputIdx =
13215 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13216 TripleDWord = TripleNonInputIdx / 2;
13218 // We use xor with one to compute the adjacent DWord to whichever one the
13219 // OneInput is in.
13220 OneInputDWord = (OneInput / 2) ^ 1;
13222 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13223 // and BToA inputs. If there is also such a problem with the BToB and AToB
13224 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13225 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13226 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13227 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13228 // Compute how many inputs will be flipped by swapping these DWords. We need
13229 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
13232 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13233 llvm::count(AToBInputs, 2 * ADWord + 1);
13234 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13235 llvm::count(BToBInputs, 2 * BDWord + 1);
13236 if ((NumFlippedAToBInputs == 1 &&
13237 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13238 (NumFlippedBToBInputs == 1 &&
13239 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13240 // We choose whether to fix the A half or B half based on whether that
13241 // half has zero flipped inputs. At zero, we may not be able to fix it
13242 // with that half. We also bias towards fixing the B half because that
13243 // will more commonly be the high half, and we have to bias one way.
13244 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13245 ArrayRef<int> Inputs) {
13246 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13247 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13248 // Determine whether the free index is in the flipped dword or the
13249 // unflipped dword based on where the pinned index is. We use this bit
13250 // in an xor to conditionally select the adjacent dword.
13251 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13252 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13253 if (IsFixIdxInput == IsFixFreeIdxInput)
13254 FixFreeIdx += 1;
13255 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13256 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13257 "We need to be changing the number of flipped inputs!");
13258 int PSHUFHalfMask[] = {0, 1, 2, 3};
13259 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13260 V = DAG.getNode(
13261 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13262 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13263 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13265 for (int &M : Mask)
13266 if (M >= 0 && M == FixIdx)
13267 M = FixFreeIdx;
13268 else if (M >= 0 && M == FixFreeIdx)
13269 M = FixIdx;
13271 if (NumFlippedBToBInputs != 0) {
13272 int BPinnedIdx =
13273 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13274 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13275 } else {
13276 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13277 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13278 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13283 int PSHUFDMask[] = {0, 1, 2, 3};
13284 PSHUFDMask[ADWord] = BDWord;
13285 PSHUFDMask[BDWord] = ADWord;
13286 V = DAG.getBitcast(
13288 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13289 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13291 // Adjust the mask to match the new locations of A and B.
13292 for (int &M : Mask)
13293 if (M >= 0 && M/2 == ADWord)
13294 M = 2 * BDWord + M % 2;
13295 else if (M >= 0 && M/2 == BDWord)
13296 M = 2 * ADWord + M % 2;
13298 // Recurse back into this routine to re-compute state now that this isn't
13299 // a 3 and 1 problem.
13300 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13302 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13303 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13304 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13305 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13307 // At this point there are at most two inputs to the low and high halves from
13308 // each half. That means the inputs can always be grouped into dwords and
13309 // those dwords can then be moved to the correct half with a dword shuffle.
13310 // We use at most one low and one high word shuffle to collect these paired
13311 // inputs into dwords, and finally a dword shuffle to place them.
13312 int PSHUFLMask[4] = {-1, -1, -1, -1};
13313 int PSHUFHMask[4] = {-1, -1, -1, -1};
13314 int PSHUFDMask[4] = {-1, -1, -1, -1};
13316 // First fix the masks for all the inputs that are staying in their
13317 // original halves. This will then dictate the targets of the cross-half
13318 // shuffles.
13319 auto fixInPlaceInputs =
13320 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13321 MutableArrayRef<int> SourceHalfMask,
13322 MutableArrayRef<int> HalfMask, int HalfOffset) {
13323 if (InPlaceInputs.empty())
13324 return;
13325 if (InPlaceInputs.size() == 1) {
13326 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13327 InPlaceInputs[0] - HalfOffset;
13328 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13329 return;
13331 if (IncomingInputs.empty()) {
13332 // Just fix all of the in place inputs.
13333 for (int Input : InPlaceInputs) {
13334 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13335 PSHUFDMask[Input / 2] = Input / 2;
13337 return;
13340 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13341 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13342 InPlaceInputs[0] - HalfOffset;
13343 // Put the second input next to the first so that they are packed into
13344 // a dword. We find the adjacent index by toggling the low bit.
13345 int AdjIndex = InPlaceInputs[0] ^ 1;
13346 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13347 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13348 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13350 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13351 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13353 // Now gather the cross-half inputs and place them into a free dword of
13354 // their target half.
13355 // FIXME: This operation could almost certainly be simplified dramatically to
13356 // look more like the 3-1 fixing operation.
13357 auto moveInputsToRightHalf = [&PSHUFDMask](
13358 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13359 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13360 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13361 int DestOffset) {
13362 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13363 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13365 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13366 int Word) {
13367 int LowWord = Word & ~1;
13368 int HighWord = Word | 1;
13369 return isWordClobbered(SourceHalfMask, LowWord) ||
13370 isWordClobbered(SourceHalfMask, HighWord);
13373 if (IncomingInputs.empty())
13374 return;
13376 if (ExistingInputs.empty()) {
13377 // Map any dwords with inputs from them into the right half.
13378 for (int Input : IncomingInputs) {
13379 // If the source half mask maps over the inputs, turn those into
13380 // swaps and use the swapped lane.
13381 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13382 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13383 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13384 Input - SourceOffset;
13385 // We have to swap the uses in our half mask in one sweep.
13386 for (int &M : HalfMask)
13387 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13388 M = Input;
13389 else if (M == Input)
13390 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13391 } else {
13392 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13393 Input - SourceOffset &&
13394 "Previous placement doesn't match!");
13396 // Note that this correctly re-maps both when we do a swap and when
13397 // we observe the other side of the swap above. We rely on that to
13398 // avoid swapping the members of the input list directly.
13399 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13402 // Map the input's dword into the correct half.
13403 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13404 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13405 else
13406 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13407 Input / 2 &&
13408 "Previous placement doesn't match!");
13411 // And just directly shift any other-half mask elements to be same-half
13412 // as we will have mirrored the dword containing the element into the
13413 // same position within that half.
13414 for (int &M : HalfMask)
13415 if (M >= SourceOffset && M < SourceOffset + 4) {
13416 M = M - SourceOffset + DestOffset;
13417 assert(M >= 0 && "This should never wrap below zero!");
13419 return;
13422 // Ensure we have the input in a viable dword of its current half. This
13423 // is particularly tricky because the original position may be clobbered
13424 // by inputs being moved and *staying* in that half.
13425 if (IncomingInputs.size() == 1) {
13426 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13427 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13428 SourceOffset;
13429 SourceHalfMask[InputFixed - SourceOffset] =
13430 IncomingInputs[0] - SourceOffset;
13431 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13432 InputFixed);
13433 IncomingInputs[0] = InputFixed;
13435 } else if (IncomingInputs.size() == 2) {
13436 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13437 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13438 // We have two non-adjacent or clobbered inputs we need to extract from
13439 // the source half. To do this, we need to map them into some adjacent
13440 // dword slot in the source mask.
13441 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13442 IncomingInputs[1] - SourceOffset};
13444 // If there is a free slot in the source half mask adjacent to one of
13445 // the inputs, place the other input in it. We use (Index XOR 1) to
13446 // compute an adjacent index.
13447 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13448 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13449 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13450 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13451 InputsFixed[1] = InputsFixed[0] ^ 1;
13452 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13453 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13454 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13455 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13456 InputsFixed[0] = InputsFixed[1] ^ 1;
13457 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13458 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13459 // The two inputs are in the same DWord but it is clobbered and the
13460 // adjacent DWord isn't used at all. Move both inputs to the free
13461 // slot.
13462 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13463 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13464 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13465 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13466 } else {
13467 // The only way we hit this point is if there is no clobbering
13468 // (because there are no off-half inputs to this half) and there is no
13469 // free slot adjacent to one of the inputs. In this case, we have to
13470 // swap an input with a non-input.
13471 for (int i = 0; i < 4; ++i)
13472 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13473 "We can't handle any clobbers here!");
13474 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13475 "Cannot have adjacent inputs here!");
13477 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13478 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13480 // We also have to update the final source mask in this case because
13481 // it may need to undo the above swap.
13482 for (int &M : FinalSourceHalfMask)
13483 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13484 M = InputsFixed[1] + SourceOffset;
13485 else if (M == InputsFixed[1] + SourceOffset)
13486 M = (InputsFixed[0] ^ 1) + SourceOffset;
13488 InputsFixed[1] = InputsFixed[0] ^ 1;
13491 // Point everything at the fixed inputs.
13492 for (int &M : HalfMask)
13493 if (M == IncomingInputs[0])
13494 M = InputsFixed[0] + SourceOffset;
13495 else if (M == IncomingInputs[1])
13496 M = InputsFixed[1] + SourceOffset;
13498 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13499 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13501 } else {
13502 llvm_unreachable("Unhandled input size!");
13505 // Now hoist the DWord down to the right half.
13506 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13507 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13508 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13509 for (int &M : HalfMask)
13510 for (int Input : IncomingInputs)
13511 if (M == Input)
13512 M = FreeDWord * 2 + Input % 2;
13514 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13515 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13516 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13517 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13519 // Now enact all the shuffles we've computed to move the inputs into their
13520 // target half.
13521 if (!isNoopShuffleMask(PSHUFLMask))
13522 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13523 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13524 if (!isNoopShuffleMask(PSHUFHMask))
13525 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13526 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13527 if (!isNoopShuffleMask(PSHUFDMask))
13528 V = DAG.getBitcast(
13530 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13531 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13533 // At this point, each half should contain all its inputs, and we can then
13534 // just shuffle them into their final position.
13535 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13536 "Failed to lift all the high half inputs to the low mask!");
13537 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13538 "Failed to lift all the low half inputs to the high mask!");
13540 // Do a half shuffle for the low mask.
13541 if (!isNoopShuffleMask(LoMask))
13542 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13543 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13545 // Do a half shuffle with the high mask after shifting its values down.
13546 for (int &M : HiMask)
13547 if (M >= 0)
13548 M -= 4;
13549 if (!isNoopShuffleMask(HiMask))
13550 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13551 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13553 return V;
13556 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13557 /// blend if only one input is used.
13558 static SDValue lowerShuffleAsBlendOfPSHUFBs(
13559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13560 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13561 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13562 "Lane crossing shuffle masks not supported");
13564 int NumBytes = VT.getSizeInBits() / 8;
13565 int Size = Mask.size();
13566 int Scale = NumBytes / Size;
13568 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13569 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13570 V1InUse = false;
13571 V2InUse = false;
13573 for (int i = 0; i < NumBytes; ++i) {
13574 int M = Mask[i / Scale];
13575 if (M < 0)
13576 continue;
13578 const int ZeroMask = 0x80;
13579 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13580 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13581 if (Zeroable[i / Scale])
13582 V1Idx = V2Idx = ZeroMask;
13584 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13585 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13586 V1InUse |= (ZeroMask != V1Idx);
13587 V2InUse |= (ZeroMask != V2Idx);
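// For example (v8i16 case, so Scale == 2): a mask element M == 9 at position
// i becomes V1 selector bytes {0x80, 0x80} (force zero) and V2 selector bytes
// {2, 3} (the bytes of V2 word 1) at byte positions 2*i and 2*i+1.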
13590 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13591 if (V1InUse)
13592 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13593 DAG.getBuildVector(ShufVT, DL, V1Mask));
13594 if (V2InUse)
13595 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13596 DAG.getBuildVector(ShufVT, DL, V2Mask));
13598 // If we need shuffled inputs from both, blend the two.
13599 SDValue V;
13600 if (V1InUse && V2InUse)
13601 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13602 else
13603 V = V1InUse ? V1 : V2;
13605 // Cast the result back to the correct type.
13606 return DAG.getBitcast(VT, V);
13609 /// Generic lowering of 8-lane i16 shuffles.
13611 /// This handles both single-input shuffles and combined shuffle/blends with
13612 /// two inputs. The single input shuffles are immediately delegated to
13613 /// a dedicated lowering routine.
13615 /// The blends are lowered in one of three fundamental ways. If there are few
13616 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13617 /// of the input is significantly cheaper when lowered as an interleaving of
13618 /// the two inputs, try to interleave them. Otherwise, blend the low and high
13619 /// halves of the inputs separately (making them have relatively few inputs)
13620 /// and then concatenate them.
13621 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13622 const APInt &Zeroable, SDValue V1, SDValue V2,
13623 const X86Subtarget &Subtarget,
13624 SelectionDAG &DAG) {
13625 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13626 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13627 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13629 // Whenever we can lower this as a zext, that instruction is strictly faster
13630 // than any alternative.
13631 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13632 Zeroable, Subtarget, DAG))
13633 return ZExt;
13635 // Try to lower using a truncation.
13636 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13637 Subtarget, DAG))
13638 return V;
13640 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13642 if (NumV2Inputs == 0) {
13643 // Try to use shift instructions.
13644 if (SDValue Shift =
13645 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13646 Subtarget, DAG, /*BitwiseOnly*/ false))
13647 return Shift;
13649 // Check for being able to broadcast a single element.
13650 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13651 Mask, Subtarget, DAG))
13652 return Broadcast;
13654 // Try to use bit rotation instructions.
13655 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13656 Subtarget, DAG))
13657 return Rotate;
13659 // Use dedicated unpack instructions for masks that match their pattern.
13660 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13661 return V;
13663 // Use dedicated pack instructions for masks that match their pattern.
13664 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13665 Subtarget))
13666 return V;
13668 // Try to use byte rotation instructions.
13669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13670 Subtarget, DAG))
13671 return Rotate;
13673 // Make a copy of the mask so it can be modified.
13674 SmallVector<int, 8> MutableMask(Mask);
13675 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13676 Subtarget, DAG);
13679 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13680 "All single-input shuffles should be canonicalized to be V1-input "
13681 "shuffles.");
13683 // Try to use shift instructions.
13684 if (SDValue Shift =
13685 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13686 DAG, /*BitwiseOnly*/ false))
13687 return Shift;
13689 // See if we can use SSE4A Extraction / Insertion.
13690 if (Subtarget.hasSSE4A())
13691 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13692 Zeroable, DAG))
13693 return V;
13695 // There are special ways we can lower some single-element blends.
13696 if (NumV2Inputs == 1)
13697 if (SDValue V = lowerShuffleAsElementInsertion(
13698 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13699 return V;
13701 // We have different paths for blend lowering, but they all must use the
13702 // *exact* same predicate.
13703 bool IsBlendSupported = Subtarget.hasSSE41();
13704 if (IsBlendSupported)
13705 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13706 Zeroable, Subtarget, DAG))
13707 return Blend;
13709 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13710 Zeroable, Subtarget, DAG))
13711 return Masked;
13713 // Use dedicated unpack instructions for masks that match their pattern.
13714 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13715 return V;
13717 // Use dedicated pack instructions for masks that match their pattern.
13718 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13719 Subtarget))
13720 return V;
13722 // Try to lower using a truncation.
13723 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13724 Subtarget, DAG))
13725 return V;
13727 // Try to use byte rotation instructions.
13728 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13729 Subtarget, DAG))
13730 return Rotate;
13732 if (SDValue BitBlend =
13733 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13734 return BitBlend;
13736 // Try to use byte shift instructions to mask.
13737 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13738 Zeroable, Subtarget, DAG))
13739 return V;
13741 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
13742 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
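// For example, with SSE41: Mask = {0, 2, 4, 6, 8, 10, 12, 14} has
// NumEvenDrops == 1; clearing the high word of every dword in both inputs and
// PACKUSDW-ing them yields exactly the even words of V1 followed by those of
// V2.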
13743 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13744 !Subtarget.hasVLX()) {
13745 // Check if this is part of a 256-bit vector truncation.
13746 unsigned PackOpc = 0;
13747 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13748 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13749 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13750 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13751 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13752 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13753 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13754 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13755 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13756 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13757 PackOpc = X86ISD::PACKUS;
13758 } else if (Subtarget.hasSSE41()) {
13759 SmallVector<SDValue, 4> DWordClearOps(4,
13760 DAG.getConstant(0, DL, MVT::i32));
13761 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13762 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
13763 SDValue DWordClearMask =
13764 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
13765 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
13766 DWordClearMask);
13767 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
13768 DWordClearMask);
13769 PackOpc = X86ISD::PACKUS;
13770 } else if (!Subtarget.hasSSSE3()) {
13771 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
13772 V1 = DAG.getBitcast(MVT::v4i32, V1);
13773 V2 = DAG.getBitcast(MVT::v4i32, V2);
13774 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
13775 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
13776 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
13777 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
13778 PackOpc = X86ISD::PACKSS;
13780 if (PackOpc) {
13781 // Now pack things back together.
13782 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
13783 if (NumEvenDrops == 2) {
13784 Result = DAG.getBitcast(MVT::v4i32, Result);
13785 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
13787 return Result;
13791 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
13792 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
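// For example, Mask = {1, 3, 5, 7, 9, 11, 13, 15}: shifting every dword right
// by 16 and packing keeps only the odd (upper) words of each input.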
13793 if (NumOddDrops == 1) {
13794 bool HasSSE41 = Subtarget.hasSSE41();
13795 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
13796 DAG.getBitcast(MVT::v4i32, V1),
13797 DAG.getTargetConstant(16, DL, MVT::i8));
13798 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
13799 DAG.getBitcast(MVT::v4i32, V2),
13800 DAG.getTargetConstant(16, DL, MVT::i8));
13801 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
13802 MVT::v8i16, V1, V2);
13805 // Try to lower by permuting the inputs into an unpack instruction.
13806 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
13807 Mask, Subtarget, DAG))
13808 return Unpack;
13810 // If we can't directly blend but can use PSHUFB, that will be better, as it
13811 // can both shuffle and set up the inputs to the inefficient blend.
13812 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
13813 bool V1InUse, V2InUse;
13814 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
13815 Zeroable, DAG, V1InUse, V2InUse);
13818 // We can always bit-blend if we have to so the fallback strategy is to
13819 // decompose into single-input permutes and blends/unpacks.
13820 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
13821 Mask, Subtarget, DAG);
13824 /// Lower 8-lane 16-bit floating point shuffles.
13825 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13826 const APInt &Zeroable, SDValue V1, SDValue V2,
13827 const X86Subtarget &Subtarget,
13828 SelectionDAG &DAG) {
13829 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
13830 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
13831 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13832 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
13834 if (Subtarget.hasFP16()) {
13835 if (NumV2Elements == 0) {
13836 // Check for being able to broadcast a single element.
13837 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
13838 Mask, Subtarget, DAG))
13839 return Broadcast;
13841 if (NumV2Elements == 1 && Mask[0] >= 8)
13842 if (SDValue V = lowerShuffleAsElementInsertion(
13843 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13844 return V;
13847 V1 = DAG.getBitcast(MVT::v8i16, V1);
13848 V2 = DAG.getBitcast(MVT::v8i16, V2);
13849 return DAG.getBitcast(MVT::v8f16,
13850 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
13853 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
13854 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
13855 // the active subvector is extracted.
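// For example, a v16i8 shuffle on a non-VLX target is widened to v64i8
// (Scale == 4), so a mask index M >= 16 that referred to V2 becomes M + 48,
// which selects the matching element of the second VPERMV3 operand.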
13856 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
13857 ArrayRef<int> Mask, SDValue V1, SDValue V2,
13858 const X86Subtarget &Subtarget,
13859 SelectionDAG &DAG) {
13860 MVT MaskVT = VT.changeTypeToInteger();
13861 SDValue MaskNode;
13862 MVT ShuffleVT = VT;
13863 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
13864 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
13865 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
13866 ShuffleVT = V1.getSimpleValueType();
13868 // Adjust mask to correct indices for the second input.
13869 int NumElts = VT.getVectorNumElements();
13870 unsigned Scale = 512 / VT.getSizeInBits();
13871 SmallVector<int, 32> AdjustedMask(Mask);
13872 for (int &M : AdjustedMask)
13873 if (NumElts <= M)
13874 M += (Scale - 1) * NumElts;
13875 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
13876 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
13877 } else {
13878 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
13881 SDValue Result;
13882 if (V2.isUndef())
13883 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
13884 else
13885 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
13887 if (VT != ShuffleVT)
13888 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
13890 return Result;
13893 /// Generic lowering of v16i8 shuffles.
13895 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
13896 /// detect any complexity reducing interleaving. If that doesn't help, it uses
13897 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
13898 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
13899 /// back together.
13900 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13901 const APInt &Zeroable, SDValue V1, SDValue V2,
13902 const X86Subtarget &Subtarget,
13903 SelectionDAG &DAG) {
13904 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13905 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13906 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13908 // Try to use shift instructions.
13909 if (SDValue Shift =
13910 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
13911 DAG, /*BitwiseOnly*/ false))
13912 return Shift;
13914 // Try to use byte rotation instructions.
13915 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
13916 Subtarget, DAG))
13917 return Rotate;
13919 // Use dedicated pack instructions for masks that match their pattern.
13920 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
13921 Subtarget))
13922 return V;
13924 // Try to use a zext lowering.
13925 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
13926 Zeroable, Subtarget, DAG))
13927 return ZExt;
13929 // Try to lower using a truncation.
13930 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
13931 Subtarget, DAG))
13932 return V;
13934 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
13935 Subtarget, DAG))
13936 return V;
13938 // See if we can use SSE4A Extraction / Insertion.
13939 if (Subtarget.hasSSE4A())
13940 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
13941 Zeroable, DAG))
13942 return V;
13944 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
13946 // For single-input shuffles, there are some nicer lowering tricks we can use.
13947 if (NumV2Elements == 0) {
13948 // Check for being able to broadcast a single element.
13949 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
13950 Mask, Subtarget, DAG))
13951 return Broadcast;
13953 // Try to use bit rotation instructions.
13954 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
13955 Subtarget, DAG))
13956 return Rotate;
13958 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
13959 return V;
13961 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
13962 // Notably, this handles splat and partial-splat shuffles more efficiently.
13963 // However, it only makes sense if the pre-duplication shuffle simplifies
13964 // things significantly. Currently, this means we need to be able to
13965 // express the pre-duplication shuffle as an i16 shuffle.
13967 // FIXME: We should check for other patterns which can be widened into an
13968 // i16 shuffle as well.
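// For illustration, a byte splat such as Mask = {5, 5, ..., 5}: keep word 2
// in place, UNPCKL the vector with itself (so byte 5 fills both bytes of
// word 5), and finish with a v8i16 splat of word 5.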
13969 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
13970 for (int i = 0; i < 16; i += 2)
13971 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
13972 return false;
13974 return true;
13976 auto tryToWidenViaDuplication = [&]() -> SDValue {
13977 if (!canWidenViaDuplication(Mask))
13978 return SDValue();
13979 SmallVector<int, 4> LoInputs;
13980 copy_if(Mask, std::back_inserter(LoInputs),
13981 [](int M) { return M >= 0 && M < 8; });
13982 array_pod_sort(LoInputs.begin(), LoInputs.end());
13983 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
13984 LoInputs.end());
13985 SmallVector<int, 4> HiInputs;
13986 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
13987 array_pod_sort(HiInputs.begin(), HiInputs.end());
13988 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
13989 HiInputs.end());
13991 bool TargetLo = LoInputs.size() >= HiInputs.size();
13992 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
13993 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
13995 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
13996 SmallDenseMap<int, int, 8> LaneMap;
13997 for (int I : InPlaceInputs) {
13998 PreDupI16Shuffle[I/2] = I/2;
13999 LaneMap[I] = I;
14001 int j = TargetLo ? 0 : 4, je = j + 4;
14002 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14003 // Check if j is already a shuffle of this input. This happens when
14004 // there are two adjacent bytes after we move the low one.
14005 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14006 // If we haven't yet mapped the input, search for a slot into which
14007 // we can map it.
14008 while (j < je && PreDupI16Shuffle[j] >= 0)
14009 ++j;
14011 if (j == je)
14012 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14013 return SDValue();
14015 // Map this input with the i16 shuffle.
14016 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14019 // Update the lane map based on the mapping we ended up with.
14020 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14022 V1 = DAG.getBitcast(
14023 MVT::v16i8,
14024 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14025 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14027 // Unpack the bytes to form the i16s that will be shuffled into place.
14028 bool EvenInUse = false, OddInUse = false;
14029 for (int i = 0; i < 16; i += 2) {
14030 EvenInUse |= (Mask[i + 0] >= 0);
14031 OddInUse |= (Mask[i + 1] >= 0);
14032 if (EvenInUse && OddInUse)
14033 break;
14035 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14036 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14037 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14039 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14040 for (int i = 0; i < 16; ++i)
14041 if (Mask[i] >= 0) {
14042 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14043 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14044 if (PostDupI16Shuffle[i / 2] < 0)
14045 PostDupI16Shuffle[i / 2] = MappedMask;
14046 else
14047 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14048 "Conflicting entries in the original shuffle!");
14050 return DAG.getBitcast(
14051 MVT::v16i8,
14052 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14053 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14055 if (SDValue V = tryToWidenViaDuplication())
14056 return V;
14059 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14060 Zeroable, Subtarget, DAG))
14061 return Masked;
14063 // Use dedicated unpack instructions for masks that match their pattern.
14064 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14065 return V;
14067 // Try to use byte shift instructions to mask.
14068 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14069 Zeroable, Subtarget, DAG))
14070 return V;
14072 // Check for compaction patterns.
14073 bool IsSingleInput = V2.isUndef();
14074 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14076 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14077 // with PSHUFB. It is important to do this before we attempt to generate any
14078 // blends but after all of the single-input lowerings. If the single input
14079 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14080 // want to preserve that and we can DAG combine any longer sequences into
14081 // a PSHUFB in the end. But once we start blending from multiple inputs,
14082 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14083 // and there are *very* few patterns that would actually be faster than the
14084 // PSHUFB approach because of its ability to zero lanes.
14086 // If the mask is a binary compaction, we can more efficiently perform this
14087 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14089 // FIXME: The only exceptions to the above are blends which are exact
14090 // interleavings with direct instructions supporting them. We currently don't
14091 // handle those well here.
14092 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14093 bool V1InUse = false;
14094 bool V2InUse = false;
14096 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14097 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14099 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14100 // do so. This avoids using them to handle blends-with-zero which is
14101 // important as a single pshufb is significantly faster for that.
14102 if (V1InUse && V2InUse) {
14103 if (Subtarget.hasSSE41())
14104 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14105 Zeroable, Subtarget, DAG))
14106 return Blend;
14108 // We can use an unpack to do the blending rather than an or in some
14109 // cases. Even though the or may be (very slightly) more efficient, we
14110 // prefer this lowering because there are common cases where part of
14111 // the complexity of the shuffles goes away when we do the final blend as
14112 // an unpack.
14113 // FIXME: It might be worth trying to detect if the unpack-feeding
14114 // shuffles will both be pshufb, in which case we shouldn't bother with
14115 // this.
14116 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14117 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14118 return Unpack;
14120 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14121 if (Subtarget.hasVBMI())
14122 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14123 DAG);
14125 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14126 if (Subtarget.hasXOP()) {
14127 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14128 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14131 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14132 // PALIGNR will be cheaper than the second PSHUFB+OR.
14133 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14134 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14135 return V;
14138 return PSHUFB;
14141 // There are special ways we can lower some single-element blends.
14142 if (NumV2Elements == 1)
14143 if (SDValue V = lowerShuffleAsElementInsertion(
14144 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14145 return V;
14147 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14148 return Blend;
14150 // Check whether a compaction lowering can be done. This handles shuffles
14151 // which take every Nth element for some even N. See the helper function for
14152 // details.
14154 // We special case these as they can be particularly efficiently handled with
14155 the PACKUSWB instruction on x86, and they show up in common patterns of
14156 // rearranging bytes to truncate wide elements.
14157 if (NumEvenDrops) {
14158 // NumEvenDrops is the power of two stride of the elements. Another way of
14159 // thinking about it is that we need to drop the even elements this many
14160 // times to get the original input.
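// For instance, NumEvenDrops == 1 keeps every 2nd byte, == 2 keeps every 4th
// byte, and == 3 keeps every 8th byte (the assert below caps this at 3 for a
// 16-byte vector).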
14162 // First we need to zero all the dropped bytes.
14163 assert(NumEvenDrops <= 3 &&
14164 "No support for dropping even elements more than 3 times.");
14165 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14166 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14167 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14168 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14169 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14170 WordClearMask);
14171 if (!IsSingleInput)
14172 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14173 WordClearMask);
14175 // Now pack things back together.
14176 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14177 IsSingleInput ? V1 : V2);
14178 for (int i = 1; i < NumEvenDrops; ++i) {
14179 Result = DAG.getBitcast(MVT::v8i16, Result);
14180 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14182 return Result;
14185 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14186 if (NumOddDrops == 1) {
14187 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14188 DAG.getBitcast(MVT::v8i16, V1),
14189 DAG.getTargetConstant(8, DL, MVT::i8));
14190 if (!IsSingleInput)
14191 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14192 DAG.getBitcast(MVT::v8i16, V2),
14193 DAG.getTargetConstant(8, DL, MVT::i8));
14194 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14195 IsSingleInput ? V1 : V2);
14198 // Handle multi-input cases by blending/unpacking single-input shuffles.
14199 if (NumV2Elements > 0)
14200 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14201 Subtarget, DAG);
14203 // The fallback path for single-input shuffles widens this into two v8i16
14204 // vectors with unpacks, shuffles those, and then pulls them back together
14205 // with a pack.
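// A sketch of the sequence this fallback builds (illustrative only):
//   lo  = UNPCKL V, zero          ; zero-extend the low 8 bytes to v8i16
//   hi  = UNPCKH V, zero          ; zero-extend the high 8 bytes to v8i16
//   lo' = shuffle v8i16 lo, hi, LoBlendMask
//   hi' = shuffle v8i16 lo, hi, HiBlendMask
//   r   = PACKUS v16i8 lo', hi'   ; squash the i16s back down to bytes
// (when no odd source bytes are referenced, a single AND replaces the unpacks).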
14206 SDValue V = V1;
14208 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14209 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14210 for (int i = 0; i < 16; ++i)
14211 if (Mask[i] >= 0)
14212 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14214 SDValue VLoHalf, VHiHalf;
14215 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14216 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14217 // i16s.
14218 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14219 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14220 // Use a mask to drop the high bytes.
14221 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14222 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14223 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14225 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14226 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14228 // Squash the masks to point directly into VLoHalf.
14229 for (int &M : LoBlendMask)
14230 if (M >= 0)
14231 M /= 2;
14232 for (int &M : HiBlendMask)
14233 if (M >= 0)
14234 M /= 2;
14235 } else {
14236 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14237 // VHiHalf so that we can blend them as i16s.
14238 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14240 VLoHalf = DAG.getBitcast(
14241 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14242 VHiHalf = DAG.getBitcast(
14243 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14246 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14247 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14249 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14252 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
14254 /// This routine breaks down the specific type of 128-bit shuffle and
14255 /// dispatches to the lowering routines accordingly.
14256 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14257 MVT VT, SDValue V1, SDValue V2,
14258 const APInt &Zeroable,
14259 const X86Subtarget &Subtarget,
14260 SelectionDAG &DAG) {
14261 switch (VT.SimpleTy) {
14262 case MVT::v2i64:
14263 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14264 case MVT::v2f64:
14265 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14266 case MVT::v4i32:
14267 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14268 case MVT::v4f32:
14269 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14270 case MVT::v8i16:
14271 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14272 case MVT::v8f16:
14273 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14274 case MVT::v16i8:
14275 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14277 default:
14278 llvm_unreachable("Unimplemented!");
14282 /// Generic routine to split vector shuffle into half-sized shuffles.
14284 /// This routine just extracts two subvectors, shuffles them independently, and
14285 /// then concatenates them back together. This should work effectively with all
14286 /// AVX vector shuffle types.
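/// For example, a 256-bit v8i32 shuffle is handled as two independent v4i32
/// shuffles built from the four half-width pieces LoV1/HiV1/LoV2/HiV2 (at most
/// three narrow shuffles per half), re-joined with a CONCAT_VECTORS.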
14287 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14288 SDValue V2, ArrayRef<int> Mask,
14289 SelectionDAG &DAG, bool SimpleOnly) {
14290 assert(VT.getSizeInBits() >= 256 &&
14291 "Only for 256-bit or wider vector shuffles!");
14292 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14293 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14295 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14296 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14298 int NumElements = VT.getVectorNumElements();
14299 int SplitNumElements = NumElements / 2;
14300 MVT ScalarVT = VT.getVectorElementType();
14301 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14303 // Use splitVector/extractSubVector so that split build-vectors just build two
14304 // narrower build vectors. This helps shuffling with splats and zeros.
14305 auto SplitVector = [&](SDValue V) {
14306 SDValue LoV, HiV;
14307 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14308 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14309 DAG.getBitcast(SplitVT, HiV));
14312 SDValue LoV1, HiV1, LoV2, HiV2;
14313 std::tie(LoV1, HiV1) = SplitVector(V1);
14314 std::tie(LoV2, HiV2) = SplitVector(V2);
14316 // Now create two 4-way blends of these half-width vectors.
14317 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14318 bool &UseHiV1, bool &UseLoV2,
14319 bool &UseHiV2) {
14320 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14321 for (int i = 0; i < SplitNumElements; ++i) {
14322 int M = HalfMask[i];
14323 if (M >= NumElements) {
14324 if (M >= NumElements + SplitNumElements)
14325 UseHiV2 = true;
14326 else
14327 UseLoV2 = true;
14328 } else if (M >= 0) {
14329 if (M >= SplitNumElements)
14330 UseHiV1 = true;
14331 else
14332 UseLoV1 = true;
14337 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14338 if (!SimpleOnly)
14339 return true;
14341 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14342 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14344 return !(UseHiV1 || UseHiV2);
14347 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14348 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14349 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14350 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14351 for (int i = 0; i < SplitNumElements; ++i) {
14352 int M = HalfMask[i];
14353 if (M >= NumElements) {
14354 V2BlendMask[i] = M - NumElements;
14355 BlendMask[i] = SplitNumElements + i;
14356 } else if (M >= 0) {
14357 V1BlendMask[i] = M;
14358 BlendMask[i] = i;
14362 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14363 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14365 // Because the lowering happens after all combining takes place, we need to
14366 // manually combine these blend masks as much as possible so that we create
14367 // a minimal number of high-level vector shuffle nodes.
14368 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14370 // First try just blending the halves of V1 or V2.
14371 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14372 return DAG.getUNDEF(SplitVT);
14373 if (!UseLoV2 && !UseHiV2)
14374 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14375 if (!UseLoV1 && !UseHiV1)
14376 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14378 SDValue V1Blend, V2Blend;
14379 if (UseLoV1 && UseHiV1) {
14380 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14381 } else {
14382 // We only use half of V1 so map the usage down into the final blend mask.
14383 V1Blend = UseLoV1 ? LoV1 : HiV1;
14384 for (int i = 0; i < SplitNumElements; ++i)
14385 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14386 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14388 if (UseLoV2 && UseHiV2) {
14389 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14390 } else {
14391 // We only use half of V2 so map the usage down into the final blend mask.
14392 V2Blend = UseLoV2 ? LoV2 : HiV2;
14393 for (int i = 0; i < SplitNumElements; ++i)
14394 if (BlendMask[i] >= SplitNumElements)
14395 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14397 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14400 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14401 return SDValue();
14403 SDValue Lo = HalfBlend(LoMask);
14404 SDValue Hi = HalfBlend(HiMask);
14405 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14408 /// Either split a vector in halves or decompose the shuffles and the
14409 /// blend/unpack.
14411 /// This is provided as a good fallback for many lowerings of non-single-input
14412 /// shuffles with more than one 128-bit lane. In those cases, we want to select
14413 /// between splitting the shuffle into 128-bit components and stitching those
14414 /// back together vs. extracting the single-input shuffles and blending those
14415 /// results.
14416 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14417 SDValue V2, ArrayRef<int> Mask,
14418 const X86Subtarget &Subtarget,
14419 SelectionDAG &DAG) {
14420 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14421 "shuffles as it could then recurse on itself.");
14422 int Size = Mask.size();
14424 // If this can be modeled as a broadcast of two elements followed by a blend,
14425 // prefer that lowering. This is especially important because broadcasts can
14426 // often fold with memory operands.
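// e.g. the v4f64 mask <2,6,2,6> decomposes into a broadcast of V1[2], a
// broadcast of V2[2] and a blend, where each broadcast can potentially fold a
// load.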
14427 auto DoBothBroadcast = [&] {
14428 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14429 for (int M : Mask)
14430 if (M >= Size) {
14431 if (V2BroadcastIdx < 0)
14432 V2BroadcastIdx = M - Size;
14433 else if (M - Size != V2BroadcastIdx)
14434 return false;
14435 } else if (M >= 0) {
14436 if (V1BroadcastIdx < 0)
14437 V1BroadcastIdx = M;
14438 else if (M != V1BroadcastIdx)
14439 return false;
14441 return true;
14443 if (DoBothBroadcast())
14444 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14445 DAG);
14447 // If the inputs all stem from a single 128-bit lane of each input, then we
14448 // split them rather than blending because the split will decompose to
14449 // unusually few instructions.
14450 int LaneCount = VT.getSizeInBits() / 128;
14451 int LaneSize = Size / LaneCount;
14452 SmallBitVector LaneInputs[2];
14453 LaneInputs[0].resize(LaneCount, false);
14454 LaneInputs[1].resize(LaneCount, false);
14455 for (int i = 0; i < Size; ++i)
14456 if (Mask[i] >= 0)
14457 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14458 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14459 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14460 /*SimpleOnly*/ false);
14462 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14463 // requires that the decomposed single-input shuffles don't end up here.
14464 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14465 DAG);
14468 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14469 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
14470 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14471 SDValue V1, SDValue V2,
14472 ArrayRef<int> Mask,
14473 SelectionDAG &DAG) {
14474 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14476 int LHSMask[4] = {-1, -1, -1, -1};
14477 int RHSMask[4] = {-1, -1, -1, -1};
14478 unsigned SHUFPMask = 0;
14480 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14481 // perform the shuffle once the lanes have been shuffled in place.
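// e.g. for the v4f64 mask <2,6,1,5> this builds LHSMask = <2,u,u,1> and
// RHSMask = <6,u,u,5> (each of which only needs to move whole 128-bit lanes of
// V1/V2) with SHUFPMask = 0b1100, so the final SHUFPD selects elements
// 2, 6, 1 and 5 as required.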
14482 for (int i = 0; i != 4; ++i) {
14483 int M = Mask[i];
14484 if (M < 0)
14485 continue;
14486 int LaneBase = i & ~1;
14487 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14488 LaneMask[LaneBase + (M & 1)] = M;
14489 SHUFPMask |= (M & 1) << i;
14492 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14493 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14494 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14495 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14498 /// Lower a vector shuffle crossing multiple 128-bit lanes as
14499 /// a lane permutation followed by a per-lane permutation.
14501 /// This is mainly for cases where we can have non-repeating permutes
14502 /// in each lane.
14504 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14505 /// we should investigate merging them.
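/// e.g. the single-input v8i32 reversal <7,6,5,4,3,2,1,0> becomes the
/// cross-lane permute <4,5,6,7,0,1,2,3> (swapping the 128-bit lanes) followed
/// by the in-lane permute <3,2,1,0,7,6,5,4>.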
14506 static SDValue lowerShuffleAsLanePermuteAndPermute(
14507 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14508 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14509 int NumElts = VT.getVectorNumElements();
14510 int NumLanes = VT.getSizeInBits() / 128;
14511 int NumEltsPerLane = NumElts / NumLanes;
14512 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14514 /// Attempts to find a sublane permute with the given size
14515 /// that gets all elements into their target lanes.
14517 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
14518 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
14519 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14520 int NumSublanesPerLane = NumSublanes / NumLanes;
14521 int NumEltsPerSublane = NumElts / NumSublanes;
14523 SmallVector<int, 16> CrossLaneMask;
14524 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14525 // CrossLaneMask but one entry == one sublane.
14526 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14528 for (int i = 0; i != NumElts; ++i) {
14529 int M = Mask[i];
14530 if (M < 0)
14531 continue;
14533 int SrcSublane = M / NumEltsPerSublane;
14534 int DstLane = i / NumEltsPerLane;
14536 // We only need to get the elements into the right lane, not sublane.
14537 // So search all sublanes that make up the destination lane.
14538 bool Found = false;
14539 int DstSubStart = DstLane * NumSublanesPerLane;
14540 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14541 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14542 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14543 continue;
14545 Found = true;
14546 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14547 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14548 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14549 break;
14551 if (!Found)
14552 return SDValue();
14555 // Fill CrossLaneMask using CrossLaneMaskLarge.
14556 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14558 if (!CanUseSublanes) {
14559 // If we're only shuffling a single lowest lane and the rest are identity
14560 // then don't bother.
14561 // TODO - isShuffleMaskInputInPlace could be extended to something like
14562 // this.
14563 int NumIdentityLanes = 0;
14564 bool OnlyShuffleLowestLane = true;
14565 for (int i = 0; i != NumLanes; ++i) {
14566 int LaneOffset = i * NumEltsPerLane;
14567 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14568 i * NumEltsPerLane))
14569 NumIdentityLanes++;
14570 else if (CrossLaneMask[LaneOffset] != 0)
14571 OnlyShuffleLowestLane = false;
14573 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14574 return SDValue();
14577 // Avoid returning the same shuffle operation. For example,
14578 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14579 // undef:v16i16
14580 if (CrossLaneMask == Mask || InLaneMask == Mask)
14581 return SDValue();
14583 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14584 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14585 InLaneMask);
14588 // First attempt a solution with full lanes.
14589 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14590 return V;
14592 // The rest of the solutions use sublanes.
14593 if (!CanUseSublanes)
14594 return SDValue();
14596 // Then attempt a solution with 64-bit sublanes (vpermq).
14597 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14598 return V;
14600 // If that doesn't work and we have fast variable cross-lane shuffle,
14601 // attempt 32-bit sublanes (vpermd).
14602 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14603 return SDValue();
14605 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14608 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
14609 static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14610 SmallVector<int> &InLaneMask) {
14611 int Size = Mask.size();
14612 InLaneMask.assign(Mask.begin(), Mask.end());
14613 for (int i = 0; i < Size; ++i) {
14614 int &M = InLaneMask[i];
14615 if (M < 0)
14616 continue;
14617 if (((M % Size) / LaneSize) != (i / LaneSize))
14618 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14622 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14623 /// source with a lane permutation.
14625 /// This lowering strategy results in four instructions in the worst case for a
14626 /// single-input cross-lane shuffle, which is fewer than any other fully general
14627 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14628 /// shuffle pattern should be handled prior to trying this lowering.
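/// e.g. for a single-input v8f32 mask in which every element crosses a 128-bit
/// lane boundary, V1's lanes are first swapped with a <2,3,0,1> shuffle of the
/// bitcast v4f64/v4i64 value, and the remaining work is a purely in-lane
/// shuffle of V1 and the flipped copy.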
14629 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14630 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14631 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14632 // FIXME: This should probably be generalized for 512-bit vectors as well.
14633 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14634 int Size = Mask.size();
14635 int LaneSize = Size / 2;
14637 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14638 // Only do this if the elements aren't all from the lower lane,
14639 // otherwise we're (probably) better off doing a split.
14640 if (VT == MVT::v4f64 &&
14641 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14642 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14644 // If there are only inputs from one 128-bit lane, splitting will in fact be
14645 // less expensive. The flags track whether the given lane contains an element
14646 // that crosses to another lane.
14647 bool AllLanes;
14648 if (!Subtarget.hasAVX2()) {
14649 bool LaneCrossing[2] = {false, false};
14650 for (int i = 0; i < Size; ++i)
14651 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14652 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14653 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14654 } else {
14655 bool LaneUsed[2] = {false, false};
14656 for (int i = 0; i < Size; ++i)
14657 if (Mask[i] >= 0)
14658 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14659 AllLanes = LaneUsed[0] && LaneUsed[1];
14662 // TODO - we could support shuffling V2 in the Flipped input.
14663 assert(V2.isUndef() &&
14664 "This last part of this routine only works on single input shuffles");
14666 SmallVector<int> InLaneMask;
14667 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14669 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14670 "In-lane shuffle mask expected");
14672 // If the mask doesn't use (or, pre-AVX2, cross out of) both 128-bit lanes and
14673 // the in-lane mask is not repeating, then we're better off splitting.
14674 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14675 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14676 /*SimpleOnly*/ false);
14678 // Flip the lanes, and shuffle the results which should now be in-lane.
14679 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14680 SDValue Flipped = DAG.getBitcast(PVT, V1);
14681 Flipped =
14682 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14683 Flipped = DAG.getBitcast(VT, Flipped);
14684 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14687 /// Handle lowering 2-lane 128-bit shuffles.
14688 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14689 SDValue V2, ArrayRef<int> Mask,
14690 const APInt &Zeroable,
14691 const X86Subtarget &Subtarget,
14692 SelectionDAG &DAG) {
14693 if (V2.isUndef()) {
14694 // Attempt to match VBROADCAST*128 subvector broadcast load.
14695 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14696 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14697 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14698 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14699 MVT MemVT = VT.getHalfNumVectorElementsVT();
14700 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14701 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14702 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14703 VT, MemVT, Ld, Ofs, DAG))
14704 return BcstLd;
14707 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14708 if (Subtarget.hasAVX2())
14709 return SDValue();
14712 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14714 SmallVector<int, 4> WidenedMask;
14715 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14716 return SDValue();
14718 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14719 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14721 // Try to use an insert into a zero vector.
14722 if (WidenedMask[0] == 0 && IsHighZero) {
14723 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14724 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14725 DAG.getIntPtrConstant(0, DL));
14726 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14727 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14728 DAG.getIntPtrConstant(0, DL));
14731 // TODO: If minimizing size and one of the inputs is a zero vector and the
14732 // zero vector has only one use, we could use a VPERM2X128 to save the
14733 // instruction bytes needed to explicitly generate the zero vector.
14735 // Blends are faster and handle all the non-lane-crossing cases.
14736 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14737 Subtarget, DAG))
14738 return Blend;
14740 // If either input operand is a zero vector, use VPERM2X128 because its mask
14741 // allows us to replace the zero input with an implicit zero.
14742 if (!IsLowZero && !IsHighZero) {
14743 // Check for patterns which can be matched with a single insert of a 128-bit
14744 // subvector.
14745 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14746 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14748 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14749 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14750 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14751 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14752 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14753 OnlyUsesV1 ? V1 : V2,
14754 DAG.getIntPtrConstant(0, DL));
14755 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14756 DAG.getIntPtrConstant(2, DL));
14760 // Try to use SHUF128 if possible.
14761 if (Subtarget.hasVLX()) {
14762 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14763 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14764 ((WidenedMask[1] % 2) << 1);
14765 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14766 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14771 // Otherwise form a 128-bit permutation. After accounting for undefs,
14772 // convert the 64-bit shuffle mask selection values into 128-bit
14773 // selection bits by dividing the indexes by 2 and shifting into positions
14774 // defined by a vperm2*128 instruction's immediate control byte.
14776 // The immediate permute control byte looks like this:
14777 // [1:0] - select 128 bits from sources for low half of destination
14778 // [2] - ignore
14779 // [3] - zero low half of destination
14780 // [5:4] - select 128 bits from sources for high half of destination
14781 // [6] - ignore
14782 // [7] - zero high half of destination
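// e.g. the v4f64 mask <2,3,6,7> widens to <1,3>, giving PermMask =
// (1 << 0) | (3 << 4) = 0x31: the high half of V1 feeds the low half of the
// result and the high half of V2 feeds the high half.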
14784 assert((WidenedMask[0] >= 0 || IsLowZero) &&
14785 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
14787 unsigned PermMask = 0;
14788 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
14789 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
14791 // Check the immediate mask and replace unused sources with undef.
14792 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
14793 V1 = DAG.getUNDEF(VT);
14794 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
14795 V2 = DAG.getUNDEF(VT);
14797 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
14798 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14801 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
14802 /// shuffling each lane.
14804 /// This attempts to create a repeated lane shuffle where each lane uses one
14805 /// or two of the lanes of the inputs. The lanes of the input vectors are
14806 /// shuffled in one or two independent shuffles to get the lanes into the
14807 /// position needed by the final shuffle.
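/// e.g. for the two-input v8f32 mask <4,12,6,14,0,8,2,10> both lanes repeat
/// the per-lane pattern <0,8,2,10>, so V1/V2 are first lane-permuted into
/// <4,5,6,7,0,1,2,3> and <12,13,14,15,8,9,10,11> respectively, and the final
/// shuffle applies the repeated mask <0,8,2,10,4,12,6,14> to those results.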
14808 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
14809 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14810 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14811 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14813 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
14814 return SDValue();
14816 int NumElts = Mask.size();
14817 int NumLanes = VT.getSizeInBits() / 128;
14818 int NumLaneElts = 128 / VT.getScalarSizeInBits();
14819 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
14820 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
14822 // First pass will try to fill in the RepeatMask from lanes that need two
14823 // sources.
14824 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14825 int Srcs[2] = {-1, -1};
14826 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
14827 for (int i = 0; i != NumLaneElts; ++i) {
14828 int M = Mask[(Lane * NumLaneElts) + i];
14829 if (M < 0)
14830 continue;
14831 // Determine which of the possible input lanes (NumLanes from each source)
14832 // this element comes from. Assign that as one of the sources for this
14833 // lane. We can assign up to 2 sources for this lane. If we run out of
14834 // sources we can't do anything.
14835 int LaneSrc = M / NumLaneElts;
14836 int Src;
14837 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
14838 Src = 0;
14839 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
14840 Src = 1;
14841 else
14842 return SDValue();
14844 Srcs[Src] = LaneSrc;
14845 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
14848 // If this lane has two sources, see if it fits with the repeat mask so far.
14849 if (Srcs[1] < 0)
14850 continue;
14852 LaneSrcs[Lane][0] = Srcs[0];
14853 LaneSrcs[Lane][1] = Srcs[1];
14855 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
14856 assert(M1.size() == M2.size() && "Unexpected mask size");
14857 for (int i = 0, e = M1.size(); i != e; ++i)
14858 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
14859 return false;
14860 return true;
14863 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
14864 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
14865 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
14866 int M = Mask[i];
14867 if (M < 0)
14868 continue;
14869 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
14870 "Unexpected mask element");
14871 MergedMask[i] = M;
14875 if (MatchMasks(InLaneMask, RepeatMask)) {
14876 // Merge this lane mask into the final repeat mask.
14877 MergeMasks(InLaneMask, RepeatMask);
14878 continue;
14881 // Didn't find a match. Swap the operands and try again.
14882 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
14883 ShuffleVectorSDNode::commuteMask(InLaneMask);
14885 if (MatchMasks(InLaneMask, RepeatMask)) {
14886 // Merge this lane mask into the final repeat mask.
14887 MergeMasks(InLaneMask, RepeatMask);
14888 continue;
14891 // Couldn't find a match with the operands in either order.
14892 return SDValue();
14895 // Now handle any lanes with only one source.
14896 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14897 // If this lane has already been processed, skip it.
14898 if (LaneSrcs[Lane][0] >= 0)
14899 continue;
14901 for (int i = 0; i != NumLaneElts; ++i) {
14902 int M = Mask[(Lane * NumLaneElts) + i];
14903 if (M < 0)
14904 continue;
14906 // If RepeatMask isn't defined yet we can define it ourselves.
14907 if (RepeatMask[i] < 0)
14908 RepeatMask[i] = M % NumLaneElts;
14910 if (RepeatMask[i] < NumElts) {
14911 if (RepeatMask[i] != M % NumLaneElts)
14912 return SDValue();
14913 LaneSrcs[Lane][0] = M / NumLaneElts;
14914 } else {
14915 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
14916 return SDValue();
14917 LaneSrcs[Lane][1] = M / NumLaneElts;
14921 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
14922 return SDValue();
14925 SmallVector<int, 16> NewMask(NumElts, -1);
14926 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14927 int Src = LaneSrcs[Lane][0];
14928 for (int i = 0; i != NumLaneElts; ++i) {
14929 int M = -1;
14930 if (Src >= 0)
14931 M = Src * NumLaneElts + i;
14932 NewMask[Lane * NumLaneElts + i] = M;
14935 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14936 // Ensure we didn't get back the shuffle we started with.
14937 // FIXME: This is a hack to make up for some splat handling code in
14938 // getVectorShuffle.
14939 if (isa<ShuffleVectorSDNode>(NewV1) &&
14940 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
14941 return SDValue();
14943 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14944 int Src = LaneSrcs[Lane][1];
14945 for (int i = 0; i != NumLaneElts; ++i) {
14946 int M = -1;
14947 if (Src >= 0)
14948 M = Src * NumLaneElts + i;
14949 NewMask[Lane * NumLaneElts + i] = M;
14952 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14953 // Ensure we didn't get back the shuffle we started with.
14954 // FIXME: This is a hack to make up for some splat handling code in
14955 // getVectorShuffle.
14956 if (isa<ShuffleVectorSDNode>(NewV2) &&
14957 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
14958 return SDValue();
14960 for (int i = 0; i != NumElts; ++i) {
14961 if (Mask[i] < 0) {
14962 NewMask[i] = -1;
14963 continue;
14965 NewMask[i] = RepeatMask[i % NumLaneElts];
14966 if (NewMask[i] < 0)
14967 continue;
14969 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
14971 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
14974 /// If the input shuffle mask results in a vector that is undefined in all upper
14975 /// or lower half elements and that mask accesses only 2 halves of the
14976 /// shuffle's operands, return true. A mask of half the width with mask indexes
14977 /// adjusted to access the extracted halves of the original shuffle operands is
14978 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of which
14979 /// input operand is accessed: 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2.
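/// e.g. the v8i32 mask <u,u,u,u,0,1,8,9> (lower half undef) produces
/// HalfMask = <0,1,4,5> with HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 2
/// (lower half of V2).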
14980 static bool
14981 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
14982 int &HalfIdx1, int &HalfIdx2) {
14983 assert((Mask.size() == HalfMask.size() * 2) &&
14984 "Expected input mask to be twice as long as output");
14986 // Exactly one half of the result must be undef to allow narrowing.
14987 bool UndefLower = isUndefLowerHalf(Mask);
14988 bool UndefUpper = isUndefUpperHalf(Mask);
14989 if (UndefLower == UndefUpper)
14990 return false;
14992 unsigned HalfNumElts = HalfMask.size();
14993 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
14994 HalfIdx1 = -1;
14995 HalfIdx2 = -1;
14996 for (unsigned i = 0; i != HalfNumElts; ++i) {
14997 int M = Mask[i + MaskIndexOffset];
14998 if (M < 0) {
14999 HalfMask[i] = M;
15000 continue;
15003 // Determine which of the 4 half vectors this element is from.
15004 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15005 int HalfIdx = M / HalfNumElts;
15007 // Determine the element index into its half vector source.
15008 int HalfElt = M % HalfNumElts;
15010 // We can shuffle with up to 2 half vectors; set the new 'half'
15011 // shuffle mask accordingly.
15012 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15013 HalfMask[i] = HalfElt;
15014 HalfIdx1 = HalfIdx;
15015 continue;
15017 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15018 HalfMask[i] = HalfElt + HalfNumElts;
15019 HalfIdx2 = HalfIdx;
15020 continue;
15023 // Too many half vectors referenced.
15024 return false;
15027 return true;
15030 /// Given the output values from getHalfShuffleMask(), create a half width
15031 /// shuffle of extracted vectors followed by an insert back to full width.
15032 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15033 ArrayRef<int> HalfMask, int HalfIdx1,
15034 int HalfIdx2, bool UndefLower,
15035 SelectionDAG &DAG, bool UseConcat = false) {
15036 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15037 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15039 MVT VT = V1.getSimpleValueType();
15040 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15041 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15043 auto getHalfVector = [&](int HalfIdx) {
15044 if (HalfIdx < 0)
15045 return DAG.getUNDEF(HalfVT);
15046 SDValue V = (HalfIdx < 2 ? V1 : V2);
15047 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15048 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15049 DAG.getIntPtrConstant(HalfIdx, DL));
15052 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15053 SDValue Half1 = getHalfVector(HalfIdx1);
15054 SDValue Half2 = getHalfVector(HalfIdx2);
15055 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15056 if (UseConcat) {
15057 SDValue Op0 = V;
15058 SDValue Op1 = DAG.getUNDEF(HalfVT);
15059 if (UndefLower)
15060 std::swap(Op0, Op1);
15061 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15064 unsigned Offset = UndefLower ? HalfNumElts : 0;
15065 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15066 DAG.getIntPtrConstant(Offset, DL));
15069 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15070 /// This allows for fast cases such as subvector extraction/insertion
15071 /// or shuffling smaller vector types which can lower more efficiently.
15072 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15073 SDValue V2, ArrayRef<int> Mask,
15074 const X86Subtarget &Subtarget,
15075 SelectionDAG &DAG) {
15076 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15077 "Expected 256-bit or 512-bit vector");
15079 bool UndefLower = isUndefLowerHalf(Mask);
15080 if (!UndefLower && !isUndefUpperHalf(Mask))
15081 return SDValue();
15083 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15084 "Completely undef shuffle mask should have been simplified already");
15086 // Upper half is undef and lower half is whole upper subvector.
15087 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15088 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15089 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15090 if (!UndefLower &&
15091 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15092 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15093 DAG.getIntPtrConstant(HalfNumElts, DL));
15094 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15095 DAG.getIntPtrConstant(0, DL));
15098 // Lower half is undef and upper half is whole lower subvector.
15099 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15100 if (UndefLower &&
15101 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15102 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15103 DAG.getIntPtrConstant(0, DL));
15104 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15105 DAG.getIntPtrConstant(HalfNumElts, DL));
15108 int HalfIdx1, HalfIdx2;
15109 SmallVector<int, 8> HalfMask(HalfNumElts);
15110 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15111 return SDValue();
15113 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15115 // Only shuffle the halves of the inputs when useful.
15116 unsigned NumLowerHalves =
15117 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15118 unsigned NumUpperHalves =
15119 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15120 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15122 // Determine the larger pattern of undef/halves, then decide if it's worth
15123 // splitting the shuffle based on subtarget capabilities and types.
15124 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15125 if (!UndefLower) {
15126 // XXXXuuuu: no insert is needed.
15127 // Always extract lowers when setting lower - these are all free subreg ops.
15128 if (NumUpperHalves == 0)
15129 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15130 UndefLower, DAG);
15132 if (NumUpperHalves == 1) {
15133 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15134 if (Subtarget.hasAVX2()) {
15135 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15136 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15137 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15138 (!isSingleSHUFPSMask(HalfMask) ||
15139 Subtarget.hasFastVariableCrossLaneShuffle()))
15140 return SDValue();
15141 // If this is a unary shuffle (assume that the 2nd operand is
15142 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15143 // are better off extracting the upper half of 1 operand and using a
15144 // narrow shuffle.
15145 if (EltWidth == 64 && V2.isUndef())
15146 return SDValue();
15148 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15149 if (Subtarget.hasAVX512() && VT.is512BitVector())
15150 return SDValue();
15151 // Extract + narrow shuffle is better than the wide alternative.
15152 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15153 UndefLower, DAG);
15156 // Don't extract both uppers; instead shuffle and then extract.
15157 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15158 return SDValue();
15161 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15162 if (NumUpperHalves == 0) {
15163 // AVX2 has efficient 64-bit element cross-lane shuffles.
15164 // TODO: Refine to account for unary shuffle, splat, and other masks?
15165 if (Subtarget.hasAVX2() && EltWidth == 64)
15166 return SDValue();
15167 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15168 if (Subtarget.hasAVX512() && VT.is512BitVector())
15169 return SDValue();
15170 // Narrow shuffle + insert is better than the wide alternative.
15171 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15172 UndefLower, DAG);
15175 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15176 return SDValue();
15179 /// Handle case where shuffle sources are coming from the same 128-bit lane and
15180 /// every lane can be represented as the same repeating mask - allowing us to
15181 /// shuffle the sources with the repeating shuffle and then permute the result
15182 /// to the destination lanes.
15183 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15184 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15185 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15186 int NumElts = VT.getVectorNumElements();
15187 int NumLanes = VT.getSizeInBits() / 128;
15188 int NumLaneElts = NumElts / NumLanes;
15190 // On AVX2 we may be able to just shuffle the lowest elements and then
15191 // broadcast the result.
15192 if (Subtarget.hasAVX2()) {
15193 for (unsigned BroadcastSize : {16, 32, 64}) {
15194 if (BroadcastSize <= VT.getScalarSizeInBits())
15195 continue;
15196 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15198 // Attempt to match a repeating pattern every NumBroadcastElts,
15199 // accounting for UNDEFs but only references the lowest 128-bit
15200 // lane of the inputs.
15201 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15202 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15203 for (int j = 0; j != NumBroadcastElts; ++j) {
15204 int M = Mask[i + j];
15205 if (M < 0)
15206 continue;
15207 int &R = RepeatMask[j];
15208 if (0 != ((M % NumElts) / NumLaneElts))
15209 return false;
15210 if (0 <= R && R != M)
15211 return false;
15212 R = M;
15214 return true;
15217 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15218 if (!FindRepeatingBroadcastMask(RepeatMask))
15219 continue;
15221 // Shuffle the (lowest) repeated elements in place for broadcast.
15222 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15224 // Shuffle the actual broadcast.
15225 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15226 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15227 for (int j = 0; j != NumBroadcastElts; ++j)
15228 BroadcastMask[i + j] = j;
15230 // Avoid returning the same shuffle operation. For example,
15231 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15232 if (BroadcastMask == Mask)
15233 return SDValue();
15235 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15236 BroadcastMask);
15240 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15241 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15242 return SDValue();
15244 // Bail if we already have a repeated lane shuffle mask.
15245 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15246 return SDValue();
15248 // Helper to look for a repeated mask in each split sublane, and to check that
15249 // those sublanes can then be permuted into place.
15250 auto ShuffleSubLanes = [&](int SubLaneScale) {
15251 int NumSubLanes = NumLanes * SubLaneScale;
15252 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15254 // Check that all the sources are coming from the same lane and see if we
15255 // can form a repeating shuffle mask (local to each sub-lane). At the same
15256 // time, determine the source sub-lane for each destination sub-lane.
15257 int TopSrcSubLane = -1;
15258 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15259 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15260 SubLaneScale,
15261 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15263 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15264 // Extract the sub-lane mask, check that it all comes from the same lane
15265 // and normalize the mask entries to come from the first lane.
15266 int SrcLane = -1;
15267 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15268 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15269 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15270 if (M < 0)
15271 continue;
15272 int Lane = (M % NumElts) / NumLaneElts;
15273 if ((0 <= SrcLane) && (SrcLane != Lane))
15274 return SDValue();
15275 SrcLane = Lane;
15276 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15277 SubLaneMask[Elt] = LocalM;
15280 // Whole sub-lane is UNDEF.
15281 if (SrcLane < 0)
15282 continue;
15284 // Attempt to match against the candidate repeated sub-lane masks.
15285 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15286 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15287 for (int i = 0; i != NumSubLaneElts; ++i) {
15288 if (M1[i] < 0 || M2[i] < 0)
15289 continue;
15290 if (M1[i] != M2[i])
15291 return false;
15293 return true;
15296 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15297 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15298 continue;
15300 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15301 for (int i = 0; i != NumSubLaneElts; ++i) {
15302 int M = SubLaneMask[i];
15303 if (M < 0)
15304 continue;
15305 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15306 "Unexpected mask element");
15307 RepeatedSubLaneMask[i] = M;
15310 // Track the topmost source sub-lane - by setting the remaining to
15311 // UNDEF we can greatly simplify shuffle matching.
15312 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15313 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15314 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15315 break;
15318 // Bail if we failed to find a matching repeated sub-lane mask.
15319 if (Dst2SrcSubLanes[DstSubLane] < 0)
15320 return SDValue();
15322 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15323 "Unexpected source lane");
15325 // Create a repeating shuffle mask for the entire vector.
15326 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15327 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15328 int Lane = SubLane / SubLaneScale;
15329 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15330 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15331 int M = RepeatedSubLaneMask[Elt];
15332 if (M < 0)
15333 continue;
15334 int Idx = (SubLane * NumSubLaneElts) + Elt;
15335 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15339 // Shuffle each source sub-lane to its destination.
15340 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15341 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15342 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15343 if (SrcSubLane < 0)
15344 continue;
15345 for (int j = 0; j != NumSubLaneElts; ++j)
15346 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15349 // Avoid returning the same shuffle operation.
15350 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15351 if (RepeatedMask == Mask || SubLaneMask == Mask)
15352 return SDValue();
15354 SDValue RepeatedShuffle =
15355 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15357 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15358 SubLaneMask);
15361 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15362 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15363 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15364 // Otherwise we can only permute whole 128-bit lanes.
15365 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15366 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15367 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15368 MinSubLaneScale = 2;
15369 MaxSubLaneScale =
15370 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15372 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15373 MinSubLaneScale = MaxSubLaneScale = 4;
15375 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15376 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15377 return Shuffle;
15379 return SDValue();
15382 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15383 bool &ForceV1Zero, bool &ForceV2Zero,
15384 unsigned &ShuffleImm, ArrayRef<int> Mask,
15385 const APInt &Zeroable) {
15386 int NumElts = VT.getVectorNumElements();
15387 assert(VT.getScalarSizeInBits() == 64 &&
15388 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15389 "Unexpected data type for VSHUFPD");
15390 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15391 "Illegal shuffle mask");
15393 bool ZeroLane[2] = { true, true };
15394 for (int i = 0; i < NumElts; ++i)
15395 ZeroLane[i & 1] &= Zeroable[i];
15397 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15398 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
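// e.g. the v4f64 mask <0,5,2,7> matches directly with ShuffleImm = 0b1010,
// i.e. SHUFPD selecting V1[0], V2[1], V1[2], V2[3].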
15399 ShuffleImm = 0;
15400 bool ShufpdMask = true;
15401 bool CommutableMask = true;
15402 for (int i = 0; i < NumElts; ++i) {
15403 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15404 continue;
15405 if (Mask[i] < 0)
15406 return false;
15407 int Val = (i & 6) + NumElts * (i & 1);
15408 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15409 if (Mask[i] < Val || Mask[i] > Val + 1)
15410 ShufpdMask = false;
15411 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15412 CommutableMask = false;
15413 ShuffleImm |= (Mask[i] % 2) << i;
15416 if (!ShufpdMask && !CommutableMask)
15417 return false;
15419 if (!ShufpdMask && CommutableMask)
15420 std::swap(V1, V2);
15422 ForceV1Zero = ZeroLane[0];
15423 ForceV2Zero = ZeroLane[1];
15424 return true;
15427 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15428 SDValue V2, ArrayRef<int> Mask,
15429 const APInt &Zeroable,
15430 const X86Subtarget &Subtarget,
15431 SelectionDAG &DAG) {
15432 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15433 "Unexpected data type for VSHUFPD");
15435 unsigned Immediate = 0;
15436 bool ForceV1Zero = false, ForceV2Zero = false;
15437 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15438 Mask, Zeroable))
15439 return SDValue();
15441 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15442 if (ForceV1Zero)
15443 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15444 if (ForceV2Zero)
15445 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15447 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15448 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15451 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15452 // by zeroable elements in the remaining 24 elements. Turn this into two
15453 // vpmovqb (VTRUNC) instructions shuffled together.
15454 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15455 SDValue V1, SDValue V2,
15456 ArrayRef<int> Mask,
15457 const APInt &Zeroable,
15458 SelectionDAG &DAG) {
15459 assert(VT == MVT::v32i8 && "Unexpected type!");
15461 // The first 8 indices should be every 8th element.
15462 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15463 return SDValue();
15465 // Remaining elements need to be zeroable.
15466 if (Zeroable.countl_one() < (Mask.size() - 8))
15467 return SDValue();
15469 V1 = DAG.getBitcast(MVT::v4i64, V1);
15470 V2 = DAG.getBitcast(MVT::v4i64, V2);
15472 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15473 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15475 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15476 // the upper bits of the result using an unpckldq.
15477 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15478 { 0, 1, 2, 3, 16, 17, 18, 19,
15479 4, 5, 6, 7, 20, 21, 22, 23 });
15480 // Insert the unpckldq into a zero vector to widen to v32i8.
15481 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15482 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15483 DAG.getIntPtrConstant(0, DL));
15486 // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15487 // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15488 // =>
15489 // ul = unpckl v1, v2
15490 // uh = unpckh v1, v2
15491 // a = vperm ul, uh
15492 // b = vperm ul, uh
15494 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15495 // and permute. We cannot directly match v3 because it is split into two
15496 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
15497 // pair of 256-bit shuffles and makes sure the masks are consecutive.
15499 // Once unpck and permute nodes are created, the permute corresponding to this
15500 // shuffle is returned, while the other permute replaces the other half of the
15501 // shuffle in the selection dag.
15502 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15503 SDValue V1, SDValue V2,
15504 ArrayRef<int> Mask,
15505 SelectionDAG &DAG) {
15506 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15507 VT != MVT::v32i8)
15508 return SDValue();
15509 // <B0, B1, B0+1, B1+1, ..., >
15510 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15511 unsigned Begin1) {
15512 size_t Size = Mask.size();
15513 assert(Size % 2 == 0 && "Expected even mask size");
15514 for (unsigned I = 0; I < Size; I += 2) {
15515 if (Mask[I] != (int)(Begin0 + I / 2) ||
15516 Mask[I + 1] != (int)(Begin1 + I / 2))
15517 return false;
15519 return true;
15521 // Check which half of the interleave this shuffle node is.
15522 int NumElts = VT.getVectorNumElements();
15523 size_t FirstQtr = NumElts / 2;
15524 size_t ThirdQtr = NumElts + NumElts / 2;
15525 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15526 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15527 if (!IsFirstHalf && !IsSecondHalf)
15528 return SDValue();
15530 // Find the intersection between shuffle users of V1 and V2.
15531 SmallVector<SDNode *, 2> Shuffles;
15532 for (SDNode *User : V1->uses())
15533 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15534 User->getOperand(1) == V2)
15535 Shuffles.push_back(User);
15536 // Limit the number of shuffle users to two for now.
15537 if (Shuffles.size() != 2)
15538 return SDValue();
15539 // Find out which half of the 512-bit shuffle each of the smaller shuffles is
15540 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15541 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15542 SDNode *FirstHalf;
15543 SDNode *SecondHalf;
15544 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15545 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15546 FirstHalf = Shuffles[0];
15547 SecondHalf = Shuffles[1];
15548 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15549 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15550 FirstHalf = Shuffles[1];
15551 SecondHalf = Shuffles[0];
15552 } else {
15553 return SDValue();
15555 // Lower into unpck and perm. Return the perm of this shuffle and replace
15556 // the other.
15557 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15558 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15559 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15560 DAG.getTargetConstant(0x20, DL, MVT::i8));
15561 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15562 DAG.getTargetConstant(0x31, DL, MVT::i8));
15563 if (IsFirstHalf) {
15564 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15565 return Perm1;
15567 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15568 return Perm2;
15571 /// Handle lowering of 4-lane 64-bit floating point shuffles.
15573 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15574 /// isn't available.
15575 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15576 const APInt &Zeroable, SDValue V1, SDValue V2,
15577 const X86Subtarget &Subtarget,
15578 SelectionDAG &DAG) {
15579 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15580 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15581 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15583 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15584 Subtarget, DAG))
15585 return V;
15587 if (V2.isUndef()) {
15588 // Check for being able to broadcast a single element.
15589 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15590 Mask, Subtarget, DAG))
15591 return Broadcast;
15593 // Use low duplicate instructions for masks that match their pattern.
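// MOVDDUP duplicates the even-indexed double of each 128-bit lane, which is
// exactly the <0, 0, 2, 2> pattern for v4f64.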
15594 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15595 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15597 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15598 // Non-half-crossing single input shuffles can be lowered with an
15599 // interleaved permutation.
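// For VPERMILPD each immediate bit selects the low (0) or high (1) double
// within the corresponding 128-bit lane, e.g. mask <1, 0, 3, 2> encodes as
// 0b0101.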
15600 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15601 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15602 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15603 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15606 // With AVX2 we have direct support for this permutation.
15607 if (Subtarget.hasAVX2())
15608 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15609 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15611 // Try to create an in-lane repeating shuffle mask and then shuffle the
15612 // results into the target lanes.
15613 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15614 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15615 return V;
15617 // Try to permute the lanes and then use a per-lane permute.
15618 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15619 Mask, DAG, Subtarget))
15620 return V;
15622 // Otherwise, fall back.
15623 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15624 DAG, Subtarget);
15627 // Use dedicated unpack instructions for masks that match their pattern.
15628 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15629 return V;
15631 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15632 Zeroable, Subtarget, DAG))
15633 return Blend;
15635 // Check if the blend happens to exactly fit that of SHUFPD.
15636 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15637 Zeroable, Subtarget, DAG))
15638 return Op;
15640 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15641 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15643 // If we have lane crossing shuffles AND they don't all come from the lower
15644 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15645 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15646 // canonicalizes to a blend of splats, which isn't necessary for this combine.
15647 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15648 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15649 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15650 (V2.getOpcode() != ISD::BUILD_VECTOR))
15651 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15653 // If we have one input in place, then we can permute the other input and
15654 // blend the result.
15655 if (V1IsInPlace || V2IsInPlace)
15656 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15657 Subtarget, DAG);
15659 // Try to create an in-lane repeating shuffle mask and then shuffle the
15660 // results into the target lanes.
15661 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15662 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15663 return V;
15665 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15666 // shuffle. However, if we have AVX2 and either input is already in place,
15667 // we will be able to shuffle the other input even across lanes in a single
15668 // instruction, so skip this pattern.
15669 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15670 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15671 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15672 return V;
15674 // If we have VLX support, we can use VEXPAND.
15675 if (Subtarget.hasVLX())
15676 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15677 DAG, Subtarget))
15678 return V;
15680 // If we have AVX2 then we always want to lower with a blend because at v4 we
15681 // can fully permute the elements.
15682 if (Subtarget.hasAVX2())
15683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15684 Subtarget, DAG);
15686 // Otherwise fall back on generic lowering.
15687 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15688 Subtarget, DAG);
15691 /// Handle lowering of 4-lane 64-bit integer shuffles.
15693 /// This routine is only called when we have AVX2 and thus a reasonable
15694 /// instruction set for v4i64 shuffling.
15695 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15696 const APInt &Zeroable, SDValue V1, SDValue V2,
15697 const X86Subtarget &Subtarget,
15698 SelectionDAG &DAG) {
15699 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15700 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15701 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15702 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15704 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15705 Subtarget, DAG))
15706 return V;
15708 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15709 Zeroable, Subtarget, DAG))
15710 return Blend;
15712 // Check for being able to broadcast a single element.
15713 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15714 Subtarget, DAG))
15715 return Broadcast;
15717 // Try to use shift instructions if fast.
15718 if (Subtarget.preferLowerShuffleAsShift())
15719 if (SDValue Shift =
15720 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15721 Subtarget, DAG, /*BitwiseOnly*/ true))
15722 return Shift;
15724 if (V2.isUndef()) {
15725 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
15726 // can use lower-latency instructions that will operate on both lanes.
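// A lane-repeated v4i64 mask can be expressed as a v8i32 PSHUFD by splitting
// each 64-bit index i into the pair {2*i, 2*i+1}; e.g. the repeated mask
// <1, 0> becomes the PSHUFD mask <2, 3, 0, 1>.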
15727 SmallVector<int, 2> RepeatedMask;
15728 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15729 SmallVector<int, 4> PSHUFDMask;
15730 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15731 return DAG.getBitcast(
15732 MVT::v4i64,
15733 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15734 DAG.getBitcast(MVT::v8i32, V1),
15735 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15738 // AVX2 provides a direct instruction for permuting a single input across
15739 // lanes.
15740 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15741 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15744 // Try to use shift instructions.
15745 if (SDValue Shift =
15746 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15747 DAG, /*BitwiseOnly*/ false))
15748 return Shift;
15750 // If we have VLX support, we can use VALIGN or VEXPAND.
15751 if (Subtarget.hasVLX()) {
15752 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15753 Zeroable, Subtarget, DAG))
15754 return Rotate;
15756 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15757 DAG, Subtarget))
15758 return V;
15761 // Try to use PALIGNR.
15762 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15763 Subtarget, DAG))
15764 return Rotate;
15766 // Use dedicated unpack instructions for masks that match their pattern.
15767 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15768 return V;
15770 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15771 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15773 // If we have one input in place, then we can permute the other input and
15774 // blend the result.
15775 if (V1IsInPlace || V2IsInPlace)
15776 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
15777 Subtarget, DAG);
15779 // Try to create an in-lane repeating shuffle mask and then shuffle the
15780 // results into the target lanes.
15781 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15782 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15783 return V;
15785 // Try to lower to PERMQ(BLENDD(V1,V2)).
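// i.e. blend the two sources while keeping every element at its original
// index, then move the blended elements into place with a single cross-lane
// VPERMQ.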
15786 if (SDValue V =
15787 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
15788 return V;
15790 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15791 // shuffle. However, if we have AVX2 and either input is already in place,
15792 // we will be able to shuffle the other input even across lanes in a single
15793 // instruction, so skip this pattern.
15794 if (!V1IsInPlace && !V2IsInPlace)
15795 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15796 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15797 return Result;
15799 // Otherwise fall back on generic blend lowering.
15800 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
15801 Subtarget, DAG);
15804 /// Handle lowering of 8-lane 32-bit floating point shuffles.
15806 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
15807 /// isn't available.
15808 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15809 const APInt &Zeroable, SDValue V1, SDValue V2,
15810 const X86Subtarget &Subtarget,
15811 SelectionDAG &DAG) {
15812 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15813 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15814 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15816 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
15817 Zeroable, Subtarget, DAG))
15818 return Blend;
15820 // Check for being able to broadcast a single element.
15821 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
15822 Subtarget, DAG))
15823 return Broadcast;
15825 if (!Subtarget.hasAVX2()) {
15826 SmallVector<int> InLaneMask;
15827 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15829 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
15830 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
15831 /*SimpleOnly*/ true))
15832 return R;
15834 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
15835 Zeroable, Subtarget, DAG))
15836 return DAG.getBitcast(MVT::v8f32, ZExt);
15838 // If the shuffle mask is repeated in each 128-bit lane, we have many more
15839 // options to efficiently lower the shuffle.
15840 SmallVector<int, 4> RepeatedMask;
15841 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
15842 assert(RepeatedMask.size() == 4 &&
15843 "Repeated masks must be half the mask width!");
15845 // Use even/odd duplicate instructions for masks that match their pattern.
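// MOVSLDUP duplicates the even-indexed float of each pair (<0, 0, 2, 2> per
// lane) and MOVSHDUP the odd-indexed one (<1, 1, 3, 3> per lane).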
15846 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
15847 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
15848 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
15849 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
15851 if (V2.isUndef())
15852 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
15853 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15855 // Use dedicated unpack instructions for masks that match their pattern.
15856 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
15857 return V;
15859 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
15860 // have already handled any direct blends.
15861 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
15864 // Try to create an in-lane repeating shuffle mask and then shuffle the
15865 // results into the target lanes.
15866 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15867 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15868 return V;
15870 // If we have a single input shuffle with different shuffle patterns in the
15871 // two 128-bit lanes, use a variable-mask VPERMILPS.
15872 if (V2.isUndef()) {
15873 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
15874 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15875 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
15877 if (Subtarget.hasAVX2()) {
15878 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15879 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
15881 // Otherwise, fall back.
15882 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
15883 DAG, Subtarget);
15886 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15887 // shuffle.
15888 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15889 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15890 return Result;
15892 // If we have VLX support, we can use VEXPAND.
15893 if (Subtarget.hasVLX())
15894 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
15895 DAG, Subtarget))
15896 return V;
15898 // Try to match an interleave of two v8f32s and lower them as unpck and
15899 // permutes using ymms. This needs to go before we try to split the vectors.
15901 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
15902 // this path inadvertently.
15903 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
15904 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
15905 Mask, DAG))
15906 return V;
15908 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
15909 // try to split, since after the split we get more efficient code using
15910 // vpunpcklwd and vpunpckhwd than with vblend.
15911 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
15912 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
15913 DAG);
15915 // If we have AVX2 then we always want to lower with a blend because at v8 we
15916 // can fully permute the elements.
15917 if (Subtarget.hasAVX2())
15918 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
15919 Subtarget, DAG);
15921 // Otherwise fall back on generic lowering.
15922 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
15923 Subtarget, DAG);
15926 /// Handle lowering of 8-lane 32-bit integer shuffles.
15928 /// This routine is only called when we have AVX2 and thus a reasonable
15929 /// instruction set for v8i32 shuffling.
15930 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15931 const APInt &Zeroable, SDValue V1, SDValue V2,
15932 const X86Subtarget &Subtarget,
15933 SelectionDAG &DAG) {
15934 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15935 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15936 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15937 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
15939 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15941 // Whenever we can lower this as a zext, that instruction is strictly faster
15942 // than any alternative. It also allows us to fold memory operands into the
15943 // shuffle in many cases.
15944 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
15945 Zeroable, Subtarget, DAG))
15946 return ZExt;
15948 // Try to match an interleave of two v8i32s and lower them as unpck and
15949 // permutes using ymms. This needs to go before we try to split the vectors.
15950 if (!Subtarget.hasAVX512())
15951 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
15952 Mask, DAG))
15953 return V;
15955 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
15956 // try to split, since after the split we get more efficient code than vblend
15957 // by using the vpunpcklwd and vpunpckhwd instructions.
15958 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
15959 !Subtarget.hasAVX512())
15960 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
15961 DAG);
15963 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
15964 Zeroable, Subtarget, DAG))
15965 return Blend;
15967 // Check for being able to broadcast a single element.
15968 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
15969 Subtarget, DAG))
15970 return Broadcast;
15972 // Try to use shift instructions if fast.
15973 if (Subtarget.preferLowerShuffleAsShift()) {
15974 if (SDValue Shift =
15975 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
15976 Subtarget, DAG, /*BitwiseOnly*/ true))
15977 return Shift;
15978 if (NumV2Elements == 0)
15979 if (SDValue Rotate =
15980 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
15981 return Rotate;
15984 // If the shuffle mask is repeated in each 128-bit lane we can use more
15985 // efficient instructions that mirror the shuffles across the two 128-bit
15986 // lanes.
15987 SmallVector<int, 4> RepeatedMask;
15988 bool Is128BitLaneRepeatedShuffle =
15989 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
15990 if (Is128BitLaneRepeatedShuffle) {
15991 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
15992 if (V2.isUndef())
15993 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
15994 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15996 // Use dedicated unpack instructions for masks that match their pattern.
15997 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
15998 return V;
16001 // Try to use shift instructions.
16002 if (SDValue Shift =
16003 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16004 DAG, /*BitwiseOnly*/ false))
16005 return Shift;
16007 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16008 if (SDValue Rotate =
16009 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16010 return Rotate;
16012 // If we have VLX support, we can use VALIGN or EXPAND.
16013 if (Subtarget.hasVLX()) {
16014 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16015 Zeroable, Subtarget, DAG))
16016 return Rotate;
16018 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16019 DAG, Subtarget))
16020 return V;
16023 // Try to use byte rotation instructions.
16024 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16025 Subtarget, DAG))
16026 return Rotate;
16028 // Try to create an in-lane repeating shuffle mask and then shuffle the
16029 // results into the target lanes.
16030 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16031 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16032 return V;
16034 if (V2.isUndef()) {
16035 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16036 // because that should be faster than the variable permute alternatives.
16037 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16038 return V;
16040 // If the shuffle patterns aren't repeated but it's a single input, directly
16041 // generate a cross-lane VPERMD instruction.
16042 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16043 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16046 // Assume that a single SHUFPS is faster than an alternative sequence of
16047 // multiple instructions (even if the CPU has a domain penalty).
16048 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
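// SHUFPS takes its two low result elements from the first operand and its
// two high result elements from the second; the check below tests whether
// the repeated mask has that shape.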
16049 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16050 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16051 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16052 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16053 CastV1, CastV2, DAG);
16054 return DAG.getBitcast(MVT::v8i32, ShufPS);
16057 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16058 // shuffle.
16059 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16060 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16061 return Result;
16063 // Otherwise fall back on generic blend lowering.
16064 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16065 Subtarget, DAG);
16068 /// Handle lowering of 16-lane 16-bit integer shuffles.
16070 /// This routine is only called when we have AVX2 and thus a reasonable
16071 /// instruction set for v16i16 shuffling.
16072 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16073 const APInt &Zeroable, SDValue V1, SDValue V2,
16074 const X86Subtarget &Subtarget,
16075 SelectionDAG &DAG) {
16076 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16077 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16078 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16079 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16081 // Whenever we can lower this as a zext, that instruction is strictly faster
16082 // than any alternative. It also allows us to fold memory operands into the
16083 // shuffle in many cases.
16084 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16085 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16086 return ZExt;
16088 // Check for being able to broadcast a single element.
16089 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16090 Subtarget, DAG))
16091 return Broadcast;
16093 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16094 Zeroable, Subtarget, DAG))
16095 return Blend;
16097 // Use dedicated unpack instructions for masks that match their pattern.
16098 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16099 return V;
16101 // Use dedicated pack instructions for masks that match their pattern.
16102 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16103 Subtarget))
16104 return V;
16106 // Try to lower using a truncation.
16107 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16108 Subtarget, DAG))
16109 return V;
16111 // Try to use shift instructions.
16112 if (SDValue Shift =
16113 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16114 Subtarget, DAG, /*BitwiseOnly*/ false))
16115 return Shift;
16117 // Try to use byte rotation instructions.
16118 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16119 Subtarget, DAG))
16120 return Rotate;
16122 // Try to create an in-lane repeating shuffle mask and then shuffle the
16123 // results into the target lanes.
16124 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16125 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16126 return V;
16128 if (V2.isUndef()) {
16129 // Try to use bit rotation instructions.
16130 if (SDValue Rotate =
16131 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16132 return Rotate;
16134 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16135 // because that should be faster than the variable permute alternatives.
16136 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16137 return V;
16139 // There are no generalized cross-lane shuffle operations available on i16
16140 // element types.
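// (AVX2's cross-lane permutes - VPERMD/VPERMPS and VPERMQ/VPERMPD - only
// handle 32/64-bit elements; a full VPERMW requires AVX-512 BWI and is
// handled further below.)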
16141 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16142 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16143 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16144 return V;
16146 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16147 DAG, Subtarget);
16150 SmallVector<int, 8> RepeatedMask;
16151 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16152 // As this is a single-input shuffle, the repeated mask should be
16153 // a strictly valid v8i16 mask that we can pass through to the v8i16
16154 // lowering to handle even the v16 case.
16155 return lowerV8I16GeneralSingleInputShuffle(
16156 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16160 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16161 Zeroable, Subtarget, DAG))
16162 return PSHUFB;
16164 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16165 if (Subtarget.hasBWI())
16166 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16168 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16169 // shuffle.
16170 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16171 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16172 return Result;
16174 // Try to permute the lanes and then use a per-lane permute.
16175 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16176 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16177 return V;
16179 // Try to match an interleave of two v16i16s and lower them as unpck and
16180 // permutes using ymms.
16181 if (!Subtarget.hasAVX512())
16182 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16183 Mask, DAG))
16184 return V;
16186 // Otherwise fall back on generic lowering.
16187 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16188 Subtarget, DAG);
16191 /// Handle lowering of 32-lane 8-bit integer shuffles.
16193 /// This routine is only called when we have AVX2 and thus a reasonable
16194 /// instruction set for v32i8 shuffling.
16195 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16196 const APInt &Zeroable, SDValue V1, SDValue V2,
16197 const X86Subtarget &Subtarget,
16198 SelectionDAG &DAG) {
16199 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16200 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16201 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16202 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16204 // Whenever we can lower this as a zext, that instruction is strictly faster
16205 // than any alternative. It also allows us to fold memory operands into the
16206 // shuffle in many cases.
16207 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG))
16209 return ZExt;
16211 // Check for being able to broadcast a single element.
16212 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16213 Subtarget, DAG))
16214 return Broadcast;
16216 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16217 Zeroable, Subtarget, DAG))
16218 return Blend;
16220 // Use dedicated unpack instructions for masks that match their pattern.
16221 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16222 return V;
16224 // Use dedicated pack instructions for masks that match their pattern.
16225 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16226 Subtarget))
16227 return V;
16229 // Try to lower using a truncation.
16230 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16231 Subtarget, DAG))
16232 return V;
16234 // Try to use shift instructions.
16235 if (SDValue Shift =
16236 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16237 DAG, /*BitwiseOnly*/ false))
16238 return Shift;
16240 // Try to use byte rotation instructions.
16241 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16242 Subtarget, DAG))
16243 return Rotate;
16245 // Try to use bit rotation instructions.
16246 if (V2.isUndef())
16247 if (SDValue Rotate =
16248 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16249 return Rotate;
16251 // Try to create an in-lane repeating shuffle mask and then shuffle the
16252 // results into the target lanes.
16253 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16254 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16255 return V;
16257 // There are no generalized cross-lane shuffle operations available on i8
16258 // element types.
16259 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16260 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16261 // because that should be faster than the variable permute alternatives.
16262 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16263 return V;
16265 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16266 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16267 return V;
16269 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16270 DAG, Subtarget);
16273 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16274 Zeroable, Subtarget, DAG))
16275 return PSHUFB;
16277 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16278 if (Subtarget.hasVBMI())
16279 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16281 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16282 // shuffle.
16283 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16284 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16285 return Result;
16287 // Try to permute the lanes and then use a per-lane permute.
16288 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16289 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16290 return V;
16292 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16293 // by zeroable elements in the remaining 24 elements. Turn this into two
16294 // vmovqb instructions shuffled together.
16295 if (Subtarget.hasVLX())
16296 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16297 Mask, Zeroable, DAG))
16298 return V;
16300 // Try to match an interleave of two v32i8s and lower them as unpck and
16301 // permutes using ymms.
16302 if (!Subtarget.hasAVX512())
16303 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16304 Mask, DAG))
16305 return V;
16307 // Otherwise fall back on generic lowering.
16308 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16309 Subtarget, DAG);
16312 /// High-level routine to lower various 256-bit x86 vector shuffles.
16314 /// This routine either breaks down the specific type of a 256-bit x86 vector
16315 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
16316 /// together based on the available instructions.
16317 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16318 SDValue V1, SDValue V2, const APInt &Zeroable,
16319 const X86Subtarget &Subtarget,
16320 SelectionDAG &DAG) {
16321 // If we have a single input to the zero element, insert that into V1 if we
16322 // can do so cheaply.
16323 int NumElts = VT.getVectorNumElements();
16324 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16326 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16327 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16328 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16329 return Insertion;
16331 // Handle special cases where the lower or upper half is UNDEF.
16332 if (SDValue V =
16333 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16334 return V;
16336 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16337 // can check for those subtargets here and avoid much of the subtarget
16338 // querying in the per-vector-type lowering routines. With AVX1 we have
16339 // essentially *zero* ability to manipulate a 256-bit vector with integer
16340 // types. Since we'll use floating point types there eventually, just
16341 // immediately cast everything to a float and operate entirely in that domain.
16342 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16343 int ElementBits = VT.getScalarSizeInBits();
16344 if (ElementBits < 32) {
16345 // No floating point type available; if we can't use the bit operations
16346 // for masking/blending then decompose into 128-bit vectors.
16347 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16348 Subtarget, DAG))
16349 return V;
16350 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16351 return V;
16352 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16355 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16356 VT.getVectorNumElements());
16357 V1 = DAG.getBitcast(FpVT, V1);
16358 V2 = DAG.getBitcast(FpVT, V2);
16359 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16362 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16363 V1 = DAG.getBitcast(MVT::v16i16, V1);
16364 V2 = DAG.getBitcast(MVT::v16i16, V2);
16365 return DAG.getBitcast(VT,
16366 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16369 switch (VT.SimpleTy) {
16370 case MVT::v4f64:
16371 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16372 case MVT::v4i64:
16373 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16374 case MVT::v8f32:
16375 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16376 case MVT::v8i32:
16377 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16378 case MVT::v16i16:
16379 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16380 case MVT::v32i8:
16381 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16383 default:
16384 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16388 /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
16389 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16390 const APInt &Zeroable, SDValue V1, SDValue V2,
16391 const X86Subtarget &Subtarget,
16392 SelectionDAG &DAG) {
16393 assert(VT.getScalarSizeInBits() == 64 &&
16394 "Unexpected element type size for 128bit shuffle.");
16396 // Handling a 256-bit vector requires VLX, and the lowerV2X128VectorShuffle()
16397 // function is most probably a better solution for that case.
16398 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16400 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
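// Each 128-bit chunk covers two 64-bit elements, so the 8-element mask is
// widened to a 4-element mask of chunks; e.g. <0, 1, 10, 11, 4, 5, 14, 15>
// widens to <0, 5, 2, 7>.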
16401 SmallVector<int, 4> Widened128Mask;
16402 if (!canWidenShuffleElements(Mask, Widened128Mask))
16403 return SDValue();
16404 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16406 // Try to use an insert into a zero vector.
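// Zeroable has one bit per 64-bit element: (Zeroable & 0xf0) == 0xf0 means
// the upper four elements (256 bits) are zeroable, and (Zeroable & 0x0c) ==
// 0x0c means elements 2-3 are zeroable as well, so only a 128-bit subvector
// needs to be inserted.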
16407 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16408 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16409 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16410 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16411 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16412 DAG.getIntPtrConstant(0, DL));
16413 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16414 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16415 DAG.getIntPtrConstant(0, DL));
16418 // Check for patterns which can be matched with a single insert of a 256-bit
16419 // subvector.
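// The mask <0, 1, 2, 3, 0, 1, 2, 3> duplicates V1's low 256 bits, and
// <0, 1, 2, 3, 8, 9, 10, 11> keeps V1's low 256 bits with V2's low 256 bits
// above them; both amount to a single 256-bit subvector insert.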
16420 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16421 if (OnlyUsesV1 ||
16422 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16423 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16424 SDValue SubVec =
16425 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16426 DAG.getIntPtrConstant(0, DL));
16427 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16428 DAG.getIntPtrConstant(4, DL));
16431 // See if this is an insertion of the lower 128-bits of V2 into V1.
16432 bool IsInsert = true;
16433 int V2Index = -1;
16434 for (int i = 0; i < 4; ++i) {
16435 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16436 if (Widened128Mask[i] < 0)
16437 continue;
16439 // Make sure all V1 subvectors are in place.
16440 if (Widened128Mask[i] < 4) {
16441 if (Widened128Mask[i] != i) {
16442 IsInsert = false;
16443 break;
16445 } else {
16446 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16447 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16448 IsInsert = false;
16449 break;
16451 V2Index = i;
16454 if (IsInsert && V2Index >= 0) {
16455 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16456 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16457 DAG.getIntPtrConstant(0, DL));
16458 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16461 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
16462 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
16463 // widening where possible we at least ensure the lanes stay sequential to
16464 // help later combines.
16465 SmallVector<int, 2> Widened256Mask;
16466 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16467 Widened128Mask.clear();
16468 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16471 // Try to lower to vshuf64x2/vshuf32x4.
16472 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16473 int PermMask[4] = {-1, -1, -1, -1};
16474 // Ensure elements came from the same Op.
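// VSHUF64X2/VSHUF32X4 take the two low 128-bit chunks of the result from the
// first operand and the two high chunks from the second, each selected by a
// 2-bit immediate field, hence the OpIndex = i / 2 grouping and the modulo-4
// lane index below.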
16475 for (int i = 0; i < 4; ++i) {
16476 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16477 if (Widened128Mask[i] < 0)
16478 continue;
16480 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16481 unsigned OpIndex = i / 2;
16482 if (Ops[OpIndex].isUndef())
16483 Ops[OpIndex] = Op;
16484 else if (Ops[OpIndex] != Op)
16485 return SDValue();
16487 PermMask[i] = Widened128Mask[i] % 4;
16490 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16491 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16494 /// Handle lowering of 8-lane 64-bit floating point shuffles.
16495 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16496 const APInt &Zeroable, SDValue V1, SDValue V2,
16497 const X86Subtarget &Subtarget,
16498 SelectionDAG &DAG) {
16499 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16500 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16501 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16503 if (V2.isUndef()) {
16504 // Use low duplicate instructions for masks that match their pattern.
16505 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16506 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16508 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16509 // Non-half-crossing single input shuffles can be lowered with an
16510 // interleaved permutation.
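// Same encoding as the v4f64 case: each of the 8 immediate bits selects the
// low (0) or high (1) double within its 128-bit lane.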
16511 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16512 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16513 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16514 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16515 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16516 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16519 SmallVector<int, 4> RepeatedMask;
16520 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16521 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16522 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16525 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16526 V2, Subtarget, DAG))
16527 return Shuf128;
16529 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16530 return Unpck;
16532 // Check if the blend happens to exactly fit that of SHUFPD.
16533 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16534 Zeroable, Subtarget, DAG))
16535 return Op;
16537 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16538 DAG, Subtarget))
16539 return V;
16541 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16542 Zeroable, Subtarget, DAG))
16543 return Blend;
16545 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16548 /// Handle lowering of 16-lane 32-bit floating point shuffles.
16549 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16550 const APInt &Zeroable, SDValue V1, SDValue V2,
16551 const X86Subtarget &Subtarget,
16552 SelectionDAG &DAG) {
16553 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16554 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16555 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16557 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16558 // options to efficiently lower the shuffle.
16559 SmallVector<int, 4> RepeatedMask;
16560 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16561 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16563 // Use even/odd duplicate instructions for masks that match their pattern.
16564 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16565 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16566 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16567 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16569 if (V2.isUndef())
16570 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16571 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16573 // Use dedicated unpack instructions for masks that match their pattern.
16574 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16575 return V;
16577 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16578 Zeroable, Subtarget, DAG))
16579 return Blend;
16581 // Otherwise, fall back to a SHUFPS sequence.
16582 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16585 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16586 Zeroable, Subtarget, DAG))
16587 return Blend;
16589 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16590 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16591 return DAG.getBitcast(MVT::v16f32, ZExt);
16593 // Try to create an in-lane repeating shuffle mask and then shuffle the
16594 // results into the target lanes.
16595 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16596 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16597 return V;
16599 // If we have a single input shuffle with different shuffle patterns in the
16600 // 128-bit lanes and don't cross lanes, use a variable-mask VPERMILPS.
16601 if (V2.isUndef() &&
16602 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16603 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16604 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16607 // If we have AVX512F support, we can use VEXPAND.
16608 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16609 V1, V2, DAG, Subtarget))
16610 return V;
16612 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16615 /// Handle lowering of 8-lane 64-bit integer shuffles.
16616 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16617 const APInt &Zeroable, SDValue V1, SDValue V2,
16618 const X86Subtarget &Subtarget,
16619 SelectionDAG &DAG) {
16620 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16621 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16622 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16624 // Try to use shift instructions if fast.
16625 if (Subtarget.preferLowerShuffleAsShift())
16626 if (SDValue Shift =
16627 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16628 Subtarget, DAG, /*BitwiseOnly*/ true))
16629 return Shift;
16631 if (V2.isUndef()) {
16632 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
16633 // can use lower-latency instructions that will operate on all four
16634 // 128-bit lanes.
16635 SmallVector<int, 2> Repeated128Mask;
16636 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16637 SmallVector<int, 4> PSHUFDMask;
16638 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16639 return DAG.getBitcast(
16640 MVT::v8i64,
16641 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16642 DAG.getBitcast(MVT::v16i32, V1),
16643 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16646 SmallVector<int, 4> Repeated256Mask;
16647 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16648 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16649 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16652 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16653 V2, Subtarget, DAG))
16654 return Shuf128;
16656 // Try to use shift instructions.
16657 if (SDValue Shift =
16658 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16659 DAG, /*BitwiseOnly*/ false))
16660 return Shift;
16662 // Try to use VALIGN.
16663 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16664 Zeroable, Subtarget, DAG))
16665 return Rotate;
16667 // Try to use PALIGNR.
16668 if (Subtarget.hasBWI())
16669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16670 Subtarget, DAG))
16671 return Rotate;
16673 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16674 return Unpck;
16676 // If we have AVX512F support, we can use VEXPAND.
16677 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16678 DAG, Subtarget))
16679 return V;
16681 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16682 Zeroable, Subtarget, DAG))
16683 return Blend;
16685 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16688 /// Handle lowering of 16-lane 32-bit integer shuffles.
16689 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16690 const APInt &Zeroable, SDValue V1, SDValue V2,
16691 const X86Subtarget &Subtarget,
16692 SelectionDAG &DAG) {
16693 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16694 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16695 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16697 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16699 // Whenever we can lower this as a zext, that instruction is strictly faster
16700 // than any alternative. It also allows us to fold memory operands into the
16701 // shuffle in many cases.
16702 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16703 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16704 return ZExt;
16706 // Try to use shift instructions if fast.
16707 if (Subtarget.preferLowerShuffleAsShift()) {
16708 if (SDValue Shift =
16709 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16710 Subtarget, DAG, /*BitwiseOnly*/ true))
16711 return Shift;
16712 if (NumV2Elements == 0)
16713 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16714 Subtarget, DAG))
16715 return Rotate;
16718 // If the shuffle mask is repeated in each 128-bit lane we can use more
16719 // efficient instructions that mirror the shuffles across the four 128-bit
16720 // lanes.
16721 SmallVector<int, 4> RepeatedMask;
16722 bool Is128BitLaneRepeatedShuffle =
16723 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16724 if (Is128BitLaneRepeatedShuffle) {
16725 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16726 if (V2.isUndef())
16727 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16728 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16730 // Use dedicated unpack instructions for masks that match their pattern.
16731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16732 return V;
16735 // Try to use shift instructions.
16736 if (SDValue Shift =
16737 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16738 Subtarget, DAG, /*BitwiseOnly*/ false))
16739 return Shift;
16741 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16742 if (SDValue Rotate =
16743 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16744 return Rotate;
16746 // Try to use VALIGN.
16747 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16748 Zeroable, Subtarget, DAG))
16749 return Rotate;
16751 // Try to use byte rotation instructions.
16752 if (Subtarget.hasBWI())
16753 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16754 Subtarget, DAG))
16755 return Rotate;
16757 // Assume that a single SHUFPS is faster than using a permv shuffle.
16758 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16759 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16760 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16761 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16762 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16763 CastV1, CastV2, DAG);
16764 return DAG.getBitcast(MVT::v16i32, ShufPS);
16767 // Try to create an in-lane repeating shuffle mask and then shuffle the
16768 // results into the target lanes.
16769 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16770 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
16771 return V;
16773 // If we have AVX512F support, we can use VEXPAND.
16774 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
16775 DAG, Subtarget))
16776 return V;
16778 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
16779 Zeroable, Subtarget, DAG))
16780 return Blend;
16782 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
16785 /// Handle lowering of 32-lane 16-bit integer shuffles.
16786 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16787 const APInt &Zeroable, SDValue V1, SDValue V2,
16788 const X86Subtarget &Subtarget,
16789 SelectionDAG &DAG) {
16790 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16791 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16792 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16793 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
16795 // Whenever we can lower this as a zext, that instruction is strictly faster
16796 // than any alternative. It also allows us to fold memory operands into the
16797 // shuffle in many cases.
16798 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16799 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16800 return ZExt;
16802 // Use dedicated unpack instructions for masks that match their pattern.
16803 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
16804 return V;
16806 // Use dedicated pack instructions for masks that match their pattern.
16807 if (SDValue V =
16808 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
16809 return V;
16811 // Try to use shift instructions.
16812 if (SDValue Shift =
16813 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
16814 Subtarget, DAG, /*BitwiseOnly*/ false))
16815 return Shift;
16817 // Try to use byte rotation instructions.
16818 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
16819 Subtarget, DAG))
16820 return Rotate;
16822 if (V2.isUndef()) {
16823 // Try to use bit rotation instructions.
16824 if (SDValue Rotate =
16825 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
16826 return Rotate;
16828 SmallVector<int, 8> RepeatedMask;
16829 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
16830 // As this is a single-input shuffle, the repeated mask should be
16831 // a strictly valid v8i16 mask that we can pass through to the v8i16
16832 // lowering to handle even the v32 case.
16833 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
16834 RepeatedMask, Subtarget, DAG);
16838 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
16839 Zeroable, Subtarget, DAG))
16840 return Blend;
16842 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
16843 Zeroable, Subtarget, DAG))
16844 return PSHUFB;
16846 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
16849 /// Handle lowering of 64-lane 8-bit integer shuffles.
16850 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16851 const APInt &Zeroable, SDValue V1, SDValue V2,
16852 const X86Subtarget &Subtarget,
16853 SelectionDAG &DAG) {
16854 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16855 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16856 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
16857 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
16859 // Whenever we can lower this as a zext, that instruction is strictly faster
16860 // than any alternative. It also allows us to fold memory operands into the
16861 // shuffle in many cases.
16862 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16863 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16864 return ZExt;
16866 // Use dedicated unpack instructions for masks that match their pattern.
16867 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
16868 return V;
16870 // Use dedicated pack instructions for masks that match their pattern.
16871 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
16872 Subtarget))
16873 return V;
16875 // Try to use shift instructions.
16876 if (SDValue Shift =
16877 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
16878 DAG, /*BitwiseOnly*/ false))
16879 return Shift;
16881 // Try to use byte rotation instructions.
16882 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
16883 Subtarget, DAG))
16884 return Rotate;
16886 // Try to use bit rotation instructions.
16887 if (V2.isUndef())
16888 if (SDValue Rotate =
16889 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
16890 return Rotate;
16892 // Lower as AND if possible.
16893 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
16894 Zeroable, Subtarget, DAG))
16895 return Masked;
16897 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
16898 Zeroable, Subtarget, DAG))
16899 return PSHUFB;
16901 // Try to create an in-lane repeating shuffle mask and then shuffle the
16902 // results into the target lanes.
16903 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16904 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16905 return V;
16907 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
16908 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
16909 return Result;
16911 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
16912 Zeroable, Subtarget, DAG))
16913 return Blend;
16915 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
16916 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16917 // PALIGNR will be cheaper than the second PSHUFB+OR.
16918 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
16919 Mask, Subtarget, DAG))
16920 return V;
16922 // If we can't directly blend but can use PSHUFB, that will be better as it
16923 // can both shuffle and set up the inefficient blend.
16924 bool V1InUse, V2InUse;
16925 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
16926 DAG, V1InUse, V2InUse);
16929 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16930 // shuffle.
16931 if (!V2.isUndef())
16932 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16933 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16934 return Result;
16936 // VBMI can use VPERMV/VPERMV3 byte shuffles.
16937 if (Subtarget.hasVBMI())
16938 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
16940 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16943 /// High-level routine to lower various 512-bit x86 vector shuffles.
16945 /// This routine either breaks down the specific type of a 512-bit x86 vector
16946 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
16947 /// together based on the available instructions.
16948 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16949 MVT VT, SDValue V1, SDValue V2,
16950 const APInt &Zeroable,
16951 const X86Subtarget &Subtarget,
16952 SelectionDAG &DAG) {
16953 assert(Subtarget.hasAVX512() &&
16954 "Cannot lower 512-bit vectors w/ basic ISA!");
16956 // If we have a single input to the zero element, insert that into V1 if we
16957 // can do so cheaply.
16958 int NumElts = Mask.size();
16959 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16961 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16962 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16963 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16964 return Insertion;
16966 // Handle special cases where the lower or upper half is UNDEF.
16967 if (SDValue V =
16968 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16969 return V;
16971 // Check for being able to broadcast a single element.
16972 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
16973 Subtarget, DAG))
16974 return Broadcast;
16976 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
16977 // Try using bit ops for masking and blending before falling back to
16978 // splitting.
16979 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16980 Subtarget, DAG))
16981 return V;
16982 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16983 return V;
16985 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16988 if (VT == MVT::v32f16) {
16989 if (!Subtarget.hasBWI())
16990 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
16991 /*SimpleOnly*/ false);
16993 V1 = DAG.getBitcast(MVT::v32i16, V1);
16994 V2 = DAG.getBitcast(MVT::v32i16, V2);
16995 return DAG.getBitcast(MVT::v32f16,
16996 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
16999 // Dispatch to each element type for lowering. If we don't have support for
17000 // specific element type shuffles at 512 bits, immediately split them and
17001 // lower them. Each lowering routine of a given type is allowed to assume that
17002 // the requisite ISA extensions for that element type are available.
17003 switch (VT.SimpleTy) {
17004 case MVT::v8f64:
17005 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17006 case MVT::v16f32:
17007 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17008 case MVT::v8i64:
17009 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17010 case MVT::v16i32:
17011 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17012 case MVT::v32i16:
17013 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17014 case MVT::v64i8:
17015 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17017 default:
17018 llvm_unreachable("Not a valid 512-bit x86 vector type!");
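// Try to lower a unary vXi1 shuffle as a single KSHIFTR: every defined mask
// element must satisfy M == i + ShiftAmt for one positive ShiftAmt. For
// example, the v8i1 mask <3,4,5,6,7,u,u,u> becomes a KSHIFTR by 3 on a
// widened mask register, followed by an extract of the low subvector.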
17022 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17023 MVT VT, SDValue V1, SDValue V2,
17024 const X86Subtarget &Subtarget,
17025 SelectionDAG &DAG) {
17026 // Shuffle should be unary.
17027 if (!V2.isUndef())
17028 return SDValue();
17030 int ShiftAmt = -1;
17031 int NumElts = Mask.size();
17032 for (int i = 0; i != NumElts; ++i) {
17033 int M = Mask[i];
17034 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17035 "Unexpected mask index.");
17036 if (M < 0)
17037 continue;
17039 // The first non-undef element determines our shift amount.
17040 if (ShiftAmt < 0) {
17041 ShiftAmt = M - i;
17042 // Need to be shifting right.
17043 if (ShiftAmt <= 0)
17044 return SDValue();
17046 // All non-undef elements must shift by the same amount.
17047 if (ShiftAmt != M - i)
17048 return SDValue();
17050 assert(ShiftAmt >= 0 && "All undef?");
17052 // Great, we found a shift right.
17053 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17054 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17055 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17057 DAG.getIntPtrConstant(0, DL));
17060 // Determine if this shuffle can be implemented with a KSHIFT instruction.
17061 // Returns the shift amount if possible or -1 if not. This is a simplified
17062 // version of matchShuffleAsShift.
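// For example, with the top two elements zeroable, the v8i1 mask
// <2,3,4,5,6,7,z,z> matches KSHIFTR by 2; with the bottom two elements
// zeroable, <z,z,0,1,2,3,4,5> matches KSHIFTL by 2.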
17063 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17064 int MaskOffset, const APInt &Zeroable) {
17065 int Size = Mask.size();
17067 auto CheckZeros = [&](int Shift, bool Left) {
17068 for (int j = 0; j < Shift; ++j)
17069 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17070 return false;
17072 return true;
17075 auto MatchShift = [&](int Shift, bool Left) {
17076 unsigned Pos = Left ? Shift : 0;
17077 unsigned Low = Left ? 0 : Shift;
17078 unsigned Len = Size - Shift;
17079 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17082 for (int Shift = 1; Shift != Size; ++Shift)
17083 for (bool Left : {true, false})
17084 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17085 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17086 return Shift;
17089 return -1;
17093 // Lower vXi1 vector shuffles.
17094 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17095 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17096 // vector, shuffle and then truncate it back.
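// For example, a v16i1 shuffle is sign-extended to v16i32 (or v16i16 when
// avoiding 512-bit ops), shuffled in that type, and converted back to a mask
// by a sign-bit compare against zero (or a truncate).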
17097 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17098 MVT VT, SDValue V1, SDValue V2,
17099 const APInt &Zeroable,
17100 const X86Subtarget &Subtarget,
17101 SelectionDAG &DAG) {
17102 assert(Subtarget.hasAVX512() &&
17103 "Cannot lower 512-bit vectors w/o basic ISA!");
17105 int NumElts = Mask.size();
17107 // Try to recognize shuffles that are just padding a subvector with zeros.
17108 int SubvecElts = 0;
17109 int Src = -1;
17110 for (int i = 0; i != NumElts; ++i) {
17111 if (Mask[i] >= 0) {
17112 // Grab the source from the first valid mask element. All subsequent elements
17113 // need to use this same source.
17114 if (Src < 0)
17115 Src = Mask[i] / NumElts;
17116 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17117 break;
17120 ++SubvecElts;
17122 assert(SubvecElts != NumElts && "Identity shuffle?");
17124 // Clip to a power of 2.
17125 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17127 // Make sure the number of zeroable bits in the top at least covers the bits
17128 // not covered by the subvector.
17129 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17130 assert(Src >= 0 && "Expected a source!");
17131 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17132 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17133 Src == 0 ? V1 : V2,
17134 DAG.getIntPtrConstant(0, DL));
17135 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17136 DAG.getConstant(0, DL, VT),
17137 Extract, DAG.getIntPtrConstant(0, DL));
17140 // Try a simple shift right with undef elements. Later we'll try with zeros.
17141 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17142 DAG))
17143 return Shift;
17145 // Try to match KSHIFTs.
17146 unsigned Offset = 0;
17147 for (SDValue V : { V1, V2 }) {
17148 unsigned Opcode;
17149 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17150 if (ShiftAmt >= 0) {
17151 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17152 MVT WideVT = Res.getSimpleValueType();
17153 // Widened right shifts need two shifts to ensure we shift in zeroes.
17154 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17155 int WideElts = WideVT.getVectorNumElements();
17156 // Shift left to put the original vector in the MSBs of the new size.
17157 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17158 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17159 // Increase the shift amount to account for the left shift.
17160 ShiftAmt += WideElts - NumElts;
17163 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17164 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17165 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17166 DAG.getIntPtrConstant(0, DL));
17168 Offset += NumElts; // Increment for next iteration.
17171 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
17172 // TODO: What other unary shuffles would benefit from this?
17173 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
17174 V1->hasOneUse()) {
17175 SDValue Op0 = V1.getOperand(0);
17176 SDValue Op1 = V1.getOperand(1);
17177 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17178 EVT OpVT = Op0.getValueType();
17179 return DAG.getSetCC(
17180 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17181 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17184 MVT ExtVT;
17185 switch (VT.SimpleTy) {
17186 default:
17187 llvm_unreachable("Expected a vector of i1 elements");
17188 case MVT::v2i1:
17189 ExtVT = MVT::v2i64;
17190 break;
17191 case MVT::v4i1:
17192 ExtVT = MVT::v4i32;
17193 break;
17194 case MVT::v8i1:
17195 // Take a 512-bit type: there are more shuffle options on KNL. If we have VLX,
17196 // use a 256-bit shuffle.
17197 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17198 break;
17199 case MVT::v16i1:
17200 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17201 // 256-bit operation available.
17202 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17203 break;
17204 case MVT::v32i1:
17205 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17206 // 256-bit operation available.
17207 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17208 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17209 break;
17210 case MVT::v64i1:
17211 // Fall back to scalarization. FIXME: We can do better if the shuffle
17212 // can be partitioned cleanly.
17213 if (!Subtarget.useBWIRegs())
17214 return SDValue();
17215 ExtVT = MVT::v64i8;
17216 break;
17219 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17220 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17222 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17223 // The i1 elements were sign extended, so convert back via compare or truncate.
17224 int NumElems = VT.getVectorNumElements();
17225 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17226 (Subtarget.hasDQI() && (NumElems < 32)))
17227 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17228 Shuffle, ISD::SETGT);
17230 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17233 /// Helper function that returns true if the shuffle mask should be
17234 /// commuted to improve canonicalization.
17235 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17236 int NumElements = Mask.size();
17238 int NumV1Elements = 0, NumV2Elements = 0;
17239 for (int M : Mask)
17240 if (M < 0)
17241 continue;
17242 else if (M < NumElements)
17243 ++NumV1Elements;
17244 else
17245 ++NumV2Elements;
17247 // Commute the shuffle as needed such that more elements come from V1 than
17248 // V2. This allows us to match the shuffle pattern strictly on how many
17249 // elements come from V1 without handling the symmetric cases.
17250 if (NumV2Elements > NumV1Elements)
17251 return true;
17253 assert(NumV1Elements > 0 && "No V1 indices");
17255 if (NumV2Elements == 0)
17256 return false;
17258 // When the number of V1 and V2 elements are the same, try to minimize the
17259 // number of uses of V2 in the low half of the vector. When that is tied,
17260 // ensure that the sum of indices for V1 is equal to or lower than the sum
17261 // of indices for V2. When those are equal, try to ensure that the number of odd
17262 // indices for V1 is lower than the number of odd indices for V2.
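// For example, the 4-element mask <4,5,0,1> takes two elements from each
// source but places both V2 elements in the low half, so it is commuted to
// <0,1,4,5> with the sources swapped.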
17263 if (NumV1Elements == NumV2Elements) {
17264 int LowV1Elements = 0, LowV2Elements = 0;
17265 for (int M : Mask.slice(0, NumElements / 2))
17266 if (M >= NumElements)
17267 ++LowV2Elements;
17268 else if (M >= 0)
17269 ++LowV1Elements;
17270 if (LowV2Elements > LowV1Elements)
17271 return true;
17272 if (LowV2Elements == LowV1Elements) {
17273 int SumV1Indices = 0, SumV2Indices = 0;
17274 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17275 if (Mask[i] >= NumElements)
17276 SumV2Indices += i;
17277 else if (Mask[i] >= 0)
17278 SumV1Indices += i;
17279 if (SumV2Indices < SumV1Indices)
17280 return true;
17281 if (SumV2Indices == SumV1Indices) {
17282 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17283 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17284 if (Mask[i] >= NumElements)
17285 NumV2OddIndices += i % 2;
17286 else if (Mask[i] >= 0)
17287 NumV1OddIndices += i % 2;
17288 if (NumV2OddIndices < NumV1OddIndices)
17289 return true;
17294 return false;
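// Returns true if V is a single-use integer op from a small set that AVX512
// can execute under a mask (i8/i16 element types additionally require BWI and
// a 512-bit vector). Callers use this to avoid widening such shuffles so they
// can still be folded into a masked operation.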
17297 static bool canCombineAsMaskOperation(SDValue V,
17298 const X86Subtarget &Subtarget) {
17299 if (!Subtarget.hasAVX512())
17300 return false;
17302 if (!V.getValueType().isSimple())
17303 return false;
17305 MVT VT = V.getSimpleValueType().getScalarType();
17306 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17307 return false;
17309 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17310 // are preferable to blendw/blendvb/masked-mov.
17311 if ((VT == MVT::i16 || VT == MVT::i8) &&
17312 V.getSimpleValueType().getSizeInBits() < 512)
17313 return false;
17315 auto HasMaskOperation = [&](SDValue V) {
17316 // TODO: Currently we only check a limited set of opcodes. We could probably
17317 // extend this to all binary operations by checking TLI.isBinOp().
17318 switch (V->getOpcode()) {
17319 default:
17320 return false;
17321 case ISD::ADD:
17322 case ISD::SUB:
17323 case ISD::AND:
17324 case ISD::XOR:
17325 case ISD::OR:
17326 case ISD::SMAX:
17327 case ISD::SMIN:
17328 case ISD::UMAX:
17329 case ISD::UMIN:
17330 case ISD::ABS:
17331 case ISD::SHL:
17332 case ISD::SRL:
17333 case ISD::SRA:
17334 case ISD::MUL:
17335 break;
17337 if (!V->hasOneUse())
17338 return false;
17340 return true;
17343 if (HasMaskOperation(V))
17344 return true;
17346 return false;
17349 // Forward declaration.
17350 static SDValue canonicalizeShuffleMaskWithHorizOp(
17351 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17352 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17353 const X86Subtarget &Subtarget);
17355 /// Top-level lowering for x86 vector shuffles.
17357 /// This handles decomposition, canonicalization, and lowering of all x86
17358 /// vector shuffles. Most of the specific lowering strategies are encapsulated
17359 /// above in helper routines. The canonicalization attempts to widen shuffles
17360 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
17361 /// s.t. only one of the two inputs needs to be tested, etc.
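/// The overall flow: fold fully-undef and all-zero shuffles, try to widen the
/// mask to half as many elements of twice the width, canonicalize any
/// horizontal-op inputs, commute if profitable, and finally dispatch on the
/// vector width (128/256/512-bit or vXi1 masks).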
17362 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17363 SelectionDAG &DAG) {
17364 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17365 ArrayRef<int> OrigMask = SVOp->getMask();
17366 SDValue V1 = Op.getOperand(0);
17367 SDValue V2 = Op.getOperand(1);
17368 MVT VT = Op.getSimpleValueType();
17369 int NumElements = VT.getVectorNumElements();
17370 SDLoc DL(Op);
17371 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17373 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17374 "Can't lower MMX shuffles");
17376 bool V1IsUndef = V1.isUndef();
17377 bool V2IsUndef = V2.isUndef();
17378 if (V1IsUndef && V2IsUndef)
17379 return DAG.getUNDEF(VT);
17381 // When we create a shuffle node we put the UNDEF node as the second operand,
17382 // but in some cases the first operand may be transformed to UNDEF.
17383 // In this case we should just commute the node.
17384 if (V1IsUndef)
17385 return DAG.getCommutedVectorShuffle(*SVOp);
17387 // Check for non-undef masks pointing at an undef vector and make the masks
17388 // undef as well. This makes it easier to match the shuffle based solely on
17389 // the mask.
17390 if (V2IsUndef &&
17391 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17392 SmallVector<int, 8> NewMask(OrigMask);
17393 for (int &M : NewMask)
17394 if (M >= NumElements)
17395 M = -1;
17396 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17399 // Check for illegal shuffle mask element index values.
17400 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17401 (void)MaskUpperLimit;
17402 assert(llvm::all_of(OrigMask,
17403 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17404 "Out of bounds shuffle index");
17406 // We actually see shuffles that are entirely re-arrangements of a set of
17407 // zero inputs. This mostly happens while decomposing complex shuffles into
17408 // simple ones. Directly lower these as a buildvector of zeros.
17409 APInt KnownUndef, KnownZero;
17410 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17412 APInt Zeroable = KnownUndef | KnownZero;
17413 if (Zeroable.isAllOnes())
17414 return getZeroVector(VT, Subtarget, DAG, DL);
17416 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17418 // Try to collapse shuffles into using a vector type with fewer elements but
17419 // wider element types. We cap this to not form integers or floating point
17420 // elements wider than 64 bits. It does not seem beneficial to form i128
17421 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
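// For example, the v4i32 mask <0,1,6,7> widens to the v2i64 mask <0,3>.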
17422 SmallVector<int, 16> WidenedMask;
17423 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17424 !canCombineAsMaskOperation(V1, Subtarget) &&
17425 !canCombineAsMaskOperation(V2, Subtarget) &&
17426 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17427 // Shuffle mask widening should not interfere with a broadcast opportunity
17428 // by obfuscating the operands with bitcasts.
17429 // TODO: Avoid lowering directly from this top-level function: make this
17430 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17431 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17432 Subtarget, DAG))
17433 return Broadcast;
17435 MVT NewEltVT = VT.isFloatingPoint()
17436 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17437 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17438 int NewNumElts = NumElements / 2;
17439 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17440 // Make sure that the new vector type is legal. For example, v2f64 isn't
17441 // legal on SSE1.
17442 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17443 if (V2IsZero) {
17444 // Modify the new Mask to take all zeros from the all-zero vector.
17445 // Choose indices that are blend-friendly.
17446 bool UsedZeroVector = false;
17447 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17448 "V2's non-undef elements are used?!");
17449 for (int i = 0; i != NewNumElts; ++i)
17450 if (WidenedMask[i] == SM_SentinelZero) {
17451 WidenedMask[i] = i + NewNumElts;
17452 UsedZeroVector = true;
17454 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17455 // some elements to be undef.
17456 if (UsedZeroVector)
17457 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17459 V1 = DAG.getBitcast(NewVT, V1);
17460 V2 = DAG.getBitcast(NewVT, V2);
17461 return DAG.getBitcast(
17462 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17466 SmallVector<SDValue> Ops = {V1, V2};
17467 SmallVector<int> Mask(OrigMask);
17469 // Canonicalize the shuffle with any horizontal ops inputs.
17470 // NOTE: This may update Ops and Mask.
17471 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17472 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17473 return DAG.getBitcast(VT, HOp);
17475 V1 = DAG.getBitcast(VT, Ops[0]);
17476 V2 = DAG.getBitcast(VT, Ops[1]);
17477 assert(NumElements == (int)Mask.size() &&
17478 "canonicalizeShuffleMaskWithHorizOp "
17479 "shouldn't alter the shuffle mask size");
17481 // Commute the shuffle if it will improve canonicalization.
17482 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17483 ShuffleVectorSDNode::commuteMask(Mask);
17484 std::swap(V1, V2);
17487 // For each vector width, delegate to a specialized lowering routine.
17488 if (VT.is128BitVector())
17489 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17491 if (VT.is256BitVector())
17492 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17494 if (VT.is512BitVector())
17495 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17497 if (Is1BitVector)
17498 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17500 llvm_unreachable("Unimplemented!");
17503 /// Try to lower a VSELECT instruction to a vector shuffle.
17504 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17505 const X86Subtarget &Subtarget,
17506 SelectionDAG &DAG) {
17507 SDValue Cond = Op.getOperand(0);
17508 SDValue LHS = Op.getOperand(1);
17509 SDValue RHS = Op.getOperand(2);
17510 MVT VT = Op.getSimpleValueType();
17512 // Only non-legal VSELECTs reach this lowering; convert those into generic
17513 // shuffles and re-use the shuffle lowering path for blends.
17514 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17515 SmallVector<int, 32> Mask;
17516 if (createShuffleMaskFromVSELECT(Mask, Cond))
17517 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17520 return SDValue();
17523 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17524 SDValue Cond = Op.getOperand(0);
17525 SDValue LHS = Op.getOperand(1);
17526 SDValue RHS = Op.getOperand(2);
17528 SDLoc dl(Op);
17529 MVT VT = Op.getSimpleValueType();
17530 if (isSoftF16(VT, Subtarget)) {
17531 MVT NVT = VT.changeVectorElementTypeToInteger();
17532 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17533 DAG.getBitcast(NVT, LHS),
17534 DAG.getBitcast(NVT, RHS)));
17537 // A vselect where all conditions and data are constants can be optimized into
17538 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17539 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17540 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17541 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17542 return SDValue();
17544 // Try to lower this to a blend-style vector shuffle. This can handle all
17545 // constant condition cases.
17546 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17547 return BlendOp;
17549 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17550 // with patterns on the mask registers on AVX-512.
17551 MVT CondVT = Cond.getSimpleValueType();
17552 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17553 if (CondEltSize == 1)
17554 return Op;
17556 // Variable blends are only legal from SSE4.1 onward.
17557 if (!Subtarget.hasSSE41())
17558 return SDValue();
17560 unsigned EltSize = VT.getScalarSizeInBits();
17561 unsigned NumElts = VT.getVectorNumElements();
17563 // Expand v32i16/v64i8 without BWI.
17564 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17565 return SDValue();
17567 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17568 // into an i1 condition so that we can use the mask-based 512-bit blend
17569 // instructions.
17570 if (VT.getSizeInBits() == 512) {
17571 // Build a mask by testing the condition against zero.
17572 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17573 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17574 DAG.getConstant(0, dl, CondVT),
17575 ISD::SETNE);
17576 // Now return a new VSELECT using the mask.
17577 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17580 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17581 if (CondEltSize != EltSize) {
17582 // If we don't have a sign splat, rely on the expansion.
17583 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17584 return SDValue();
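// E.g. a v8i32 VSELECT whose condition is a v8i16 sign splat has the
// condition sign-extended to v8i32 here.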
17586 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17587 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17588 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17589 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17592 // Only some types will be legal on some subtargets. If we can emit a legal
17593 // VSELECT-matching blend, return Op, but if we need to expand, return
17594 // a null value.
17595 switch (VT.SimpleTy) {
17596 default:
17597 // Most of the vector types have blends past SSE4.1.
17598 return Op;
17600 case MVT::v32i8:
17601 // The byte blends for AVX vectors were introduced only in AVX2.
17602 if (Subtarget.hasAVX2())
17603 return Op;
17605 return SDValue();
17607 case MVT::v8i16:
17608 case MVT::v16i16: {
17609 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17610 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17611 Cond = DAG.getBitcast(CastVT, Cond);
17612 LHS = DAG.getBitcast(CastVT, LHS);
17613 RHS = DAG.getBitcast(CastVT, RHS);
17614 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17615 return DAG.getBitcast(VT, Select);
17620 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17621 MVT VT = Op.getSimpleValueType();
17622 SDValue Vec = Op.getOperand(0);
17623 SDValue Idx = Op.getOperand(1);
17624 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17625 SDLoc dl(Op);
17627 if (!Vec.getSimpleValueType().is128BitVector())
17628 return SDValue();
17630 if (VT.getSizeInBits() == 8) {
17631 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17632 // we're going to zero extend the register or fold the store.
17633 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17634 !X86::mayFoldIntoStore(Op))
17635 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17636 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17637 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17639 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17640 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17641 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17642 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17645 if (VT == MVT::f32) {
17646 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17647 // the result back to an FR32 register. It's only worth matching if the
17648 // result has a single use which is a store or a bitcast to i32. And in
17649 // the case of a store, it's not worth it if the index is a constant 0,
17650 // because a MOVSSmr can be used instead, which is smaller and faster.
17651 if (!Op.hasOneUse())
17652 return SDValue();
17653 SDNode *User = *Op.getNode()->use_begin();
17654 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17655 (User->getOpcode() != ISD::BITCAST ||
17656 User->getValueType(0) != MVT::i32))
17657 return SDValue();
17658 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17659 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17660 return DAG.getBitcast(MVT::f32, Extract);
17663 if (VT == MVT::i32 || VT == MVT::i64)
17664 return Op;
17666 return SDValue();
17669 /// Extract one bit from mask vector, like v16i1 or v8i1.
17670 /// AVX-512 feature.
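// For a constant index, the mask is widened to a legal width, KSHIFTR moves
// the requested bit into element 0, and that element is extracted. A variable
// index instead goes through a sign-extended integer vector.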
17671 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17672 const X86Subtarget &Subtarget) {
17673 SDValue Vec = Op.getOperand(0);
17674 SDLoc dl(Vec);
17675 MVT VecVT = Vec.getSimpleValueType();
17676 SDValue Idx = Op.getOperand(1);
17677 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17678 MVT EltVT = Op.getSimpleValueType();
17680 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17681 "Unexpected vector type in ExtractBitFromMaskVector");
17683 // A variable index can't be handled in mask registers;
17684 // extend the vector to VR512/VR128.
17685 if (!IdxC) {
17686 unsigned NumElts = VecVT.getVectorNumElements();
17687 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17688 // than extending to 128/256-bit.
17689 if (NumElts == 1) {
17690 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17691 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17692 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17694 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17695 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17696 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17697 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17698 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17701 unsigned IdxVal = IdxC->getZExtValue();
17702 if (IdxVal == 0) // the operation is legal
17703 return Op;
17705 // Extend to natively supported kshift.
17706 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17708 // Use kshiftr instruction to move to the lower element.
17709 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17710 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17712 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17713 DAG.getIntPtrConstant(0, dl));
17716 // Helper to find all the extracted elements from a vector.
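// Walks N's users: constant-index extracts demand a single element, bitcasts
// are recursed through with the demanded bits rescaled, and any other user
// conservatively demands all elements.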
17717 static APInt getExtractedDemandedElts(SDNode *N) {
17718 MVT VT = N->getSimpleValueType(0);
17719 unsigned NumElts = VT.getVectorNumElements();
17720 APInt DemandedElts = APInt::getZero(NumElts);
17721 for (SDNode *User : N->uses()) {
17722 switch (User->getOpcode()) {
17723 case X86ISD::PEXTRB:
17724 case X86ISD::PEXTRW:
17725 case ISD::EXTRACT_VECTOR_ELT:
17726 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17727 DemandedElts.setAllBits();
17728 return DemandedElts;
17730 DemandedElts.setBit(User->getConstantOperandVal(1));
17731 break;
17732 case ISD::BITCAST: {
17733 if (!User->getValueType(0).isSimple() ||
17734 !User->getValueType(0).isVector()) {
17735 DemandedElts.setAllBits();
17736 return DemandedElts;
17738 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17739 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17740 break;
17742 default:
17743 DemandedElts.setAllBits();
17744 return DemandedElts;
17747 return DemandedElts;
17750 SDValue
17751 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17752 SelectionDAG &DAG) const {
17753 SDLoc dl(Op);
17754 SDValue Vec = Op.getOperand(0);
17755 MVT VecVT = Vec.getSimpleValueType();
17756 SDValue Idx = Op.getOperand(1);
17757 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17759 if (VecVT.getVectorElementType() == MVT::i1)
17760 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17762 if (!IdxC) {
17763 // It's more profitable to go through memory (1 cycle throughput)
17764 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17765 // The IACA tool was used to get a performance estimate
17766 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17768 // example : extractelement <16 x i8> %a, i32 %i
17770 // Block Throughput: 3.00 Cycles
17771 // Throughput Bottleneck: Port5
17773 // | Num Of | Ports pressure in cycles | |
17774 // | Uops | 0 - DV | 5 | 6 | 7 | |
17775 // ---------------------------------------------
17776 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17777 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17778 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17779 // Total Num Of Uops: 4
17782 // Block Throughput: 1.00 Cycles
17783 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17785 // | | Ports pressure in cycles | |
17786 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
17787 // ---------------------------------------------------------
17788 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
17789 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
17790 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
17791 // Total Num Of Uops: 4
17793 return SDValue();
17796 unsigned IdxVal = IdxC->getZExtValue();
17798 // If this is a 256-bit vector result, first extract the 128-bit vector and
17799 // then extract the element from the 128-bit vector.
17800 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
17801 // Get the 128-bit vector.
17802 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
17803 MVT EltVT = VecVT.getVectorElementType();
17805 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
17806 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
17808 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
17809 // this can be done with a mask.
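// E.g. element 9 of a v16i32 lives in the third 128-bit chunk; once that
// chunk is extracted, the index becomes 9 & 3 == 1.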
17810 IdxVal &= ElemsPerChunk - 1;
17811 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17812 DAG.getIntPtrConstant(IdxVal, dl));
17815 assert(VecVT.is128BitVector() && "Unexpected vector length");
17817 MVT VT = Op.getSimpleValueType();
17819 if (VT == MVT::i16) {
17820 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
17821 // we're going to zero extend the register or fold the store (SSE41 only).
17822 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
17823 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
17824 if (Subtarget.hasFP16())
17825 return Op;
17827 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
17828 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17829 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17832 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
17833 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17834 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17837 if (Subtarget.hasSSE41())
17838 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
17839 return Res;
17841 // Only extract a single element from a v16i8 source - determine the common
17842 // DWORD/WORD that all extractions share, and extract the sub-byte.
17843 // TODO: Add QWORD MOVQ extraction?
17844 if (VT == MVT::i8) {
17845 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
17846 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
17848 // Extract either the lowest i32 or any i16, and extract the sub-byte.
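// E.g. if only bytes 4 and 5 of a v16i8 are ever extracted, byte 5 is taken
// from i16 element 2 and shifted right by 8.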
17849 int DWordIdx = IdxVal / 4;
17850 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
17851 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17852 DAG.getBitcast(MVT::v4i32, Vec),
17853 DAG.getIntPtrConstant(DWordIdx, dl));
17854 int ShiftVal = (IdxVal % 4) * 8;
17855 if (ShiftVal != 0)
17856 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
17857 DAG.getConstant(ShiftVal, dl, MVT::i8));
17858 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17861 int WordIdx = IdxVal / 2;
17862 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
17863 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17864 DAG.getBitcast(MVT::v8i16, Vec),
17865 DAG.getIntPtrConstant(WordIdx, dl));
17866 int ShiftVal = (IdxVal % 2) * 8;
17867 if (ShiftVal != 0)
17868 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17869 DAG.getConstant(ShiftVal, dl, MVT::i8));
17870 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17874 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
17875 if (IdxVal == 0)
17876 return Op;
17878 // Shuffle the element to the lowest element, then movss or movsh.
17879 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
17880 Mask[0] = static_cast<int>(IdxVal);
17881 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17883 DAG.getIntPtrConstant(0, dl));
17886 if (VT.getSizeInBits() == 64) {
17887 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
17888 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
17889 // to match extract_elt for f64.
17890 if (IdxVal == 0)
17891 return Op;
17893 // UNPCKHPD the element to the lowest double word, then movsd.
17894 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
17895 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
17896 int Mask[2] = { 1, -1 };
17897 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17899 DAG.getIntPtrConstant(0, dl));
17902 return SDValue();
17905 /// Insert one bit to mask vector, like v16i1 or v8i1.
17906 /// AVX-512 feature.
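// A constant index is handled by building a v1i1 from the scalar and using
// INSERT_SUBVECTOR; a variable index sign-extends to an integer vector,
// inserts there, and truncates back.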
17907 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
17908 const X86Subtarget &Subtarget) {
17909 SDLoc dl(Op);
17910 SDValue Vec = Op.getOperand(0);
17911 SDValue Elt = Op.getOperand(1);
17912 SDValue Idx = Op.getOperand(2);
17913 MVT VecVT = Vec.getSimpleValueType();
17915 if (!isa<ConstantSDNode>(Idx)) {
17916 // Non-constant index. Extend the source and destination,
17917 // insert element and then truncate the result.
17918 unsigned NumElts = VecVT.getVectorNumElements();
17919 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17920 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17921 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
17922 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
17923 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
17924 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
17927 // Copy into a k-register, extract to v1i1 and insert_subvector.
17928 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
17929 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
17932 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
17933 SelectionDAG &DAG) const {
17934 MVT VT = Op.getSimpleValueType();
17935 MVT EltVT = VT.getVectorElementType();
17936 unsigned NumElts = VT.getVectorNumElements();
17937 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
17939 if (EltVT == MVT::i1)
17940 return InsertBitToMaskVector(Op, DAG, Subtarget);
17942 SDLoc dl(Op);
17943 SDValue N0 = Op.getOperand(0);
17944 SDValue N1 = Op.getOperand(1);
17945 SDValue N2 = Op.getOperand(2);
17946 auto *N2C = dyn_cast<ConstantSDNode>(N2);
17948 if (EltVT == MVT::bf16) {
17949 MVT IVT = VT.changeVectorElementTypeToInteger();
17950 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
17951 DAG.getBitcast(IVT, N0),
17952 DAG.getBitcast(MVT::i16, N1), N2);
17953 return DAG.getBitcast(VT, Res);
17956 if (!N2C) {
17957 // With variable insertion indices we're usually better off spilling to the stack,
17958 // but AVX512 can use a variable compare+select by comparing against all
17959 // possible vector indices, and FP insertion has less gpr->simd traffic.
17960 if (!(Subtarget.hasBWI() ||
17961 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
17962 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
17963 return SDValue();
17965 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
17966 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
17967 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
17968 return SDValue();
17970 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
17971 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
17972 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
17974 SmallVector<SDValue, 16> RawIndices;
17975 for (unsigned I = 0; I != NumElts; ++I)
17976 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
17977 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
17979 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
17980 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
17981 ISD::CondCode::SETEQ);
17984 if (N2C->getAPIntValue().uge(NumElts))
17985 return SDValue();
17986 uint64_t IdxVal = N2C->getZExtValue();
17988 bool IsZeroElt = X86::isZeroNode(N1);
17989 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
17991 if (IsZeroElt || IsAllOnesElt) {
17992 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
17993 // We don't deal with i8 0 since it appears to be handled elsewhere.
17994 if (IsAllOnesElt &&
17995 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
17996 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
17997 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
17998 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
17999 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18000 CstVectorElts[IdxVal] = OnesCst;
18001 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18002 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18004 // See if we can do this more efficiently with a blend shuffle with a
18005 // rematerializable vector.
18006 if (Subtarget.hasSSE41() &&
18007 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18008 SmallVector<int, 8> BlendMask;
18009 for (unsigned i = 0; i != NumElts; ++i)
18010 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18011 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18012 : getOnesVector(VT, DAG, dl);
18013 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18017 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18018 // into that, and then insert the subvector back into the result.
18019 if (VT.is256BitVector() || VT.is512BitVector()) {
18020 // With a 256-bit vector, we can insert into the zero element efficiently
18021 // using a blend if we have AVX or AVX2 and the right data type.
18022 if (VT.is256BitVector() && IdxVal == 0) {
18023 // TODO: It is worthwhile to cast integer to floating point and back
18024 // and incur a domain crossing penalty if that's what we'll end up
18025 // doing anyway after extracting to a 128-bit vector.
18026 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18027 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18028 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18029 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18030 DAG.getTargetConstant(1, dl, MVT::i8));
18034 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18035 assert(isPowerOf2_32(NumEltsIn128) &&
18036 "Vectors will always have power-of-two number of elements.");
18038 // If we are not inserting into the low 128-bit vector chunk,
18039 // then prefer the broadcast+blend sequence.
18040 // FIXME: relax the profitability check iff all N1 uses are insertions.
18041 if (IdxVal >= NumEltsIn128 &&
18042 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18043 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18044 X86::mayFoldLoad(N1, Subtarget)))) {
18045 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18046 SmallVector<int, 8> BlendMask;
18047 for (unsigned i = 0; i != NumElts; ++i)
18048 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18049 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18052 // Get the desired 128-bit vector chunk.
18053 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18055 // Insert the element into the desired chunk.
18056 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18057 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18059 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18060 DAG.getIntPtrConstant(IdxIn128, dl));
18062 // Insert the changed part back into the bigger vector
18063 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18065 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18067 // This will be just movw/movd/movq/movsh/movss/movsd.
18068 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18069 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18070 EltVT == MVT::f16 || EltVT == MVT::i64) {
18071 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18072 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18075 // We can't directly insert an i8 or i16 into a vector, so zero extend
18076 // it to i32 first.
18077 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18078 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18079 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18080 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18081 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18082 return DAG.getBitcast(VT, N1);
18086 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18087 // argument. SSE41 required for pinsrb.
18088 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18089 unsigned Opc;
18090 if (VT == MVT::v8i16) {
18091 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18092 Opc = X86ISD::PINSRW;
18093 } else {
18094 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18095 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18096 Opc = X86ISD::PINSRB;
18099 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18100 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18101 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18102 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18105 if (Subtarget.hasSSE41()) {
18106 if (EltVT == MVT::f32) {
18107 // Bits [7:6] of the constant are the source select. This will always be
18108 // zero here. The DAG Combiner may combine an extract_elt index into
18109 // these bits. For example (insert (extract, 3), 2) could be matched by
18110 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18111 // Bits [5:4] of the constant are the destination select. This is the
18112 // value of the incoming immediate.
18113 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18114 // combine either bitwise AND or insert of float 0.0 to set these bits.
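// E.g. inserting into element 2 uses the immediate (2 << 4) == 0x20: a
// destination select of 2 with an empty zero mask.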
18116 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18117 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18118 // If this is an insertion of 32-bits into the low 32-bits of
18119 // a vector, we prefer to generate a blend with immediate rather
18120 // than an insertps. Blends are simpler operations in hardware and so
18121 // will always have equal or better performance than insertps.
18122 // But if optimizing for size and there's a load folding opportunity,
18123 // generate insertps because blendps does not have a 32-bit memory
18124 // operand form.
18125 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18126 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18127 DAG.getTargetConstant(1, dl, MVT::i8));
18129 // Create this as a scalar to vector.
18130 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18131 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18132 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18135 // PINSR* works with constant index.
18136 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18137 return Op;
18140 return SDValue();
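// Lower SCALAR_TO_VECTOR: a zero scalar becomes a zero vector, wide results
// are built in 128 bits and inserted into an undef vector, and small integer
// elements are any-extended to i32 so the v4i32 pattern can match.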
18143 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18144 SelectionDAG &DAG) {
18145 SDLoc dl(Op);
18146 MVT OpVT = Op.getSimpleValueType();
18148 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18149 // further combines.
18150 if (X86::isZeroNode(Op.getOperand(0)))
18151 return getZeroVector(OpVT, Subtarget, DAG, dl);
18153 // If this is a 256-bit vector result, first insert into a 128-bit
18154 // vector and then insert into the 256-bit vector.
18155 if (!OpVT.is128BitVector()) {
18156 // Insert into a 128-bit vector.
18157 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18158 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18159 OpVT.getVectorNumElements() / SizeFactor);
18161 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18163 // Insert the 128-bit vector.
18164 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18166 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18167 "Expected an SSE type!");
18169 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18170 // tblgen.
18171 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18172 return Op;
18174 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18175 return DAG.getBitcast(
18176 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18179 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18180 // simple superregister reference or explicit instructions to insert
18181 // the upper bits of a vector.
18182 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18183 SelectionDAG &DAG) {
18184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18186 return insert1BitVector(Op, DAG, Subtarget);
18189 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18190 SelectionDAG &DAG) {
18191 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18192 "Only vXi1 extract_subvectors need custom lowering");
18194 SDLoc dl(Op);
18195 SDValue Vec = Op.getOperand(0);
18196 uint64_t IdxVal = Op.getConstantOperandVal(1);
18198 if (IdxVal == 0) // the operation is legal
18199 return Op;
18201 // Extend to natively supported kshift.
18202 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18204 // Shift to the LSB.
18205 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18206 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18209 DAG.getIntPtrConstant(0, dl));
18212 // Returns the appropriate wrapper opcode for a global reference.
18213 unsigned X86TargetLowering::getGlobalWrapperKind(
18214 const GlobalValue *GV, const unsigned char OpFlags) const {
18215 // References to absolute symbols are never PC-relative.
18216 if (GV && GV->isAbsoluteSymbolRef())
18217 return X86ISD::Wrapper;
18219 // The following OpFlags under RIP-rel PIC use RIP.
18220 if (Subtarget.isPICStyleRIPRel() &&
18221 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18222 OpFlags == X86II::MO_DLLIMPORT))
18223 return X86ISD::WrapperRIP;
18225 // In the medium model, functions can always be referenced RIP-relatively,
18226 // since they must be within 2GiB. This is also possible in non-PIC mode, and
18227 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
18228 if (getTargetMachine().getCodeModel() == CodeModel::Medium &&
18229 isa_and_nonnull<Function>(GV))
18230 return X86ISD::WrapperRIP;
18232 // GOTPCREL references must always use RIP.
18233 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18234 return X86ISD::WrapperRIP;
18236 return X86ISD::Wrapper;
18239 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18240 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18241 // one of the above mentioned nodes. It has to be wrapped because otherwise
18242 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18243 // be used to form addressing mode. These wrapped nodes will be selected
18244 // into MOV32ri.
18245 SDValue
18246 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18247 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18249 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18250 // global base reg.
18251 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18253 auto PtrVT = getPointerTy(DAG.getDataLayout());
18254 SDValue Result = DAG.getTargetConstantPool(
18255 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18256 SDLoc DL(CP);
18257 Result =
18258 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18259 // With PIC, the address is actually $g + Offset.
18260 if (OpFlag) {
18261 Result =
18262 DAG.getNode(ISD::ADD, DL, PtrVT,
18263 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18266 return Result;
18269 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18270 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18272 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18273 // global base reg.
18274 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18276 auto PtrVT = getPointerTy(DAG.getDataLayout());
18277 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18278 SDLoc DL(JT);
18279 Result =
18280 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18282 // With PIC, the address is actually $g + Offset.
18283 if (OpFlag)
18284 Result =
18285 DAG.getNode(ISD::ADD, DL, PtrVT,
18286 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18288 return Result;
18291 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18292 SelectionDAG &DAG) const {
18293 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18296 SDValue
18297 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18298 // Create the TargetBlockAddress node.
18299 unsigned char OpFlags =
18300 Subtarget.classifyBlockAddressReference();
18301 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18302 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18303 SDLoc dl(Op);
18304 auto PtrVT = getPointerTy(DAG.getDataLayout());
18305 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18306 Result =
18307 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18309 // With PIC, the address is actually $g + Offset.
18310 if (isGlobalRelativeToPICBase(OpFlags)) {
18311 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18312 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18315 return Result;
18318 /// Creates target global address or external symbol nodes for calls or
18319 /// other uses.
18320 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18321 bool ForCall) const {
18322 // Unpack the global address or external symbol.
18323 const SDLoc &dl = SDLoc(Op);
18324 const GlobalValue *GV = nullptr;
18325 int64_t Offset = 0;
18326 const char *ExternalSym = nullptr;
18327 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18328 GV = G->getGlobal();
18329 Offset = G->getOffset();
18330 } else {
18331 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18332 ExternalSym = ES->getSymbol();
18335 // Calculate some flags for address lowering.
18336 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18337 unsigned char OpFlags;
18338 if (ForCall)
18339 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18340 else
18341 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18342 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18343 bool NeedsLoad = isGlobalStubReference(OpFlags);
18345 CodeModel::Model M = DAG.getTarget().getCodeModel();
18346 auto PtrVT = getPointerTy(DAG.getDataLayout());
18347 SDValue Result;
18349 if (GV) {
18350 // Create a target global address if this is a global. If possible, fold the
18351 // offset into the global address reference. Otherwise, ADD it on later.
18352 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18353 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18354 // relocation will compute to a negative value, which is invalid.
18355 int64_t GlobalOffset = 0;
18356 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18357 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
18358 std::swap(GlobalOffset, Offset);
18360 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18361 } else {
18362 // If this is not a global address, this must be an external symbol.
18363 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18366 // If this is a direct call, avoid the wrapper if we don't need to do any
18367 // loads or adds. This allows SDAG ISel to match direct calls.
18368 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18369 return Result;
18371 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18373 // With PIC, the address is actually $g + Offset.
18374 if (HasPICReg) {
18375 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18376 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18379 // For globals that require a load from a stub to get the address, emit the
18380 // load.
18381 if (NeedsLoad)
18382 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18383 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18385 // If there was a non-zero offset that we didn't fold, create an explicit
18386 // addition for it.
18387 if (Offset != 0)
18388 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18389 DAG.getConstant(Offset, dl, PtrVT));
18391 return Result;
18394 SDValue
18395 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18396 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
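// Emit an X86ISD::TLSADDR (or TLSBASEADDR for local-dynamic) node for GA and
// return the value that the TLS runtime call leaves in ReturnReg.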
18399 static SDValue
18400 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18401 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18402 unsigned char OperandFlags, bool LocalDynamic = false) {
18403 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18404 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18405 SDLoc dl(GA);
18406 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18407 GA->getValueType(0),
18408 GA->getOffset(),
18409 OperandFlags);
18411 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
18412 : X86ISD::TLSADDR;
18414 if (InGlue) {
18415 SDValue Ops[] = { Chain, TGA, *InGlue };
18416 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18417 } else {
18418 SDValue Ops[] = { Chain, TGA };
18419 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18422 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18423 MFI.setAdjustsStack(true);
18424 MFI.setHasCalls(true);
18426 SDValue Glue = Chain.getValue(1);
18427 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18430 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
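// The TLSADDR node built above is eventually expanded into the standard
// __tls_get_addr call sequence. Roughly, for the 32-bit general-dynamic case
// (illustrative only; the exact encoding is chosen during MC lowering):
//   leal x@tlsgd(,%ebx,1), %eax
//   call ___tls_get_addr@PLT      // address of x is returned in %eax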
18431 static SDValue
18432 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18433 const EVT PtrVT) {
18434 SDValue InGlue;
18435 SDLoc dl(GA); // ? function entry point might be better
18436 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18437 DAG.getNode(X86ISD::GlobalBaseReg,
18438 SDLoc(), PtrVT), InGlue);
18439 InGlue = Chain.getValue(1);
18441 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18444 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18445 static SDValue
18446 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18447 const EVT PtrVT) {
18448 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18449 X86::RAX, X86II::MO_TLSGD);
18452 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18453 static SDValue
18454 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18455 const EVT PtrVT) {
18456 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18457 X86::EAX, X86II::MO_TLSGD);
18460 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18461 SelectionDAG &DAG, const EVT PtrVT,
18462 bool Is64Bit, bool Is64BitLP64) {
18463 SDLoc dl(GA);
18465 // Get the start address of the TLS block for this module.
18466 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18467 .getInfo<X86MachineFunctionInfo>();
18468 MFI->incNumLocalDynamicTLSAccesses();
18470 SDValue Base;
18471 if (Is64Bit) {
18472 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18473 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18474 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18475 } else {
18476 SDValue InGlue;
18477 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18478 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18479 InGlue = Chain.getValue(1);
18480 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18481 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18484 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18485 // of Base.
18487 // Build x@dtpoff.
18488 unsigned char OperandFlags = X86II::MO_DTPOFF;
18489 unsigned WrapperKind = X86ISD::Wrapper;
18490 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18491 GA->getValueType(0),
18492 GA->getOffset(), OperandFlags);
18493 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18495 // Add x@dtpoff with the base.
18496 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18499 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18500 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18501 const EVT PtrVT, TLSModel::Model model,
18502 bool is64Bit, bool isPIC) {
18503 SDLoc dl(GA);
18505 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18506 Value *Ptr = Constant::getNullValue(
18507 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18509 SDValue ThreadPointer =
18510 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18511 MachinePointerInfo(Ptr));
18513 unsigned char OperandFlags = 0;
18514 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18515 // initial exec.
18516 unsigned WrapperKind = X86ISD::Wrapper;
18517 if (model == TLSModel::LocalExec) {
18518 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18519 } else if (model == TLSModel::InitialExec) {
18520 if (is64Bit) {
18521 OperandFlags = X86II::MO_GOTTPOFF;
18522 WrapperKind = X86ISD::WrapperRIP;
18523 } else {
18524 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18526 } else {
18527 llvm_unreachable("Unexpected model");
18530 // emit "addl x@ntpoff,%eax" (local exec)
18531 // or "addl x@indntpoff,%eax" (initial exec)
18532 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18533 SDValue TGA =
18534 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18535 GA->getOffset(), OperandFlags);
18536 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18538 if (model == TLSModel::InitialExec) {
18539 if (isPIC && !is64Bit) {
18540 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18541 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18542 Offset);
18545 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18546 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18549 // The address of the thread local variable is the add of the thread
18550 // pointer with the offset of the variable.
18551 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18554 SDValue
18555 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18557 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18559 if (DAG.getTarget().useEmulatedTLS())
18560 return LowerToTLSEmulatedModel(GA, DAG);
18562 const GlobalValue *GV = GA->getGlobal();
18563 auto PtrVT = getPointerTy(DAG.getDataLayout());
18564 bool PositionIndependent = isPositionIndependent();
18566 if (Subtarget.isTargetELF()) {
18567 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18568 switch (model) {
18569 case TLSModel::GeneralDynamic:
18570 if (Subtarget.is64Bit()) {
18571 if (Subtarget.isTarget64BitLP64())
18572 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18573 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18575 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18576 case TLSModel::LocalDynamic:
18577 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18578 Subtarget.isTarget64BitLP64());
18579 case TLSModel::InitialExec:
18580 case TLSModel::LocalExec:
18581 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18582 PositionIndependent);
18584 llvm_unreachable("Unknown TLS model.");
18587 if (Subtarget.isTargetDarwin()) {
18588 // Darwin only has one model of TLS. Lower to that.
18589 unsigned char OpFlag = 0;
18590 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
18591 X86ISD::WrapperRIP : X86ISD::Wrapper;
18593 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18594 // global base reg.
18595 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18596 if (PIC32)
18597 OpFlag = X86II::MO_TLVP_PIC_BASE;
18598 else
18599 OpFlag = X86II::MO_TLVP;
18600 SDLoc DL(Op);
18601 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18602 GA->getValueType(0),
18603 GA->getOffset(), OpFlag);
18604 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18606 // With PIC32, the address is actually $g + Offset.
18607 if (PIC32)
18608 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18609 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18610 Offset);
18612 // Lowering the machine isd will make sure everything is in the right
18613 // location.
18614 SDValue Chain = DAG.getEntryNode();
18615 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18616 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18617 SDValue Args[] = { Chain, Offset };
18618 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18619 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18621 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
18622 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18623 MFI.setAdjustsStack(true);
18625 // And our return value (tls address) is in the standard call return value
18626 // location.
18627 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18628 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18631 if (Subtarget.isOSWindows()) {
18632 // Just use the implicit TLS architecture
18633 // Need to generate something similar to:
18634 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18635 // ; from TEB
18636 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
18637 // mov rcx, qword [rdx+rcx*8]
18638 // mov eax, .tls$:tlsvar
18639 // [rax+rcx] contains the address
18640 // Windows 64bit: gs:0x58
18641 // Windows 32bit: fs:__tls_array
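// In C-like terms, the sequence above computes (illustrative sketch only,
// not the actual lowering):
//   char **TlsArray = (char **)__readgsqword(0x58); // fs:[__tls_array] on x86
//   char *TlsBase   = TlsArray[_tls_index];         // this module's TLS block
//   result          = TlsBase + SECREL32(tlsvar);   // section-relative offset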
18643 SDLoc dl(GA);
18644 SDValue Chain = DAG.getEntryNode();
18646 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18647 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18648 // use its literal value of 0x2C.
18649 Value *Ptr = Constant::getNullValue(
18650 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18651 : PointerType::get(*DAG.getContext(), 257));
18653 SDValue TlsArray = Subtarget.is64Bit()
18654 ? DAG.getIntPtrConstant(0x58, dl)
18655 : (Subtarget.isTargetWindowsGNU()
18656 ? DAG.getIntPtrConstant(0x2C, dl)
18657 : DAG.getExternalSymbol("_tls_array", PtrVT));
18659 SDValue ThreadPointer =
18660 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18662 SDValue res;
18663 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18664 res = ThreadPointer;
18665 } else {
18666 // Load the _tls_index variable
18667 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18668 if (Subtarget.is64Bit())
18669 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18670 MachinePointerInfo(), MVT::i32);
18671 else
18672 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18674 const DataLayout &DL = DAG.getDataLayout();
18675 SDValue Scale =
18676 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18677 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18679 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18682 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18684 // Get the offset of start of .tls section
18685 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18686 GA->getValueType(0),
18687 GA->getOffset(), X86II::MO_SECREL);
18688 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18690 // The address of the thread local variable is the add of the thread
18691 // pointer with the offset of the variable.
18692 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18695 llvm_unreachable("TLS not implemented for this target.");
18698 /// Lower SRA_PARTS and friends, which return two i32 values
18699 /// and take a 2 x i32 value to shift plus a shift amount.
18700 /// TODO: Can this be moved to general expansion code?
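/// Conceptually, for SRA_PARTS of a 64-bit value (Hi:Lo) by Amt (illustrative
/// sketch only; expandShiftParts emits a branchless, select-based form):
///   if (Amt & 32) { Lo = (int32_t)Hi >> (Amt & 31); Hi = (int32_t)Hi >> 31; }
///   else if (Amt) { Lo = (Lo >> Amt) | (Hi << (32 - Amt)); Hi = (int32_t)Hi >> Amt; }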
18701 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18702 SDValue Lo, Hi;
18703 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18704 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18707 // Try to use a packed vector operation to handle i64 on 32-bit targets when
18708 // AVX512DQ is enabled.
18709 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
18710 const X86Subtarget &Subtarget) {
18711 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18712 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18713 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18714 Op.getOpcode() == ISD::UINT_TO_FP) &&
18715 "Unexpected opcode!");
18716 bool IsStrict = Op->isStrictFPOpcode();
18717 unsigned OpNo = IsStrict ? 1 : 0;
18718 SDValue Src = Op.getOperand(OpNo);
18719 MVT SrcVT = Src.getSimpleValueType();
18720 MVT VT = Op.getSimpleValueType();
18722 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18723 (VT != MVT::f32 && VT != MVT::f64))
18724 return SDValue();
18726 // Pack the i64 into a vector, do the operation and extract.
18728 // Using 256-bit to ensure result is 128-bits for f32 case.
18729 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18730 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18731 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18733 SDLoc dl(Op);
18734 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18735 if (IsStrict) {
18736 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
18737 {Op.getOperand(0), InVec});
18738 SDValue Chain = CvtVec.getValue(1);
18739 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18740 DAG.getIntPtrConstant(0, dl));
18741 return DAG.getMergeValues({Value, Chain}, dl);
18744 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18746 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18747 DAG.getIntPtrConstant(0, dl));
18750 // Try to use a packed vector operation to handle i64 on 32-bit targets.
18751 static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
18752 const X86Subtarget &Subtarget) {
18753 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18754 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18755 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18756 Op.getOpcode() == ISD::UINT_TO_FP) &&
18757 "Unexpected opcode!");
18758 bool IsStrict = Op->isStrictFPOpcode();
18759 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
18760 MVT SrcVT = Src.getSimpleValueType();
18761 MVT VT = Op.getSimpleValueType();
18763 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
18764 return SDValue();
18766 // Pack the i64 into a vector, do the operation and extract.
18768 assert(Subtarget.hasFP16() && "Expected FP16");
18770 SDLoc dl(Op);
18771 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
18772 if (IsStrict) {
18773 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
18774 {Op.getOperand(0), InVec});
18775 SDValue Chain = CvtVec.getValue(1);
18776 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18777 DAG.getIntPtrConstant(0, dl));
18778 return DAG.getMergeValues({Value, Chain}, dl);
18781 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
18783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18784 DAG.getIntPtrConstant(0, dl));
18787 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
18788 const X86Subtarget &Subtarget) {
18789 switch (Opcode) {
18790 case ISD::SINT_TO_FP:
18791 // TODO: Handle wider types with AVX/AVX512.
18792 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
18793 return false;
18794 // CVTDQ2PS or (V)CVTDQ2PD
18795 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
18797 case ISD::UINT_TO_FP:
18798 // TODO: Handle wider types and i64 elements.
18799 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
18800 return false;
18801 // VCVTUDQ2PS or VCVTUDQ2PD
18802 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
18804 default:
18805 return false;
18809 /// Given a scalar cast operation that is extracted from a vector, try to
18810 /// vectorize the cast op followed by extraction. This will avoid an expensive
18811 /// round-trip between XMM and GPR.
18812 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
18813 const X86Subtarget &Subtarget) {
18814 // TODO: This could be enhanced to handle smaller integer types by peeking
18815 // through an extend.
18816 SDValue Extract = Cast.getOperand(0);
18817 MVT DestVT = Cast.getSimpleValueType();
18818 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18819 !isa<ConstantSDNode>(Extract.getOperand(1)))
18820 return SDValue();
18822 // See if we have a 128-bit vector cast op for this type of cast.
18823 SDValue VecOp = Extract.getOperand(0);
18824 MVT FromVT = VecOp.getSimpleValueType();
18825 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
18826 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
18827 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
18828 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
18829 return SDValue();
18831 // If we are extracting from a non-zero element, first shuffle the source
18832 // vector to allow extracting from element zero.
18833 SDLoc DL(Cast);
18834 if (!isNullConstant(Extract.getOperand(1))) {
18835 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
18836 Mask[0] = Extract.getConstantOperandVal(1);
18837 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
18839 // If the source vector is wider than 128 bits, extract the low part. Do not
18840 // create an unnecessarily wide vector cast op.
18841 if (FromVT != Vec128VT)
18842 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
18844 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
18845 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
18846 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
18847 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
18848 DAG.getIntPtrConstant(0, DL));
18851 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
18852 /// try to vectorize the cast ops. This will avoid an expensive round-trip
18853 /// between XMM and GPR.
18854 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
18855 const X86Subtarget &Subtarget) {
18856 // TODO: Allow FP_TO_UINT.
18857 SDValue CastToInt = CastToFP.getOperand(0);
18858 MVT VT = CastToFP.getSimpleValueType();
18859 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
18860 return SDValue();
18862 MVT IntVT = CastToInt.getSimpleValueType();
18863 SDValue X = CastToInt.getOperand(0);
18864 MVT SrcVT = X.getSimpleValueType();
18865 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
18866 return SDValue();
18868 // See if we have 128-bit vector cast instructions for this type of cast.
18869 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
18870 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
18871 IntVT != MVT::i32)
18872 return SDValue();
18874 unsigned SrcSize = SrcVT.getSizeInBits();
18875 unsigned IntSize = IntVT.getSizeInBits();
18876 unsigned VTSize = VT.getSizeInBits();
18877 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
18878 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
18879 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
18881 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
18882 unsigned ToIntOpcode =
18883 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
18884 unsigned ToFPOpcode =
18885 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
18887 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
18889 // We are not defining the high elements (for example, zeroing them) because
18890 // that could nullify any performance advantage that we hoped to gain from
18891 // this vector op hack. We do not expect any adverse effects (like denorm
18892 // penalties) with cast ops.
18893 SDLoc DL(CastToFP);
18894 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
18895 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
18896 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
18897 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
18898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
18901 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
18902 const X86Subtarget &Subtarget) {
18903 SDLoc DL(Op);
18904 bool IsStrict = Op->isStrictFPOpcode();
18905 MVT VT = Op->getSimpleValueType(0);
18906 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
18908 if (Subtarget.hasDQI()) {
18909 assert(!Subtarget.hasVLX() && "Unexpected features");
18911 assert((Src.getSimpleValueType() == MVT::v2i64 ||
18912 Src.getSimpleValueType() == MVT::v4i64) &&
18913 "Unsupported custom type");
18915 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
18916 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
18917 "Unexpected VT!");
18918 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
18920 // Need to concat with zero vector for strict fp to avoid spurious
18921 // exceptions.
18922 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
18923 : DAG.getUNDEF(MVT::v8i64);
18924 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
18925 DAG.getIntPtrConstant(0, DL));
18926 SDValue Res, Chain;
18927 if (IsStrict) {
18928 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
18929 {Op->getOperand(0), Src});
18930 Chain = Res.getValue(1);
18931 } else {
18932 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
18935 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18936 DAG.getIntPtrConstant(0, DL));
18938 if (IsStrict)
18939 return DAG.getMergeValues({Res, Chain}, DL);
18940 return Res;
18943 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
18944 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
18945 if (VT != MVT::v4f32 || IsSigned)
18946 return SDValue();
18948 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
18949 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
18950 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
18951 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
18952 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
18953 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
18954 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
18955 SmallVector<SDValue, 4> SignCvts(4);
18956 SmallVector<SDValue, 4> Chains(4);
18957 for (int i = 0; i != 4; ++i) {
18958 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
18959 DAG.getIntPtrConstant(i, DL));
18960 if (IsStrict) {
18961 SignCvts[i] =
18962 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
18963 {Op.getOperand(0), Elt});
18964 Chains[i] = SignCvts[i].getValue(1);
18965 } else {
18966 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
18969 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
18971 SDValue Slow, Chain;
18972 if (IsStrict) {
18973 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18974 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
18975 {Chain, SignCvt, SignCvt});
18976 Chain = Slow.getValue(1);
18977 } else {
18978 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
18981 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
18982 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
18984 if (IsStrict)
18985 return DAG.getMergeValues({Cvt, Chain}, DL);
18987 return Cvt;
18990 static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
18991 bool IsStrict = Op->isStrictFPOpcode();
18992 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
18993 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
18994 MVT VT = Op.getSimpleValueType();
18995 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
18996 SDLoc dl(Op);
18998 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
18999 if (IsStrict)
19000 return DAG.getNode(
19001 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19002 {Chain,
19003 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19004 Rnd});
19005 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19006 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19009 static bool isLegalConversion(MVT VT, bool IsSigned,
19010 const X86Subtarget &Subtarget) {
19011 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19012 return true;
19013 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19014 return true;
19015 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19016 return true;
19017 if (Subtarget.useAVX512Regs()) {
19018 if (VT == MVT::v16i32)
19019 return true;
19020 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19021 return true;
19023 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19024 (VT == MVT::v2i64 || VT == MVT::v4i64))
19025 return true;
19026 return false;
19029 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19030 SelectionDAG &DAG) const {
19031 bool IsStrict = Op->isStrictFPOpcode();
19032 unsigned OpNo = IsStrict ? 1 : 0;
19033 SDValue Src = Op.getOperand(OpNo);
19034 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19035 MVT SrcVT = Src.getSimpleValueType();
19036 MVT VT = Op.getSimpleValueType();
19037 SDLoc dl(Op);
19039 if (isSoftF16(VT, Subtarget))
19040 return promoteXINT_TO_FP(Op, DAG);
19041 else if (isLegalConversion(SrcVT, true, Subtarget))
19042 return Op;
19044 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19045 return LowerWin64_INT128_TO_FP(Op, DAG);
19047 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19048 return Extract;
19050 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
19051 return R;
19053 if (SrcVT.isVector()) {
19054 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19055 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19056 // source for strict FP.
19057 if (IsStrict)
19058 return DAG.getNode(
19059 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19060 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19061 DAG.getUNDEF(SrcVT))});
19062 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19063 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19064 DAG.getUNDEF(SrcVT)));
19066 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19067 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19069 return SDValue();
19072 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19073 "Unknown SINT_TO_FP to lower!");
19075 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19077 // These are really Legal; return the operand so the caller accepts it as
19078 // Legal.
19079 if (SrcVT == MVT::i32 && UseSSEReg)
19080 return Op;
19081 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19082 return Op;
19084 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19085 return V;
19086 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
19087 return V;
19089 // SSE doesn't have an i16 conversion so we need to promote.
19090 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19091 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19092 if (IsStrict)
19093 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19094 {Chain, Ext});
19096 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19099 if (VT == MVT::f128 || !Subtarget.hasX87())
19100 return SDValue();
19102 SDValue ValueToStore = Src;
19103 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19104 // Bitcasting to f64 here allows us to do a single 64-bit store from
19105 // an SSE register, avoiding the store forwarding penalty that would come
19106 // with two 32-bit stores.
19107 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19109 unsigned Size = SrcVT.getStoreSize();
19110 Align Alignment(Size);
19111 MachineFunction &MF = DAG.getMachineFunction();
19112 auto PtrVT = getPointerTy(MF.getDataLayout());
19113 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19114 MachinePointerInfo MPI =
19115 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19116 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19117 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19118 std::pair<SDValue, SDValue> Tmp =
19119 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19121 if (IsStrict)
19122 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19124 return Tmp.first;
19127 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19128 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19129 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19130 // Build the FILD
19131 SDVTList Tys;
19132 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19133 if (useSSE)
19134 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19135 else
19136 Tys = DAG.getVTList(DstVT, MVT::Other);
19138 SDValue FILDOps[] = {Chain, Pointer};
19139 SDValue Result =
19140 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19141 Alignment, MachineMemOperand::MOLoad);
19142 Chain = Result.getValue(1);
19144 if (useSSE) {
19145 MachineFunction &MF = DAG.getMachineFunction();
19146 unsigned SSFISize = DstVT.getStoreSize();
19147 int SSFI =
19148 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19149 auto PtrVT = getPointerTy(MF.getDataLayout());
19150 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19151 Tys = DAG.getVTList(MVT::Other);
19152 SDValue FSTOps[] = {Chain, Result, StackSlot};
19153 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19154 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19155 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19157 Chain =
19158 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19159 Result = DAG.getLoad(
19160 DstVT, DL, Chain, StackSlot,
19161 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19162 Chain = Result.getValue(1);
19165 return { Result, Chain };
19168 /// Horizontal vector math instructions may be slower than normal math with
19169 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19170 /// implementation, and likely shuffle complexity of the alternate sequence.
19171 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19172 const X86Subtarget &Subtarget) {
19173 bool IsOptimizingSize = DAG.shouldOptForSize();
19174 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19175 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19178 /// 64-bit unsigned integer to double expansion.
19179 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
19180 const X86Subtarget &Subtarget) {
19181 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19182 // when converting 0 while rounding toward negative infinity. The caller will
19183 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19184 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19185 // This algorithm is not obvious. Here is what we're trying to output:
19186 /*
19187 movq %rax, %xmm0
19188 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19189 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19190 #ifdef __SSE3__
19191 haddpd %xmm0, %xmm0
19192 #else
19193 pshufd $0x4e, %xmm0, %xmm1
19194 addpd %xmm1, %xmm0
19195 #endif
19196 */
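// Illustrative arithmetic behind the constants (not part of the lowering):
// 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84,
// so after the unpack the two lanes hold the doubles 2^52 + lo32(x) and
// 2^84 + hi32(x)*2^32. Subtracting c1 = { 2^52, 2^84 } leaves lo32(x) and
// hi32(x)*2^32 exactly, and the horizontal add recombines them, e.g.
// x = 0x100000002 -> 2.0 + 4294967296.0 = 4294967298.0.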
19198 SDLoc dl(Op);
19199 LLVMContext *Context = DAG.getContext();
19201 // Build some magic constants.
19202 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19203 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19204 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19205 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19207 SmallVector<Constant*,2> CV1;
19208 CV1.push_back(
19209 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19210 APInt(64, 0x4330000000000000ULL))));
19211 CV1.push_back(
19212 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19213 APInt(64, 0x4530000000000000ULL))));
19214 Constant *C1 = ConstantVector::get(CV1);
19215 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19217 // Load the 64-bit value into an XMM register.
19218 SDValue XR1 =
19219 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19220 SDValue CLod0 = DAG.getLoad(
19221 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19222 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19223 SDValue Unpck1 =
19224 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19226 SDValue CLod1 = DAG.getLoad(
19227 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19228 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19229 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19230 // TODO: Are there any fast-math-flags to propagate here?
19231 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19232 SDValue Result;
19234 if (Subtarget.hasSSE3() &&
19235 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19236 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19237 } else {
19238 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19239 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19241 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19242 DAG.getIntPtrConstant(0, dl));
19243 return Result;
19246 /// 32-bit unsigned integer to float expansion.
19247 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
19248 const X86Subtarget &Subtarget) {
19249 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19250 SDLoc dl(Op);
19251 // FP constant to bias correct the final result.
19252 SDValue Bias = DAG.getConstantFP(
19253 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19255 // Load the 32-bit value into an XMM register.
19256 SDValue Load =
19257 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19259 // Zero out the upper parts of the register.
19260 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19262 // Or the load with the bias.
19263 SDValue Or = DAG.getNode(
19264 ISD::OR, dl, MVT::v2i64,
19265 DAG.getBitcast(MVT::v2i64, Load),
19266 DAG.getBitcast(MVT::v2i64,
19267 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19268 Or =
19269 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19270 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19272 if (Op.getNode()->isStrictFPOpcode()) {
19273 // Subtract the bias.
19274 // TODO: Are there any fast-math-flags to propagate here?
19275 SDValue Chain = Op.getOperand(0);
19276 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19277 {Chain, Or, Bias});
19279 if (Op.getValueType() == Sub.getValueType())
19280 return Sub;
19282 // Handle final rounding.
19283 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19284 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19286 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19289 // Subtract the bias.
19290 // TODO: Are there any fast-math-flags to propagate here?
19291 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19293 // Handle final rounding.
19294 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19297 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
19298 const X86Subtarget &Subtarget,
19299 const SDLoc &DL) {
19300 if (Op.getSimpleValueType() != MVT::v2f64)
19301 return SDValue();
19303 bool IsStrict = Op->isStrictFPOpcode();
19305 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19306 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19308 if (Subtarget.hasAVX512()) {
19309 if (!Subtarget.hasVLX()) {
19310 // Let generic type legalization widen this.
19311 if (!IsStrict)
19312 return SDValue();
19313 // Otherwise pad the integer input with 0s and widen the operation.
19314 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19315 DAG.getConstant(0, DL, MVT::v2i32));
19316 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19317 {Op.getOperand(0), N0});
19318 SDValue Chain = Res.getValue(1);
19319 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19320 DAG.getIntPtrConstant(0, DL));
19321 return DAG.getMergeValues({Res, Chain}, DL);
19324 // Legalize to v4i32 type.
19325 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19326 DAG.getUNDEF(MVT::v2i32));
19327 if (IsStrict)
19328 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19329 {Op.getOperand(0), N0});
19330 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19333 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19334 // This gives us the floating point equivalent of 2^52 + the i32 integer
19335 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19336 // point leaving just our i32 integers in double format.
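// For example (illustrative only): with an input lane of 7, the OR produces
// the bit pattern 0x4330000000000007, i.e. the double 2^52 + 7 =
// 4503599627370503.0, and subtracting the 2^52 bias leaves exactly 7.0.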
19337 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19338 SDValue VBias = DAG.getConstantFP(
19339 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19340 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19341 DAG.getBitcast(MVT::v2i64, VBias));
19342 Or = DAG.getBitcast(MVT::v2f64, Or);
19344 if (IsStrict)
19345 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19346 {Op.getOperand(0), Or, VBias});
19347 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19350 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
19351 const X86Subtarget &Subtarget) {
19352 SDLoc DL(Op);
19353 bool IsStrict = Op->isStrictFPOpcode();
19354 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19355 MVT VecIntVT = V.getSimpleValueType();
19356 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19357 "Unsupported custom type");
19359 if (Subtarget.hasAVX512()) {
19360 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
19361 assert(!Subtarget.hasVLX() && "Unexpected features");
19362 MVT VT = Op->getSimpleValueType(0);
19364 // v8i32->v8f64 is legal with AVX512 so just return it.
19365 if (VT == MVT::v8f64)
19366 return Op;
19368 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19369 "Unexpected VT!");
19370 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19371 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19372 // Need to concat with zero vector for strict fp to avoid spurious
19373 // exceptions.
19374 SDValue Tmp =
19375 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19376 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19377 DAG.getIntPtrConstant(0, DL));
19378 SDValue Res, Chain;
19379 if (IsStrict) {
19380 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19381 {Op->getOperand(0), V});
19382 Chain = Res.getValue(1);
19383 } else {
19384 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19387 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19388 DAG.getIntPtrConstant(0, DL));
19390 if (IsStrict)
19391 return DAG.getMergeValues({Res, Chain}, DL);
19392 return Res;
19395 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19396 Op->getSimpleValueType(0) == MVT::v4f64) {
19397 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19398 Constant *Bias = ConstantFP::get(
19399 *DAG.getContext(),
19400 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19401 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19402 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19403 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19404 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19405 SDValue VBias = DAG.getMemIntrinsicNode(
19406 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19407 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
19408 MachineMemOperand::MOLoad);
19410 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19411 DAG.getBitcast(MVT::v4i64, VBias));
19412 Or = DAG.getBitcast(MVT::v4f64, Or);
19414 if (IsStrict)
19415 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19416 {Op.getOperand(0), Or, VBias});
19417 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19420 // The algorithm is the following:
19421 // #ifdef __SSE4_1__
19422 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19423 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19424 // (uint4) 0x53000000, 0xaa);
19425 // #else
19426 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19427 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19428 // #endif
19429 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19430 // return (float4) lo + fhi;
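// Illustrative reading of the constants (not part of the lowering):
// 0x4b000000 is the float 2^23 and 0x53000000 is the float 2^39, so after the
// blend/or steps lo == 2^23 + (v & 0xffff) and hi == 2^39 + (v >> 16) * 2^16
// bit-for-bit. Then fhi == (v >> 16) * 2^16 - 2^23 and lo + fhi == v, with only
// the final add possibly rounding (0x53000080 below encodes 2^39 + 2^23).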
19432 bool Is128 = VecIntVT == MVT::v4i32;
19433 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19434 // If we convert to something else than the supported type, e.g., to v4f64,
19435 // abort early.
19436 if (VecFloatVT != Op->getSimpleValueType(0))
19437 return SDValue();
19439 // In the #ifdef/#else code, we have in common:
19440 // - The vector of constants:
19441 // -- 0x4b000000
19442 // -- 0x53000000
19443 // - A shift:
19444 // -- v >> 16
19446 // Create the splat vector for 0x4b000000.
19447 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19448 // Create the splat vector for 0x53000000.
19449 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19451 // Create the right shift.
19452 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19453 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19455 SDValue Low, High;
19456 if (Subtarget.hasSSE41()) {
19457 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19458 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19459 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19460 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19461 // Low will be bitcasted right away, so do not bother bitcasting back to its
19462 // original type.
19463 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19464 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19465 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19466 // (uint4) 0x53000000, 0xaa);
19467 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19468 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19469 // High will be bitcasted right away, so do not bother bitcasting back to
19470 // its original type.
19471 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19472 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19473 } else {
19474 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19475 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19476 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19477 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19479 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19480 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19483 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19484 SDValue VecCstFSub = DAG.getConstantFP(
19485 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19487 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19488 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19489 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19490 // enabled. See PR24512.
19491 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19492 // TODO: Are there any fast-math-flags to propagate here?
19493 // (float4) lo;
19494 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19495 // return (float4) lo + fhi;
19496 if (IsStrict) {
19497 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19498 {Op.getOperand(0), HighBitcast, VecCstFSub});
19499 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19500 {FHigh.getValue(1), LowBitcast, FHigh});
19503 SDValue FHigh =
19504 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19505 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19508 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
19509 const X86Subtarget &Subtarget) {
19510 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19511 SDValue N0 = Op.getOperand(OpNo);
19512 MVT SrcVT = N0.getSimpleValueType();
19513 SDLoc dl(Op);
19515 switch (SrcVT.SimpleTy) {
19516 default:
19517 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19518 case MVT::v2i32:
19519 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
19520 case MVT::v4i32:
19521 case MVT::v8i32:
19522 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
19523 case MVT::v2i64:
19524 case MVT::v4i64:
19525 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19529 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19530 SelectionDAG &DAG) const {
19531 bool IsStrict = Op->isStrictFPOpcode();
19532 unsigned OpNo = IsStrict ? 1 : 0;
19533 SDValue Src = Op.getOperand(OpNo);
19534 SDLoc dl(Op);
19535 auto PtrVT = getPointerTy(DAG.getDataLayout());
19536 MVT SrcVT = Src.getSimpleValueType();
19537 MVT DstVT = Op->getSimpleValueType(0);
19538 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19540 // Bail out when we don't have native conversion instructions.
19541 if (DstVT == MVT::f128)
19542 return SDValue();
19544 if (isSoftF16(DstVT, Subtarget))
19545 return promoteXINT_TO_FP(Op, DAG);
19546 else if (isLegalConversion(SrcVT, false, Subtarget))
19547 return Op;
19549 if (DstVT.isVector())
19550 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
19552 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19553 return LowerWin64_INT128_TO_FP(Op, DAG);
19555 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19556 return Extract;
19558 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19559 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19560 // Conversions from unsigned i32 to f32/f64 are legal,
19561 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19562 return Op;
19565 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19566 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19567 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19568 if (IsStrict)
19569 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19570 {Chain, Src});
19571 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19574 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19575 return V;
19576 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
19577 return V;
19579 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19580 // infinity. It produces -0.0, so disable under strictfp.
19581 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19582 !IsStrict)
19583 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
19584 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19585 // negative infinity, so disable it under strictfp and use FILD instead.
19586 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19587 !IsStrict)
19588 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
19589 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19590 (DstVT == MVT::f32 || DstVT == MVT::f64))
19591 return SDValue();
19593 // Make a 64-bit buffer, and use it to build an FILD.
19594 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19595 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19596 Align SlotAlign(8);
19597 MachinePointerInfo MPI =
19598 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19599 if (SrcVT == MVT::i32) {
19600 SDValue OffsetSlot =
19601 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
19602 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19603 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19604 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19605 std::pair<SDValue, SDValue> Tmp =
19606 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19607 if (IsStrict)
19608 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19610 return Tmp.first;
19613 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19614 SDValue ValueToStore = Src;
19615 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19616 // Bitcasting to f64 here allows us to do a single 64-bit store from
19617 // an SSE register, avoiding the store forwarding penalty that would come
19618 // with two 32-bit stores.
19619 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19621 SDValue Store =
19622 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19623 // For i64 source, we need to add the appropriate power of 2 if the input
19624 // was negative. We must be careful to do the computation in x87 extended
19625 // precision, not in SSE.
19626 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19627 SDValue Ops[] = { Store, StackSlot };
19628 SDValue Fild =
19629 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19630 SlotAlign, MachineMemOperand::MOLoad);
19631 Chain = Fild.getValue(1);
19634 // Check whether the sign bit is set.
19635 SDValue SignSet = DAG.getSetCC(
19636 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19637 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19639 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19640 APInt FF(64, 0x5F80000000000000ULL);
19641 SDValue FudgePtr = DAG.getConstantPool(
19642 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19643 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19645 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19646 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19647 SDValue Four = DAG.getIntPtrConstant(4, dl);
19648 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19649 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19651 // Load the value out, extending it from f32 to f80.
19652 SDValue Fudge = DAG.getExtLoad(
19653 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19654 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19655 CPAlignment);
19656 Chain = Fudge.getValue(1);
19657 // Extend everything to 80 bits to force it to be done on x87.
19658 // TODO: Are there any fast-math-flags to propagate here?
19659 if (IsStrict) {
19660 unsigned Opc = ISD::STRICT_FADD;
19661 // Windows needs the precision control changed to 80 bits around this add.
19662 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19663 Opc = X86ISD::STRICT_FP80_ADD;
19665 SDValue Add =
19666 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19667 // STRICT_FP_ROUND can't handle equal types.
19668 if (DstVT == MVT::f80)
19669 return Add;
19670 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19671 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19673 unsigned Opc = ISD::FADD;
19674 // Windows needs the precision control changed to 80 bits around this add.
19675 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19676 Opc = X86ISD::FP80_ADD;
19678 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19679 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19680 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19683 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19684 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19685 // just return an SDValue().
19686 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19687 // to i16, i32 or i64, and we lower it to a legal sequence and return the
19688 // result.
19689 SDValue
19690 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19691 bool IsSigned, SDValue &Chain) const {
19692 bool IsStrict = Op->isStrictFPOpcode();
19693 SDLoc DL(Op);
19695 EVT DstTy = Op.getValueType();
19696 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19697 EVT TheVT = Value.getValueType();
19698 auto PtrVT = getPointerTy(DAG.getDataLayout());
19700 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19701 // f16 must be promoted before using the lowering in this routine.
19702 // fp128 does not use this lowering.
19703 return SDValue();
19706 // If using FIST to compute an unsigned i64, we'll need some fixup
19707 // to handle values above the maximum signed i64. A FIST is always
19708 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19709 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19711 // FIXME: This does not generate an invalid exception if the input does not
19712 // fit in i32. PR44019
19713 if (!IsSigned && DstTy != MVT::i64) {
19714 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19715 // The low 32 bits of the fist result will have the correct uint32 result.
19716 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19717 DstTy = MVT::i64;
19720 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19721 DstTy.getSimpleVT() >= MVT::i16 &&
19722 "Unknown FP_TO_INT to lower!");
19724 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19725 // stack slot.
19726 MachineFunction &MF = DAG.getMachineFunction();
19727 unsigned MemSize = DstTy.getStoreSize();
19728 int SSFI =
19729 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
19730 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19732 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19734 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
19736 if (UnsignedFixup) {
19738 // Conversion to unsigned i64 is implemented with a select,
19739 // depending on whether the source value fits in the range
19740 // of a signed i64. Let Thresh be the FP equivalent of
19741 // 0x8000000000000000ULL.
19743 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
19744 // FltOfs = (Value >= Thresh) ? Thresh : 0;
19745 // FistSrc = (Value - FltOfs);
19746 // Fist-to-mem64 FistSrc
19747 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
19748 // to XOR'ing the 64-bit result with Adjust.
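// For example (illustrative only): Value = 3 * 2^62 compares >= Thresh, so
// FltOfs = Thresh = 2^63 and FistSrc = 2^62; the signed FIST stores
// 0x4000000000000000, and XOR'ing with Adjust = 0x8000000000000000 yields
// 0xC000000000000000, i.e. the unsigned result 3 * 2^62.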
19750 // Being a power of 2, Thresh is exactly representable in all FP formats.
19751 // For X87 we'd like to use the smallest FP type for this constant, but
19752 // for DAG type consistency we have to match the FP operand type.
19754 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
19755 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
19756 bool LosesInfo = false;
19757 if (TheVT == MVT::f64)
19758 // The rounding mode is irrelevant as the conversion should be exact.
19759 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
19760 &LosesInfo);
19761 else if (TheVT == MVT::f80)
19762 Status = Thresh.convert(APFloat::x87DoubleExtended(),
19763 APFloat::rmNearestTiesToEven, &LosesInfo);
19765 assert(Status == APFloat::opOK && !LosesInfo &&
19766 "FP conversion should have been exact");
19768 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
19770 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
19771 *DAG.getContext(), TheVT);
19772 SDValue Cmp;
19773 if (IsStrict) {
19774 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
19775 /*IsSignaling*/ true);
19776 Chain = Cmp.getValue(1);
19777 } else {
19778 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
19781 // Our preferred lowering of
19783 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
19785 // is
19787 // (Value >= Thresh) << 63
19789 // but since we can get here after LegalOperations, DAGCombine might do the
19790 // wrong thing if we create a select. So, directly create the preferred
19791 // version.
19792 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
19793 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
19794 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
19796 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
19797 DAG.getConstantFP(0.0, DL, TheVT));
19799 if (IsStrict) {
19800 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
19801 { Chain, Value, FltOfs });
19802 Chain = Value.getValue(1);
19803 } else
19804 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
19807 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
19809 // FIXME: This causes a redundant load/store if the SSE-class value is already
19810 // in memory, such as if it is on the call stack.
19811 if (isScalarFPTypeInSSEReg(TheVT)) {
19812 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
19813 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
19814 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19815 SDValue Ops[] = { Chain, StackSlot };
19817 unsigned FLDSize = TheVT.getStoreSize();
19818 assert(FLDSize <= MemSize && "Stack slot not big enough");
19819 MachineMemOperand *MMO = MF.getMachineMemOperand(
19820 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
19821 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
19822 Chain = Value.getValue(1);
19825 // Build the FP_TO_INT*_IN_MEM
19826 MachineMemOperand *MMO = MF.getMachineMemOperand(
19827 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
19828 SDValue Ops[] = { Chain, Value, StackSlot };
19829 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
19830 DAG.getVTList(MVT::Other),
19831 Ops, DstTy, MMO);
19833 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
19834 Chain = Res.getValue(1);
19836 // If we need an unsigned fixup, XOR the result with adjust.
19837 if (UnsignedFixup)
19838 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
19840 return Res;
19843 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
19844 const X86Subtarget &Subtarget) {
19845 MVT VT = Op.getSimpleValueType();
19846 SDValue In = Op.getOperand(0);
19847 MVT InVT = In.getSimpleValueType();
19848 SDLoc dl(Op);
19849 unsigned Opc = Op.getOpcode();
19851 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19852 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
19853 "Unexpected extension opcode");
19854 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19855 "Expected same number of elements");
19856 assert((VT.getVectorElementType() == MVT::i16 ||
19857 VT.getVectorElementType() == MVT::i32 ||
19858 VT.getVectorElementType() == MVT::i64) &&
19859 "Unexpected element type");
19860 assert((InVT.getVectorElementType() == MVT::i8 ||
19861 InVT.getVectorElementType() == MVT::i16 ||
19862 InVT.getVectorElementType() == MVT::i32) &&
19863 "Unexpected element type");
19865 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
19867 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
19868 assert(InVT == MVT::v32i8 && "Unexpected VT!");
19869 return splitVectorIntUnary(Op, DAG);
19872 if (Subtarget.hasInt256())
19873 return Op;
19875 // Optimize vectors in AVX mode:
19877 // v8i16 -> v8i32
19878 // Use vpmovzxwd for the 4 lower elements: v8i16 -> v4i32.
19879 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
19880 // Concat upper and lower parts.
19882 // v4i32 -> v4i64
19883 // Use vpmovzxdq for the 2 lower elements: v4i32 -> v2i64.
19884 // Use vpunpckhdq for the 2 upper elements: v4i32 -> v2i64.
19885 // Concat upper and lower parts.
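// Illustrative sketch with symbolic lanes: zero-extending v8i16
// <a,b,c,d,e,f,g,h> to v8i32 produces <a,b,c,d> as the low v4i32 via the
// in-reg extend, while unpacking the high half against zero yields the
// v8i16 <e,0,f,0,g,0,h,0>, which bitcasts to the v4i32 <e,f,g,h>; the two
// halves are then concatenated.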
19887 MVT HalfVT = VT.getHalfNumVectorElementsVT();
19888 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
19890 // Short-circuit if we can determine that each 128-bit half is the same value.
19891 // Otherwise, this is difficult to match and optimize.
19892 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
19893 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
19894 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
19896 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
19897 SDValue Undef = DAG.getUNDEF(InVT);
19898 bool NeedZero = Opc == ISD::ZERO_EXTEND;
19899 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
19900 OpHi = DAG.getBitcast(HalfVT, OpHi);
19902 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19905 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
19906 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
19907 const SDLoc &dl, SelectionDAG &DAG) {
19908 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
19909 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19910 DAG.getIntPtrConstant(0, dl));
19911 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19912 DAG.getIntPtrConstant(8, dl));
19913 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
19914 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
19915 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
19916 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19919 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
19920 const X86Subtarget &Subtarget,
19921 SelectionDAG &DAG) {
19922 MVT VT = Op->getSimpleValueType(0);
19923 SDValue In = Op->getOperand(0);
19924 MVT InVT = In.getSimpleValueType();
19925 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19926 SDLoc DL(Op);
19927 unsigned NumElts = VT.getVectorNumElements();
19929 // For all vectors except vXi8, we can just emit a sign_extend and a shift.
19930 // This avoids a constant pool load.
19931 if (VT.getVectorElementType() != MVT::i8) {
19932 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
19933 return DAG.getNode(ISD::SRL, DL, VT, Extend,
19934 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
19937 // Extend VT if BWI is not supported.
19938 MVT ExtVT = VT;
19939 if (!Subtarget.hasBWI()) {
19940 // If v16i32 is to be avoided, we'll need to split and concatenate.
19941 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19942 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
19944 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19947 // Widen to 512-bits if VLX is not supported.
19948 MVT WideVT = ExtVT;
19949 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19950 NumElts *= 512 / ExtVT.getSizeInBits();
19951 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19952 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
19953 In, DAG.getIntPtrConstant(0, DL));
19954 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
19955 NumElts);
19958 SDValue One = DAG.getConstant(1, DL, WideVT);
19959 SDValue Zero = DAG.getConstant(0, DL, WideVT);
19961 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
19963 // Truncate if we had to extend above.
19964 if (VT != ExtVT) {
19965 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
19966 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
19969 // Extract back to 128/256-bit if we widened.
19970 if (WideVT != VT)
19971 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
19972 DAG.getIntPtrConstant(0, DL));
19974 return SelectedVal;
19977 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19978 SelectionDAG &DAG) {
19979 SDValue In = Op.getOperand(0);
19980 MVT SVT = In.getSimpleValueType();
19982 if (SVT.getVectorElementType() == MVT::i1)
19983 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
19985 assert(Subtarget.hasAVX() && "Expected AVX support");
19986 return LowerAVXExtend(Op, DAG, Subtarget);
19989 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
19990 /// It makes use of the fact that vectors with enough leading sign/zero bits
19991 /// prevent the PACKSS/PACKUS from saturating the results.
19992 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
19993 /// within each 128-bit lane.
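/// A rough example of one case: truncating a v8i32 whose elements are
/// already sign-extended from i16 down to v8i16 splits the source into two
/// v4i32 halves and emits a single PACKSSDW, since the known sign bits
/// guarantee the pack cannot saturate.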
19994 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
19995 const SDLoc &DL, SelectionDAG &DAG,
19996 const X86Subtarget &Subtarget) {
19997 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
19998 "Unexpected PACK opcode");
19999 assert(DstVT.isVector() && "VT not a vector?");
20001 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20002 if (!Subtarget.hasSSE2())
20003 return SDValue();
20005 EVT SrcVT = In.getValueType();
20007 // No truncation required; we might get here due to recursive calls.
20008 if (SrcVT == DstVT)
20009 return In;
20011 unsigned NumElems = SrcVT.getVectorNumElements();
20012 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20013 return SDValue();
20015 unsigned DstSizeInBits = DstVT.getSizeInBits();
20016 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20017 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20018 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20020 LLVMContext &Ctx = *DAG.getContext();
20021 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20022 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20024 // Pack to the largest type possible:
20025 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20026 EVT InVT = MVT::i16, OutVT = MVT::i8;
20027 if (SrcVT.getScalarSizeInBits() > 16 &&
20028 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20029 InVT = MVT::i32;
20030 OutVT = MVT::i16;
20033 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20034 // On pre-AVX512, pack the src in both halves to help value tracking.
20035 if (SrcSizeInBits <= 128) {
20036 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20037 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20038 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20039 SDValue LHS = DAG.getBitcast(InVT, In);
20040 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20041 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20042 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20043 Res = DAG.getBitcast(PackedVT, Res);
20044 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20047 // Split lower/upper subvectors.
20048 SDValue Lo, Hi;
20049 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20051 // If Hi is undef, then don't bother packing it and widen the result instead.
20052 if (Hi.isUndef()) {
20053 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20054 if (SDValue Res =
20055 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20056 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20059 unsigned SubSizeInBits = SrcSizeInBits / 2;
20060 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20061 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20063 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20064 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20065 Lo = DAG.getBitcast(InVT, Lo);
20066 Hi = DAG.getBitcast(InVT, Hi);
20067 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20068 return DAG.getBitcast(DstVT, Res);
20071 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20072 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20073 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20074 Lo = DAG.getBitcast(InVT, Lo);
20075 Hi = DAG.getBitcast(InVT, Hi);
20076 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20078 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20079 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20080 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20081 SmallVector<int, 64> Mask;
20082 int Scale = 64 / OutVT.getScalarSizeInBits();
20083 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20084 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20086 if (DstVT.is256BitVector())
20087 return DAG.getBitcast(DstVT, Res);
20089 // If this is 512-bit -> 128-bit, truncate another stage.
20090 Res = DAG.getBitcast(PackedVT, Res);
20091 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20094 // Recursively pack lower/upper subvectors, concat result and pack again.
20095 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20097 if (PackedVT.is128BitVector()) {
20098 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20099 // type legalization.
20100 SDValue Res =
20101 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20102 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20105 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20106 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20107 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20108 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20109 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20112 /// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20113 /// e.g. trunc <8 x i32> X to <8 x i16> -->
20114 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
20115 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20116 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20117 const X86Subtarget &Subtarget,
20118 SelectionDAG &DAG) {
20119 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20120 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20123 /// Truncate using inreg sign extension and X86ISD::PACKSS.
20124 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20125 const X86Subtarget &Subtarget,
20126 SelectionDAG &DAG) {
20127 EVT SrcVT = In.getValueType();
20128 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20129 DAG.getValueType(DstVT));
20130 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20133 /// Helper to determine if \p In truncated to \p DstVT has the necessary
20134 /// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20135 /// possibly by converting a SRL node to SRA for sign extension.
20136 static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20137 SDValue In, const SDLoc &DL,
20138 SelectionDAG &DAG,
20139 const X86Subtarget &Subtarget) {
20140 // Requires SSE2.
20141 if (!Subtarget.hasSSE2())
20142 return SDValue();
20144 EVT SrcVT = In.getValueType();
20145 EVT DstSVT = DstVT.getVectorElementType();
20146 EVT SrcSVT = SrcVT.getVectorElementType();
20148 // Check we have a truncation suited for PACKSS/PACKUS.
20149 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20150 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20151 return SDValue();
20153 assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
20154 unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
20156 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20157 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20158 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20159 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20160 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20161 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20162 return SDValue();
20164 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20165 // split this for packing.
20166 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20167 !isFreeToSplitVector(In.getNode(), DAG) &&
20168 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20169 return SDValue();
20171 // Don't truncate on AVX512 targets with multiple stages of PACK nodes.
20172 if (Subtarget.hasAVX512() && NumStages > 1)
20173 return SDValue();
20175 unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
20176 unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
20177 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20179 // Truncate with PACKUS if we are truncating a vector with leading zero
20180 // bits that extend all the way to the packed/truncated value.
20181 // e.g. Masks, zext_in_reg, etc.
20182 // Pre-SSE41 we can only use PACKUSWB.
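// For instance (illustrative): a v8i32 source that was masked with 0xFF has
// at least 24 known leading zero bits per element, which satisfies the
// (NumSrcEltBits - NumPackedZeroBits) requirement for packing down to i8.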
20183 KnownBits Known = DAG.computeKnownBits(In);
20184 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20185 PackOpcode = X86ISD::PACKUS;
20186 return In;
20189 // Truncate with PACKSS if we are truncating a vector with sign-bits
20190 // that extend all the way to the packed/truncated value.
20191 // e.g. Comparison result, sext_in_reg, etc.
20192 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20194 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20195 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20196 // see through BITCASTs later on and combines/simplifications can't then use
20197 // it.
20198 if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
20199 !Subtarget.hasAVX512())
20200 return SDValue();
20202 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20203 if (MinSignBits < NumSignBits) {
20204 PackOpcode = X86ISD::PACKSS;
20205 return In;
20208 // If we have a srl that only generates signbits that we will discard in
20209 // the truncation then we can use PACKSS by converting the srl to a sra.
20210 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
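// Rough example: for a v4i32 -> v4i16 truncate of (srl X, 16), the 16 zero
// bits the srl shifts in are exactly the bits the truncate discards, so
// rewriting it as (sra X, 16) keeps the low 16 bits identical while leaving
// a value sign-extended from 16 bits, which PACKSSDW packs without
// saturating.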
20211 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20212 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
20213 In, APInt::getAllOnes(SrcVT.getVectorNumElements()))) {
20214 if (*ShAmt == MinSignBits) {
20215 PackOpcode = X86ISD::PACKSS;
20216 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20220 return SDValue();
20223 /// This function lowers a vector truncation of 'extended sign-bits' or
20224 /// 'extended zero-bits' values.
20225 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20226 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20227 const SDLoc &DL,
20228 const X86Subtarget &Subtarget,
20229 SelectionDAG &DAG) {
20230 MVT SrcVT = In.getSimpleValueType();
20231 MVT DstSVT = DstVT.getVectorElementType();
20232 MVT SrcSVT = SrcVT.getVectorElementType();
20233 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20234 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20235 return SDValue();
20237 // If the upper half of the source is undef, then attempt to split and
20238 // only truncate the lower half.
20239 if (DstVT.getSizeInBits() >= 128) {
20240 SmallVector<SDValue> LowerOps;
20241 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20242 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20243 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20244 Subtarget, DAG))
20245 return widenSubVector(Res, false, Subtarget, DAG, DL,
20246 DstVT.getSizeInBits());
20250 unsigned PackOpcode;
20251 if (SDValue Src =
20252 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20253 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20255 return SDValue();
20258 /// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20259 /// X86ISD::PACKUS/X86ISD::PACKSS operations.
20260 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20261 const X86Subtarget &Subtarget,
20262 SelectionDAG &DAG) {
20263 MVT SrcVT = In.getSimpleValueType();
20264 MVT DstSVT = DstVT.getVectorElementType();
20265 MVT SrcSVT = SrcVT.getVectorElementType();
20266 unsigned NumElems = DstVT.getVectorNumElements();
20267 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20268 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20269 NumElems >= 8))
20270 return SDValue();
20272 // SSSE3's pshufb results in fewer instructions in the cases below.
20273 if (Subtarget.hasSSSE3() && NumElems == 8) {
20274 if (SrcSVT == MVT::i16)
20275 return SDValue();
20276 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20277 return SDValue();
20280 // If the upper half of the source is undef, then attempt to split and
20281 // only truncate the lower half.
20282 if (DstVT.getSizeInBits() >= 128) {
20283 SmallVector<SDValue> LowerOps;
20284 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20285 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20286 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20287 return widenSubVector(Res, false, Subtarget, DAG, DL,
20288 DstVT.getSizeInBits());
20292 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20293 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20294 // truncate 2 x v4i32 to v8i16.
20295 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20296 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20298 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20299 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20301 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20302 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20303 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20304 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20305 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20308 return SDValue();
20311 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20312 const X86Subtarget &Subtarget) {
20314 SDLoc DL(Op);
20315 MVT VT = Op.getSimpleValueType();
20316 SDValue In = Op.getOperand(0);
20317 MVT InVT = In.getSimpleValueType();
20319 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20321 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20322 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20323 if (InVT.getScalarSizeInBits() <= 16) {
20324 if (Subtarget.hasBWI()) {
20325 // legal, will go to VPMOVB2M, VPMOVW2M
20326 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20327 // We need to shift to get the lsb into the sign position.
20328 // Shifting packed bytes is not supported natively, so bitcast to words.
20329 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20330 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20331 DAG.getBitcast(ExtVT, In),
20332 DAG.getConstant(ShiftInx, DL, ExtVT));
20333 In = DAG.getBitcast(InVT, In);
20335 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20336 In, ISD::SETGT);
20338 // Use TESTD/Q after extending the vector to packed dword/qword.
20339 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20340 "Unexpected vector type.");
20341 unsigned NumElts = InVT.getVectorNumElements();
20342 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20343 // We need to change to a wider element type that we have support for.
20344 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20345 // For 16 element vectors we extend to v16i32 unless we are explicitly
20346 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20347 // we need to split into two 8 element vectors which we can extend to v8i32,
20348 // truncate and concat the results. There's an additional complication if
20349 // the original type is v16i8. In that case we can't split the v16i8
20350 // directly, so we need to shuffle high elements to low and use
20351 // sign_extend_vector_inreg.
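// Sketch of the v16i8 case without 512-bit DQ: the low 8 bytes go through
// sign_extend_vector_inreg to v8i32, the high 8 bytes are shuffled down and
// extended the same way, each half is truncated to v8i1, and the two v8i1
// results are concatenated into the final v16i1.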
20352 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20353 SDValue Lo, Hi;
20354 if (InVT == MVT::v16i8) {
20355 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20356 Hi = DAG.getVectorShuffle(
20357 InVT, DL, In, In,
20358 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20359 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20360 } else {
20361 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20362 Lo = extract128BitVector(In, 0, DAG, DL);
20363 Hi = extract128BitVector(In, 8, DAG, DL);
20365 // We're split now; just emit two truncates and a concat. The two
20366 // truncates will trigger legalization to come back to this function.
20367 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20368 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20369 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20371 // We either have 8 elements or we're allowed to use 512-bit vectors.
20372 // If we have VLX, we want to use the narrowest vector that can get the
20373 // job done so we use vXi32.
20374 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20375 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20376 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20377 InVT = ExtVT;
20378 ShiftInx = InVT.getScalarSizeInBits() - 1;
20381 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20382 // We need to shift to get the lsb into sign position.
20383 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20384 DAG.getConstant(ShiftInx, DL, InVT));
20386 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20387 if (Subtarget.hasDQI())
20388 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20389 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20392 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20393 SDLoc DL(Op);
20394 MVT VT = Op.getSimpleValueType();
20395 SDValue In = Op.getOperand(0);
20396 MVT InVT = In.getSimpleValueType();
20397 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20398 "Invalid TRUNCATE operation");
20400 // If we're called by the type legalizer, handle a few cases.
20401 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20402 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20403 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20404 VT.is128BitVector() && Subtarget.hasAVX512()) {
20405 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20406 "Unexpected subtarget!");
20407 // The default behavior is to truncate one step, concatenate, and then
20408 // truncate the remainder. We'd rather produce two 64-bit results and
20409 // concatenate those.
20410 SDValue Lo, Hi;
20411 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20413 EVT LoVT, HiVT;
20414 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20416 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20417 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20418 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20421 // Pre-AVX512 (or prefer-256bit), see if we can make use of PACKSS/PACKUS.
20422 if (!Subtarget.hasAVX512() ||
20423 (InVT.is512BitVector() && VT.is256BitVector()))
20424 if (SDValue SignPack =
20425 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20426 return SignPack;
20428 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20429 if (!Subtarget.hasAVX512())
20430 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20432 // Otherwise let default legalization handle it.
20433 return SDValue();
20436 if (VT.getVectorElementType() == MVT::i1)
20437 return LowerTruncateVecI1(Op, DAG, Subtarget);
20439 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20440 // concat from subvectors to use VPTRUNC etc.
20441 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20442 if (SDValue SignPack =
20443 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20444 return SignPack;
20446 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20447 if (Subtarget.hasAVX512()) {
20448 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20449 assert(VT == MVT::v32i8 && "Unexpected VT!");
20450 return splitVectorIntUnary(Op, DAG);
20453 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
20454 // and then truncate that. But we should only do that if we haven't been
20455 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20456 // handled by isel patterns.
20457 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20458 Subtarget.canExtendTo512DQ())
20459 return Op;
20462 // Handle truncation of V256 to V128 using shuffles.
20463 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20465 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20466 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20467 if (Subtarget.hasInt256()) {
20468 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20469 In = DAG.getBitcast(MVT::v8i32, In);
20470 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20471 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20472 DAG.getIntPtrConstant(0, DL));
20475 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20476 DAG.getIntPtrConstant(0, DL));
20477 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20478 DAG.getIntPtrConstant(2, DL));
20479 static const int ShufMask[] = {0, 2, 4, 6};
20480 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20481 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20484 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20485 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20486 if (Subtarget.hasInt256()) {
20487 // The PSHUFB mask:
20488 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20489 -1, -1, -1, -1, -1, -1, -1, -1,
20490 16, 17, 20, 21, 24, 25, 28, 29,
20491 -1, -1, -1, -1, -1, -1, -1, -1 };
20492 In = DAG.getBitcast(MVT::v32i8, In);
20493 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20494 In = DAG.getBitcast(MVT::v4i64, In);
20496 static const int ShufMask2[] = {0, 2, -1, -1};
20497 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20498 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20499 DAG.getIntPtrConstant(0, DL));
20500 return DAG.getBitcast(MVT::v8i16, In);
20503 return Subtarget.hasSSE41()
20504 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20505 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20508 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20509 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20511 llvm_unreachable("All 256->128 cases should have been handled above!");
20514 // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20515 // behaves on out of range inputs to generate optimized conversions.
20516 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20517 SelectionDAG &DAG,
20518 const X86Subtarget &Subtarget) {
20519 MVT SrcVT = Src.getSimpleValueType();
20520 unsigned DstBits = VT.getScalarSizeInBits();
20521 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20523 // Calculate the converted result for values in the range 0 to
20524 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20525 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20526 SDValue Big =
20527 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20528 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20529 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20531 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20532 // and only if the value was out of range. So we can use that
20533 // as our indicator to use "Big" rather than "Small".
20535 // Use "Small" if "IsOverflown" has all bits cleared
20536 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20538 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20539 // use the slightly slower blendv select instead.
20540 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20541 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20542 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20545 SDValue IsOverflown =
20546 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20547 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20548 return DAG.getNode(ISD::OR, dl, VT, Small,
20549 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20552 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20553 bool IsStrict = Op->isStrictFPOpcode();
20554 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20555 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20556 MVT VT = Op->getSimpleValueType(0);
20557 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20558 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20559 MVT SrcVT = Src.getSimpleValueType();
20560 SDLoc dl(Op);
20562 SDValue Res;
20563 if (isSoftF16(SrcVT, Subtarget)) {
20564 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20565 if (IsStrict)
20566 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20567 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20568 {NVT, MVT::Other}, {Chain, Src})});
20569 return DAG.getNode(Op.getOpcode(), dl, VT,
20570 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20571 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20572 return Op;
20575 if (VT.isVector()) {
20576 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20577 MVT ResVT = MVT::v4i32;
20578 MVT TruncVT = MVT::v4i1;
20579 unsigned Opc;
20580 if (IsStrict)
20581 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20582 else
20583 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20585 if (!IsSigned && !Subtarget.hasVLX()) {
20586 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20587 // Widen to 512-bits.
20588 ResVT = MVT::v8i32;
20589 TruncVT = MVT::v8i1;
20590 Opc = Op.getOpcode();
20591 // Need to concat with zero vector for strict fp to avoid spurious
20592 // exceptions.
20593 // TODO: Should we just do this for non-strict as well?
20594 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20595 : DAG.getUNDEF(MVT::v8f64);
20596 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20597 DAG.getIntPtrConstant(0, dl));
20599 if (IsStrict) {
20600 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20601 Chain = Res.getValue(1);
20602 } else {
20603 Res = DAG.getNode(Opc, dl, ResVT, Src);
20606 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20607 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20608 DAG.getIntPtrConstant(0, dl));
20609 if (IsStrict)
20610 return DAG.getMergeValues({Res, Chain}, dl);
20611 return Res;
20614 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20615 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20616 return Op;
20618 MVT ResVT = VT;
20619 MVT EleVT = VT.getVectorElementType();
20620 if (EleVT != MVT::i64)
20621 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20623 if (SrcVT != MVT::v8f16) {
20624 SDValue Tmp =
20625 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20626 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20627 Ops[0] = Src;
20628 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20631 if (IsStrict) {
20632 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20633 : X86ISD::STRICT_CVTTP2UI,
20634 dl, {ResVT, MVT::Other}, {Chain, Src});
20635 Chain = Res.getValue(1);
20636 } else {
20637 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20638 ResVT, Src);
20641 // TODO: Need to add exception check code for strict FP.
20642 if (EleVT.getSizeInBits() < 16) {
20643 ResVT = MVT::getVectorVT(EleVT, 8);
20644 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20647 if (ResVT != VT)
20648 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20649 DAG.getIntPtrConstant(0, dl));
20651 if (IsStrict)
20652 return DAG.getMergeValues({Res, Chain}, dl);
20653 return Res;
20656 // v8f32/v16f32/v8f64->v8i16/v16i16 need to be widened first.
20657 if (VT.getVectorElementType() == MVT::i16) {
20658 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20659 SrcVT.getVectorElementType() == MVT::f64) &&
20660 "Expected f32/f64 vector!");
20661 MVT NVT = VT.changeVectorElementType(MVT::i32);
20662 if (IsStrict) {
20663 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20664 : ISD::STRICT_FP_TO_UINT,
20665 dl, {NVT, MVT::Other}, {Chain, Src});
20666 Chain = Res.getValue(1);
20667 } else {
20668 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20669 NVT, Src);
20672 // TODO: Need to add exception check code for strict FP.
20673 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20675 if (IsStrict)
20676 return DAG.getMergeValues({Res, Chain}, dl);
20677 return Res;
20680 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20681 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20682 assert(!IsSigned && "Expected unsigned conversion!");
20683 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20684 return Op;
20687 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20688 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20689 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20690 Subtarget.useAVX512Regs()) {
20691 assert(!IsSigned && "Expected unsigned conversion!");
20692 assert(!Subtarget.hasVLX() && "Unexpected features!");
20693 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20694 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20695 // Need to concat with zero vector for strict fp to avoid spurious
20696 // exceptions.
20697 // TODO: Should we just do this for non-strict as well?
20698 SDValue Tmp =
20699 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20700 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20701 DAG.getIntPtrConstant(0, dl));
20703 if (IsStrict) {
20704 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20705 {Chain, Src});
20706 Chain = Res.getValue(1);
20707 } else {
20708 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20711 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20712 DAG.getIntPtrConstant(0, dl));
20714 if (IsStrict)
20715 return DAG.getMergeValues({Res, Chain}, dl);
20716 return Res;
20719 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20720 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20721 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
20722 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
20723 assert(!Subtarget.hasVLX() && "Unexpected features!");
20724 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20725 // Need to concat with zero vector for strict fp to avoid spurious
20726 // exceptions.
20727 // TODO: Should we just do this for non-strict as well?
20728 SDValue Tmp =
20729 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20730 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20731 DAG.getIntPtrConstant(0, dl));
20733 if (IsStrict) {
20734 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20735 {Chain, Src});
20736 Chain = Res.getValue(1);
20737 } else {
20738 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
20741 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20742 DAG.getIntPtrConstant(0, dl));
20744 if (IsStrict)
20745 return DAG.getMergeValues({Res, Chain}, dl);
20746 return Res;
20749 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
20750 if (!Subtarget.hasVLX()) {
20751 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
20752 // legalizer and then widened again by vector op legalization.
20753 if (!IsStrict)
20754 return SDValue();
20756 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
20757 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
20758 {Src, Zero, Zero, Zero});
20759 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20760 {Chain, Tmp});
20761 SDValue Chain = Tmp.getValue(1);
20762 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
20763 DAG.getIntPtrConstant(0, dl));
20764 return DAG.getMergeValues({Tmp, Chain}, dl);
20767 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
20768 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
20769 DAG.getUNDEF(MVT::v2f32));
20770 if (IsStrict) {
20771 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
20772 : X86ISD::STRICT_CVTTP2UI;
20773 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
20775 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20776 return DAG.getNode(Opc, dl, VT, Tmp);
20779 // Generate optimized instructions for pre AVX512 unsigned conversions from
20780 // vXf32 to vXi32.
20781 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
20782 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
20783 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
20784 assert(!IsSigned && "Expected unsigned conversion!");
20785 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
20788 return SDValue();
20791 assert(!VT.isVector());
20793 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
20795 if (!IsSigned && UseSSEReg) {
20796 // Conversions from f32/f64 with AVX512 should be legal.
20797 if (Subtarget.hasAVX512())
20798 return Op;
20800 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
20801 // behaves on out of range inputs to generate optimized conversions.
20802 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
20803 (VT == MVT::i64 && Subtarget.is64Bit()))) {
20804 unsigned DstBits = VT.getScalarSizeInBits();
20805 APInt UIntLimit = APInt::getSignMask(DstBits);
20806 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
20807 DAG.getConstant(UIntLimit, dl, VT));
20808 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
20810 // Calculate the converted result for values in the range:
20811 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20812 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
20813 SDValue Small =
20814 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
20815 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
20816 SDValue Big = DAG.getNode(
20817 X86ISD::CVTTS2SI, dl, VT,
20818 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
20819 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
20821 // The "CVTTS2SI" instruction conveniently sets the sign bit if
20822 // and only if the value was out of range. So we can use that
20823 // as our indicator to use "Big" rather than "Small".
20825 // Use "Small" if "IsOverflown" has all bits cleared
20826 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20827 SDValue IsOverflown = DAG.getNode(
20828 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
20829 return DAG.getNode(ISD::OR, dl, VT, Small,
20830 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20833 // Use default expansion for i64.
20834 if (VT == MVT::i64)
20835 return SDValue();
20837 assert(VT == MVT::i32 && "Unexpected VT!");
20839 // Promote i32 to i64 and use a signed operation on 64-bit targets.
20840 // FIXME: This does not generate an invalid exception if the input does not
20841 // fit in i32. PR44019
20842 if (Subtarget.is64Bit()) {
20843 if (IsStrict) {
20844 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
20845 {Chain, Src});
20846 Chain = Res.getValue(1);
20847 } else
20848 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
20850 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20851 if (IsStrict)
20852 return DAG.getMergeValues({Res, Chain}, dl);
20853 return Res;
20856 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
20857 // use fisttp which will be handled later.
20858 if (!Subtarget.hasSSE3())
20859 return SDValue();
20862 // Promote i16 to i32 if we can use an SSE operation or the type is f128.
20863 // FIXME: This does not generate an invalid exception if the input does not
20864 // fit in i16. PR44019
20865 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
20866 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
20867 if (IsStrict) {
20868 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
20869 {Chain, Src});
20870 Chain = Res.getValue(1);
20871 } else
20872 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
20874 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20875 if (IsStrict)
20876 return DAG.getMergeValues({Res, Chain}, dl);
20877 return Res;
20880 // If this is an FP_TO_SINT using SSEReg, we're done.
20881 if (UseSSEReg && IsSigned)
20882 return Op;
20884 // fp128 needs to use a libcall.
20885 if (SrcVT == MVT::f128) {
20886 RTLIB::Libcall LC;
20887 if (IsSigned)
20888 LC = RTLIB::getFPTOSINT(SrcVT, VT);
20889 else
20890 LC = RTLIB::getFPTOUINT(SrcVT, VT);
20892 MakeLibCallOptions CallOptions;
20893 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
20894 SDLoc(Op), Chain);
20896 if (IsStrict)
20897 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20899 return Tmp.first;
20902 // Fall back to X87.
20903 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
20904 if (IsStrict)
20905 return DAG.getMergeValues({V, Chain}, dl);
20906 return V;
20909 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
20912 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
20913 SelectionDAG &DAG) const {
20914 SDValue Src = Op.getOperand(0);
20915 MVT SrcVT = Src.getSimpleValueType();
20917 if (SrcVT == MVT::f16)
20918 return SDValue();
20920 // If the source is in an SSE register, the node is Legal.
20921 if (isScalarFPTypeInSSEReg(SrcVT))
20922 return Op;
20924 return LRINT_LLRINTHelper(Op.getNode(), DAG);
20927 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
20928 SelectionDAG &DAG) const {
20929 EVT DstVT = N->getValueType(0);
20930 SDValue Src = N->getOperand(0);
20931 EVT SrcVT = Src.getValueType();
20933 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
20934 // f16 must be promoted before using the lowering in this routine.
20935 // fp128 does not use this lowering.
20936 return SDValue();
20939 SDLoc DL(N);
20940 SDValue Chain = DAG.getEntryNode();
20942 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
20944 // If we're converting from SSE, the stack slot needs to hold both types.
20945 // Otherwise it only needs to hold the DstVT.
20946 EVT OtherVT = UseSSE ? SrcVT : DstVT;
20947 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
20948 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
20949 MachinePointerInfo MPI =
20950 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
20952 if (UseSSE) {
20953 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
20954 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
20955 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20956 SDValue Ops[] = { Chain, StackPtr };
20958 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
20959 /*Align*/ std::nullopt,
20960 MachineMemOperand::MOLoad);
20961 Chain = Src.getValue(1);
20964 SDValue StoreOps[] = { Chain, Src, StackPtr };
20965 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
20966 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
20967 MachineMemOperand::MOStore);
20969 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
20972 SDValue
20973 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
20974 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
20975 // but making use of X86 specifics to produce better instruction sequences.
20976 SDNode *Node = Op.getNode();
20977 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
20978 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
20979 SDLoc dl(SDValue(Node, 0));
20980 SDValue Src = Node->getOperand(0);
20982 // There are three types involved here: SrcVT is the source floating point
20983 // type, DstVT is the type of the result, and TmpVT is the result of the
20984 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
20985 // DstVT).
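// As an illustration: for an f32 fptosi.sat with an i8 result, DstVT is i8,
// TmpVT is promoted to i32 below, and because the saturation width (8) is
// smaller than TmpVT's width a plain signed cvttss2si can be used before
// the clamped value is truncated back down to i8.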
20986 EVT SrcVT = Src.getValueType();
20987 EVT DstVT = Node->getValueType(0);
20988 EVT TmpVT = DstVT;
20990 // This code is only for floats and doubles. Fall back to generic code for
20991 // anything else.
20992 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
20993 return SDValue();
20995 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
20996 unsigned SatWidth = SatVT.getScalarSizeInBits();
20997 unsigned DstWidth = DstVT.getScalarSizeInBits();
20998 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
20999 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21000 "Expected saturation width smaller than result width");
21002 // Promote result of FP_TO_*INT to at least 32 bits.
21003 if (TmpWidth < 32) {
21004 TmpVT = MVT::i32;
21005 TmpWidth = 32;
21008 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21009 // us to use a native signed conversion instead.
21010 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21011 TmpVT = MVT::i64;
21012 TmpWidth = 64;
21015 // If the saturation width is smaller than the size of the temporary result,
21016 // we can always use signed conversion, which is native.
21017 if (SatWidth < TmpWidth)
21018 FpToIntOpcode = ISD::FP_TO_SINT;
21020 // Determine minimum and maximum integer values and their corresponding
21021 // floating-point values.
21022 APInt MinInt, MaxInt;
21023 if (IsSigned) {
21024 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21025 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21026 } else {
21027 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21028 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21031 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21032 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21034 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21035 MinInt, IsSigned, APFloat::rmTowardZero);
21036 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21037 MaxInt, IsSigned, APFloat::rmTowardZero);
21038 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21039 && !(MaxStatus & APFloat::opStatus::opInexact);
21041 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21042 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21044 // If the integer bounds are exactly representable as floats, emit a
21045 // min+max+fptoi sequence. Otherwise use comparisons and selects.
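// Sketch of the exact-bounds case: saturating f64 to i32 has bounds -2^31
// and 2^31 - 1, both exactly representable as doubles, so the lowering is a
// clamp with FMAX/FMIN followed by cvttsd2si, plus a single NaN check for
// the signed case, rather than a chain of compare+select nodes.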
21046 if (AreExactFloatBounds) {
21047 if (DstVT != TmpVT) {
21048 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21049 SDValue MinClamped = DAG.getNode(
21050 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21051 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21052 SDValue BothClamped = DAG.getNode(
21053 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21054 // Convert clamped value to integer.
21055 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21057 // NaN will become INDVAL, with the top bit set and the rest zero.
21058 // Truncation will discard the top bit, resulting in zero.
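// (Illustration: with TmpVT == i32, INDVAL is 0x80000000, and truncating
// that to i8 keeps only the low 8 bits, which are all zero.)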
21059 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21062 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21063 SDValue MinClamped = DAG.getNode(
21064 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21065 // Clamp by MaxFloat from above. NaN cannot occur.
21066 SDValue BothClamped = DAG.getNode(
21067 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21068 // Convert clamped value to integer.
21069 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21071 if (!IsSigned) {
21072 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21073 // which is zero.
21074 return FpToInt;
21077 // Otherwise, select zero if Src is NaN.
21078 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21079 return DAG.getSelectCC(
21080 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21083 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21084 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21086 // Result of direct conversion, which may be selected away.
21087 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21089 if (DstVT != TmpVT) {
21090 // NaN will become INDVAL, with the top bit set and the rest zero.
21091 // Truncation will discard the top bit, resulting in zero.
21092 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21095 SDValue Select = FpToInt;
21096 // For signed conversions where we saturate to the same size as the
21097 // result type of the fptoi instructions, INDVAL coincides with integer
21098 // minimum, so we don't need to explicitly check it.
21099 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21100 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21101 // MinInt if Src is NaN.
21102 Select = DAG.getSelectCC(
21103 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21106 // If Src OGT MaxFloat, select MaxInt.
21107 Select = DAG.getSelectCC(
21108 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21110 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21111 // is already zero. The promoted case was already handled above.
21112 if (!IsSigned || DstVT != TmpVT) {
21113 return Select;
21116 // Otherwise, select 0 if Src is NaN.
21117 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21118 return DAG.getSelectCC(
21119 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21122 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21123 bool IsStrict = Op->isStrictFPOpcode();
21125 SDLoc DL(Op);
21126 MVT VT = Op.getSimpleValueType();
21127 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21128 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21129 MVT SVT = In.getSimpleValueType();
21131 // Let f16->f80 get lowered to a libcall, except for Darwin, where we should
21132 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available).
21133 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21134 !Subtarget.getTargetTriple().isOSDarwin()))
21135 return SDValue();
21137 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21138 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21139 return Op;
21141 if (SVT == MVT::f16) {
21142 if (Subtarget.hasFP16())
21143 return Op;
21145 if (VT != MVT::f32) {
21146 if (IsStrict)
21147 return DAG.getNode(
21148 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21149 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21150 {MVT::f32, MVT::Other}, {Chain, In})});
21152 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21153 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21156 if (!Subtarget.hasF16C()) {
21157 if (!Subtarget.getTargetTriple().isOSDarwin())
21158 return SDValue();
21160 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21162 // Need a libcall, but the ABI for f16 is soft-float on macOS.
21163 TargetLowering::CallLoweringInfo CLI(DAG);
21164 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21166 In = DAG.getBitcast(MVT::i16, In);
21167 TargetLowering::ArgListTy Args;
21168 TargetLowering::ArgListEntry Entry;
21169 Entry.Node = In;
21170 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21171 Entry.IsSExt = false;
21172 Entry.IsZExt = true;
21173 Args.push_back(Entry);
21175 SDValue Callee = DAG.getExternalSymbol(
21176 getLibcallName(RTLIB::FPEXT_F16_F32),
21177 getPointerTy(DAG.getDataLayout()));
21178 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21179 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21180 std::move(Args));
21182 SDValue Res;
21183 std::tie(Res,Chain) = LowerCallTo(CLI);
21184 if (IsStrict)
21185 Res = DAG.getMergeValues({Res, Chain}, DL);
21187 return Res;
21190 In = DAG.getBitcast(MVT::i16, In);
21191 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21192 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21193 DAG.getIntPtrConstant(0, DL));
21194 SDValue Res;
21195 if (IsStrict) {
21196 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21197 {Chain, In});
21198 Chain = Res.getValue(1);
21199 } else {
21200 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21201 DAG.getTargetConstant(4, DL, MVT::i32));
21203 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21204 DAG.getIntPtrConstant(0, DL));
21205 if (IsStrict)
21206 return DAG.getMergeValues({Res, Chain}, DL);
21207 return Res;
21210 if (!SVT.isVector())
21211 return Op;
21213 if (SVT.getVectorElementType() == MVT::bf16) {
21214 // FIXME: Do we need to support strict FP?
21215 assert(!IsStrict && "Strict FP doesn't support BF16");
21216 if (VT.getVectorElementType() == MVT::f64) {
21217 MVT TmpVT = VT.changeVectorElementType(MVT::f32);
21218 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21219 DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In));
21221 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
21222 MVT NVT = SVT.changeVectorElementType(MVT::i32);
21223 In = DAG.getBitcast(SVT.changeTypeToInteger(), In);
21224 In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In);
21225 In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT));
21226 return DAG.getBitcast(VT, In);
21229 if (SVT.getVectorElementType() == MVT::f16) {
21230 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21231 return Op;
21232 assert(Subtarget.hasF16C() && "Unexpected features!");
21233 if (SVT == MVT::v2f16)
21234 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21235 DAG.getUNDEF(MVT::v2f16));
21236 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21237 DAG.getUNDEF(MVT::v4f16));
21238 if (IsStrict)
21239 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21240 {Op->getOperand(0), Res});
21241 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21242 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21243 return Op;
21246 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21248 SDValue Res =
21249 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21250 if (IsStrict)
21251 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21252 {Op->getOperand(0), Res});
21253 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21256 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21257 bool IsStrict = Op->isStrictFPOpcode();
21259 SDLoc DL(Op);
21260 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21261 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21262 MVT VT = Op.getSimpleValueType();
21263 MVT SVT = In.getSimpleValueType();
21265 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21266 return SDValue();
21268 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21269 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21270 if (!Subtarget.getTargetTriple().isOSDarwin())
21271 return SDValue();
21273 // We need a libcall, but the ABI for f16 libcalls on macOS is soft-float.
21274 TargetLowering::CallLoweringInfo CLI(DAG);
21275 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21277 TargetLowering::ArgListTy Args;
21278 TargetLowering::ArgListEntry Entry;
21279 Entry.Node = In;
21280 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21281 Entry.IsSExt = false;
21282 Entry.IsZExt = true;
21283 Args.push_back(Entry);
21285 SDValue Callee = DAG.getExternalSymbol(
21286 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21287 : RTLIB::FPROUND_F32_F16),
21288 getPointerTy(DAG.getDataLayout()));
21289 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21290 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21291 std::move(Args));
21293 SDValue Res;
21294 std::tie(Res, Chain) = LowerCallTo(CLI);
21296 Res = DAG.getBitcast(MVT::f16, Res);
21298 if (IsStrict)
21299 Res = DAG.getMergeValues({Res, Chain}, DL);
21301 return Res;
21304 if (VT.getScalarType() == MVT::bf16) {
21305 if (SVT.getScalarType() == MVT::f32 && isTypeLegal(VT))
21306 return Op;
21307 return SDValue();
21310 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21311 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21312 return SDValue();
21314 if (VT.isVector())
21315 return Op;
21317 SDValue Res;
21318 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21319 MVT::i32);
21320 if (IsStrict) {
21321 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21322 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21323 DAG.getIntPtrConstant(0, DL));
21324 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21325 {Chain, Res, Rnd});
21326 Chain = Res.getValue(1);
21327 } else {
21328 // FIXME: Should we use zeros for upper elements for non-strict?
21329 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21330 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21333 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21334 DAG.getIntPtrConstant(0, DL));
21335 Res = DAG.getBitcast(MVT::f16, Res);
21337 if (IsStrict)
21338 return DAG.getMergeValues({Res, Chain}, DL);
21340 return Res;
21343 return Op;
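// A rough sketch of the F16C scalar path above (illustrative, not the exact
// selected sequence): a scalar f32 -> f16 round is widened so the vector
// CVTPS2PH instruction can be reused:
//   (f16 (fp_round f32 %x))
//     -> (v4f32 insert/scalar_to_vector %x)
//     -> (v8i16 X86ISD::CVTPS2PH ..., CUR_DIRECTION)
//     -> (i16 extract_vector_elt ..., 0) -> (f16 bitcast)
// i.e. one VCVTPS2PH plus a scalar extract instead of a libcall.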
21346 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21347 bool IsStrict = Op->isStrictFPOpcode();
21348 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21349 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21350 "Unexpected VT!");
21352 SDLoc dl(Op);
21353 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21354 DAG.getConstant(0, dl, MVT::v8i16), Src,
21355 DAG.getIntPtrConstant(0, dl));
21357 SDValue Chain;
21358 if (IsStrict) {
21359 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21360 {Op.getOperand(0), Res});
21361 Chain = Res.getValue(1);
21362 } else {
21363 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21366 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21367 DAG.getIntPtrConstant(0, dl));
21369 if (IsStrict)
21370 return DAG.getMergeValues({Res, Chain}, dl);
21372 return Res;
21375 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21376 bool IsStrict = Op->isStrictFPOpcode();
21377 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21378 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21379 "Unexpected VT!");
21381 SDLoc dl(Op);
21382 SDValue Res, Chain;
21383 if (IsStrict) {
21384 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21385 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21386 DAG.getIntPtrConstant(0, dl));
21387 Res = DAG.getNode(
21388 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21389 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21390 Chain = Res.getValue(1);
21391 } else {
21392 // FIXME: Should we use zeros for upper elements for non-strict?
21393 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21394 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21395 DAG.getTargetConstant(4, dl, MVT::i32));
21398 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21399 DAG.getIntPtrConstant(0, dl));
21401 if (IsStrict)
21402 return DAG.getMergeValues({Res, Chain}, dl);
21404 return Res;
21407 SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21408 SelectionDAG &DAG) const {
21409 SDLoc DL(Op);
21410 MakeLibCallOptions CallOptions;
21411 RTLIB::Libcall LC =
21412 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
21413 SDValue Res =
21414 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
21415 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
21416 DAG.getBitcast(MVT::i32, Res));
21419 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21420 /// vector operation in place of the typical scalar operation.
21421 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21422 const X86Subtarget &Subtarget) {
21423 // If both operands have other uses, this is probably not profitable.
21424 SDValue LHS = Op.getOperand(0);
21425 SDValue RHS = Op.getOperand(1);
21426 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21427 return Op;
21429   // FP horizontal add/sub were added with SSE3; the integer forms with SSSE3.
21430 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21431 if (IsFP && !Subtarget.hasSSE3())
21432 return Op;
21433 if (!IsFP && !Subtarget.hasSSSE3())
21434 return Op;
21436 // Extract from a common vector.
21437 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21438 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21439 LHS.getOperand(0) != RHS.getOperand(0) ||
21440 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21441 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21442 !shouldUseHorizontalOp(true, DAG, Subtarget))
21443 return Op;
21445 // Allow commuted 'hadd' ops.
21446 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21447 unsigned HOpcode;
21448 switch (Op.getOpcode()) {
21449 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21450 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21451 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21452 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21453 default:
21454 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21456 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21457 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21458 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21459 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21460 std::swap(LExtIndex, RExtIndex);
21462 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21463 return Op;
21465 SDValue X = LHS.getOperand(0);
21466 EVT VecVT = X.getValueType();
21467 unsigned BitWidth = VecVT.getSizeInBits();
21468 unsigned NumLanes = BitWidth / 128;
21469 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21470 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21471 "Not expecting illegal vector widths here");
21473 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21474 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21475 SDLoc DL(Op);
21476 if (BitWidth == 256 || BitWidth == 512) {
21477 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21478 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21479 LExtIndex %= NumEltsPerLane;
21482 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21483 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21484 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21485 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21486 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21488 DAG.getIntPtrConstant(LExtIndex / 2, DL));
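// As a concrete sketch, assuming an SSE3 target and the one-use/extract
// constraints checked above, a source-level pairwise sum such as
//   float sum01(__m128 v) { return v[0] + v[1]; }
// is matched here: both operands extract from the same vector at indices 0
// and 1, so the scalar FADD becomes
//   (f32 extract_vector_elt (X86ISD::FHADD v, v), 0)
// i.e. a single HADDPS feeding the scalar result.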
21491 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21492 /// vector operation in place of the typical scalar operation.
21493 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21494 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21495 "Only expecting float/double");
21496 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21499 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21500 /// This mode isn't supported in hardware on X86. But as long as we aren't
21501 /// compiling with trapping math, we can emulate this with
21502 /// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21503 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21504 SDValue N0 = Op.getOperand(0);
21505 SDLoc dl(Op);
21506 MVT VT = Op.getSimpleValueType();
21508 // N0 += copysign(nextafter(0.5, 0.0), N0)
21509 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21510 bool Ignored;
21511 APFloat Point5Pred = APFloat(0.5f);
21512 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21513 Point5Pred.next(/*nextDown*/true);
21515 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21516 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21517 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21519 // Truncate the result to remove fraction.
21520 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
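// Why nextafter(0.5, 0.0) rather than plain 0.5: adding exactly 0.5 can round
// up inside the FADD itself. A minimal f32 example under default rounding:
//   x = 0x1.fffffep-2          (largest float below 0.5)
//   x + 0.5f                   rounds to 1.0f, so trunc would give 1.0 (wrong)
//   x + 0x1.fffffep-2          gives 0x1.fffffep-1, trunc gives 0.0 (correct)
// so using the predecessor of 0.5 keeps round-half-away-from-zero semantics
// for values just below the halfway point.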
21523 /// The only differences between FABS and FNEG are the mask and the logic op.
21524 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
21525 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21526 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21527 "Wrong opcode for lowering FABS or FNEG.");
21529 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21531 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21532 // into an FNABS. We'll lower the FABS after that if it is still in use.
21533 if (IsFABS)
21534 for (SDNode *User : Op->uses())
21535 if (User->getOpcode() == ISD::FNEG)
21536 return Op;
21538 SDLoc dl(Op);
21539 MVT VT = Op.getSimpleValueType();
21541 bool IsF128 = (VT == MVT::f128);
21542 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21543 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21544 "Unexpected type in LowerFABSorFNEG");
21546 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21547 // decide if we should generate a 16-byte constant mask when we only need 4 or
21548 // 8 bytes for the scalar case.
21550 // There are no scalar bitwise logical SSE/AVX instructions, so we
21551 // generate a 16-byte vector constant and logic op even for the scalar case.
21552 // Using a 16-byte mask allows folding the load of the mask with
21553   // the logic op, so it can save ~4 bytes of code size.
21554 bool IsFakeVector = !VT.isVector() && !IsF128;
21555 MVT LogicVT = VT;
21556 if (IsFakeVector)
21557 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21558 : (VT == MVT::f32) ? MVT::v4f32
21559 : MVT::v8f16;
21561 unsigned EltBits = VT.getScalarSizeInBits();
21562 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21563 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21564 APInt::getSignMask(EltBits);
21565 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21566 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21568 SDValue Op0 = Op.getOperand(0);
21569 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21570 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21571 IsFNABS ? X86ISD::FOR :
21572 X86ISD::FXOR;
21573 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21575 if (VT.isVector() || IsF128)
21576 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21578 // For the scalar case extend to a 128-bit vector, perform the logic op,
21579 // and extract the scalar result back out.
21580 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21581 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21582 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21583 DAG.getIntPtrConstant(0, dl));
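// Concretely, the masks built above for an f32 element are (splatted across
// the 128-bit logic type in the fake-vector case):
//   FABS : FAND with 0x7FFFFFFF   (clear the sign bit)
//   FNEG : FXOR with 0x80000000   (flip the sign bit)
//   FNABS: FOR  with 0x80000000   (set the sign bit; folds FNEG(FABS(x)))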
21586 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21587 SDValue Mag = Op.getOperand(0);
21588 SDValue Sign = Op.getOperand(1);
21589 SDLoc dl(Op);
21591 // If the sign operand is smaller, extend it first.
21592 MVT VT = Op.getSimpleValueType();
21593 if (Sign.getSimpleValueType().bitsLT(VT))
21594 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21596 // And if it is bigger, shrink it first.
21597 if (Sign.getSimpleValueType().bitsGT(VT))
21598 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21599 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21601 // At this point the operands and the result should have the same
21602 // type, and that won't be f80 since that is not custom lowered.
21603 bool IsF128 = (VT == MVT::f128);
21604 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21605 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21606 "Unexpected type in LowerFCOPYSIGN");
21608 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21610 // Perform all scalar logic operations as 16-byte vectors because there are no
21611 // scalar FP logic instructions in SSE.
21612 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21613 // unnecessary splats, but we might miss load folding opportunities. Should
21614 // this decision be based on OptimizeForSize?
21615 bool IsFakeVector = !VT.isVector() && !IsF128;
21616 MVT LogicVT = VT;
21617 if (IsFakeVector)
21618 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21619 : (VT == MVT::f32) ? MVT::v4f32
21620 : MVT::v8f16;
21622 // The mask constants are automatically splatted for vector types.
21623 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21624 SDValue SignMask = DAG.getConstantFP(
21625 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21626 SDValue MagMask = DAG.getConstantFP(
21627 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21629 // First, clear all bits but the sign bit from the second operand (sign).
21630 if (IsFakeVector)
21631 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21632 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21634 // Next, clear the sign bit from the first operand (magnitude).
21635 // TODO: If we had general constant folding for FP logic ops, this check
21636 // wouldn't be necessary.
21637 SDValue MagBits;
21638 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21639 APFloat APF = Op0CN->getValueAPF();
21640 APF.clearSign();
21641 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21642 } else {
21643 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21644 if (IsFakeVector)
21645 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21646 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21649 // OR the magnitude value with the sign bit.
21650 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21651 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21652 DAG.getIntPtrConstant(0, dl));
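// In bit terms, for a scalar f32 the sequence above computes
//   result = (Mag & 0x7FFFFFFF) | (Sign & 0x80000000)
// e.g. copysign(3.0f, -0.0f):
//   0x40400000 & 0x7FFFFFFF = 0x40400000
//   0x80000000 & 0x80000000 = 0x80000000
//   OR                      = 0xC0400000 = -3.0f
// with the constant-magnitude case folded up front via APFloat::clearSign().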
21655 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21656 SDValue N0 = Op.getOperand(0);
21657 SDLoc dl(Op);
21658 MVT VT = Op.getSimpleValueType();
21660 MVT OpVT = N0.getSimpleValueType();
21661 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21662 "Unexpected type for FGETSIGN");
21664 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21665 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21666 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21667 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21668 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21669 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21670 return Res;
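// Resulting pattern for f32 (f64 is analogous with v2f64):
//   (and (X86ISD::MOVMSK (v4f32 scalar_to_vector %x)), 1)
// MOVMSK packs the sign bit of each lane into the low bits of a GPR, so
// masking with 1 leaves just the sign of lane 0, e.g. -1.5f -> 1, +1.5f -> 0.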
21673 /// Helper for attempting to create a X86ISD::BT node.
21674 static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21675 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21676 // instruction. Since the shift amount is in-range-or-undefined, we know
21677 // that doing a bittest on the i32 value is ok. We extend to i32 because
21678 // the encoding for the i16 version is larger than the i32 version.
21679   // Also promote i16 to i32 for performance / code size reasons.
21680 if (Src.getValueType().getScalarSizeInBits() < 32)
21681 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21683 // No legal type found, give up.
21684 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21685 return SDValue();
21687 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21688   // shorter encoding. Since the former takes BitNo modulo 32 and the latter
21689   // takes it modulo 64, this is only valid if the 5th bit of BitNo is known
21690   // to be zero.
21691 if (Src.getValueType() == MVT::i64 &&
21692 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21693 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21695 // If the operand types disagree, extend the shift amount to match. Since
21696 // BT ignores high bits (like shifts) we can use anyextend.
21697 if (Src.getValueType() != BitNo.getValueType()) {
21698 // Peek through a mask/modulo operation.
21699 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21700 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21701 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21702 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21703 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21704 BitNo.getOperand(0)),
21705 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21706 BitNo.getOperand(1)));
21707 else
21708 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21711 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
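// For reference, BT with a register bit index reduces the index modulo the
// operand width, so the node built above behaves like:
//   CF = (Src >> (BitNo % 32)) & 1     for the 32-bit form
//   CF = (Src >> (BitNo % 64)) & 1     for the 64-bit form
// which is why the i64 -> i32 shrink requires bit 5 of BitNo to be known
// zero, and why any-extending the bit index is safe.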
21714 /// Helper for creating a X86ISD::SETCC node.
21715 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21716 SelectionDAG &DAG) {
21717 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21718 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21721 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
21722 /// recognizable memcmp expansion.
21723 static bool isOrXorXorTree(SDValue X, bool Root = true) {
21724 if (X.getOpcode() == ISD::OR)
21725 return isOrXorXorTree(X.getOperand(0), false) &&
21726 isOrXorXorTree(X.getOperand(1), false);
21727 if (Root)
21728 return false;
21729 return X.getOpcode() == ISD::XOR;
21732 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
21733 /// expansion.
21734 template <typename F>
21735 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
21736 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
21737 SDValue Op0 = X.getOperand(0);
21738 SDValue Op1 = X.getOperand(1);
21739 if (X.getOpcode() == ISD::OR) {
21740 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21741 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21742 if (VecVT != CmpVT)
21743 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
21744 if (HasPT)
21745 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
21746 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
21748 if (X.getOpcode() == ISD::XOR) {
21749 SDValue A = SToV(Op0);
21750 SDValue B = SToV(Op1);
21751 if (VecVT != CmpVT)
21752 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
21753 if (HasPT)
21754 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
21755 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
21757 llvm_unreachable("Impossible");
21760 /// Try to map a 128-bit or larger integer comparison to vector instructions
21761 /// before type legalization splits it up into chunks.
21762 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
21763 ISD::CondCode CC,
21764 const SDLoc &DL,
21765 SelectionDAG &DAG,
21766 const X86Subtarget &Subtarget) {
21767 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
21769 // We're looking for an oversized integer equality comparison.
21770 EVT OpVT = X.getValueType();
21771 unsigned OpSize = OpVT.getSizeInBits();
21772 if (!OpVT.isScalarInteger() || OpSize < 128)
21773 return SDValue();
21775 // Ignore a comparison with zero because that gets special treatment in
21776 // EmitTest(). But make an exception for the special case of a pair of
21777 // logically-combined vector-sized operands compared to zero. This pattern may
21778 // be generated by the memcmp expansion pass with oversized integer compares
21779 // (see PR33325).
21780 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
21781 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
21782 return SDValue();
21784 // Don't perform this combine if constructing the vector will be expensive.
21785 auto IsVectorBitCastCheap = [](SDValue X) {
21786 X = peekThroughBitcasts(X);
21787 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
21788 X.getOpcode() == ISD::LOAD;
21790 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
21791 !IsOrXorXorTreeCCZero)
21792 return SDValue();
21794 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
21795 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
21796 // Otherwise use PCMPEQ (plus AND) and mask testing.
21797 bool NoImplicitFloatOps =
21798 DAG.getMachineFunction().getFunction().hasFnAttribute(
21799 Attribute::NoImplicitFloat);
21800 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
21801 ((OpSize == 128 && Subtarget.hasSSE2()) ||
21802 (OpSize == 256 && Subtarget.hasAVX()) ||
21803 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
21804 bool HasPT = Subtarget.hasSSE41();
21806     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
21807 // vector registers are essentially free. (Technically, widening registers
21808 // prevents load folding, but the tradeoff is worth it.)
21809 bool PreferKOT = Subtarget.preferMaskRegisters();
21810 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
21812 EVT VecVT = MVT::v16i8;
21813 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
21814 if (OpSize == 256) {
21815 VecVT = MVT::v32i8;
21816 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
21818 EVT CastVT = VecVT;
21819 bool NeedsAVX512FCast = false;
21820 if (OpSize == 512 || NeedZExt) {
21821 if (Subtarget.hasBWI()) {
21822 VecVT = MVT::v64i8;
21823 CmpVT = MVT::v64i1;
21824 if (OpSize == 512)
21825 CastVT = VecVT;
21826 } else {
21827 VecVT = MVT::v16i32;
21828 CmpVT = MVT::v16i1;
21829 CastVT = OpSize == 512 ? VecVT
21830 : OpSize == 256 ? MVT::v8i32
21831 : MVT::v4i32;
21832 NeedsAVX512FCast = true;
21836 auto ScalarToVector = [&](SDValue X) -> SDValue {
21837 bool TmpZext = false;
21838 EVT TmpCastVT = CastVT;
21839 if (X.getOpcode() == ISD::ZERO_EXTEND) {
21840 SDValue OrigX = X.getOperand(0);
21841 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
21842 if (OrigSize < OpSize) {
21843 if (OrigSize == 128) {
21844 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
21845 X = OrigX;
21846 TmpZext = true;
21847 } else if (OrigSize == 256) {
21848 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
21849 X = OrigX;
21850 TmpZext = true;
21854 X = DAG.getBitcast(TmpCastVT, X);
21855 if (!NeedZExt && !TmpZext)
21856 return X;
21857 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
21858 DAG.getConstant(0, DL, VecVT), X,
21859 DAG.getVectorIdxConstant(0, DL));
21862 SDValue Cmp;
21863 if (IsOrXorXorTreeCCZero) {
21864 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
21865 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
21866 // Use 2 vector equality compares and 'and' the results before doing a
21867 // MOVMSK.
21868 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
21869 } else {
21870 SDValue VecX = ScalarToVector(X);
21871 SDValue VecY = ScalarToVector(Y);
21872 if (VecVT != CmpVT) {
21873 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
21874 } else if (HasPT) {
21875 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
21876 } else {
21877 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
21880 // AVX512 should emit a setcc that will lower to kortest.
21881 if (VecVT != CmpVT) {
21882 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
21883 : CmpVT == MVT::v32i1 ? MVT::i32
21884 : MVT::i16;
21885 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
21886 DAG.getConstant(0, DL, KRegVT), CC);
21888 if (HasPT) {
21889 SDValue BCCmp =
21890 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
21891 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
21892 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
21893 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
21894 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
21896 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
21897 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
21898 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
21899 assert(Cmp.getValueType() == MVT::v16i8 &&
21900 "Non 128-bit vector on pre-SSE41 target");
21901 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
21902 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
21903 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
21906 return SDValue();
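// Rough end-to-end example of what this combine targets, assuming SSE4.1 and
// a memcmp already expanded to an oversized integer compare:
//   %a = load i128, ptr %p
//   %b = load i128, ptr %q
//   %c = icmp eq i128 %a, %b
// becomes a 128-bit XOR of the two loaded values followed by PTEST and a
// SETE/SETNE of the flags, rather than a pair of 64-bit scalar compares.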
21909 /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
21910 /// style scalarized (associative) reduction patterns. Partial reductions
21911 /// are supported when the pointer SrcMask is non-null.
21912 /// TODO - move this to SelectionDAG?
21913 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
21914 SmallVectorImpl<SDValue> &SrcOps,
21915 SmallVectorImpl<APInt> *SrcMask = nullptr) {
21916 SmallVector<SDValue, 8> Opnds;
21917 DenseMap<SDValue, APInt> SrcOpMap;
21918 EVT VT = MVT::Other;
21920   // Recognize a special case where a vector is cast into a wide integer to
21921   // test all 0s.
21922 assert(Op.getOpcode() == unsigned(BinOp) &&
21923 "Unexpected bit reduction opcode");
21924 Opnds.push_back(Op.getOperand(0));
21925 Opnds.push_back(Op.getOperand(1));
21927 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
21928 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
21929 // BFS traverse all BinOp operands.
21930 if (I->getOpcode() == unsigned(BinOp)) {
21931 Opnds.push_back(I->getOperand(0));
21932 Opnds.push_back(I->getOperand(1));
21933 // Re-evaluate the number of nodes to be traversed.
21934 e += 2; // 2 more nodes (LHS and RHS) are pushed.
21935 continue;
21938     // Quit if this is not an EXTRACT_VECTOR_ELT.
21939 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
21940 return false;
21942 // Quit if without a constant index.
21943 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
21944 if (!Idx)
21945 return false;
21947 SDValue Src = I->getOperand(0);
21948 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
21949 if (M == SrcOpMap.end()) {
21950 VT = Src.getValueType();
21951 // Quit if not the same type.
21952 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
21953 return false;
21954 unsigned NumElts = VT.getVectorNumElements();
21955 APInt EltCount = APInt::getZero(NumElts);
21956 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
21957 SrcOps.push_back(Src);
21960 // Quit if element already used.
21961 unsigned CIdx = Idx->getZExtValue();
21962 if (M->second[CIdx])
21963 return false;
21964 M->second.setBit(CIdx);
21967 if (SrcMask) {
21968 // Collect the source partial masks.
21969 for (SDValue &SrcOp : SrcOps)
21970 SrcMask->push_back(SrcOpMap[SrcOp]);
21971 } else {
21972 // Quit if not all elements are used.
21973 for (const auto &I : SrcOpMap)
21974 if (!I.second.isAllOnes())
21975 return false;
21978 return true;
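// Example of a tree this matches (BinOp = ISD::OR, single source X):
//   or (extractelt X, 0),
//      (or (extractelt X, 1), (or (extractelt X, 2), (extractelt X, 3)))
// SrcOps ends up as {X}. With a null SrcMask this only succeeds when every
// element of X is used exactly once; otherwise the per-source element masks
// are returned so callers can handle partial reductions.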
21981 // Helper function for comparing all bits of two vectors.
21982 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
21983 ISD::CondCode CC, const APInt &OriginalMask,
21984 const X86Subtarget &Subtarget,
21985 SelectionDAG &DAG, X86::CondCode &X86CC) {
21986 EVT VT = LHS.getValueType();
21987 unsigned ScalarSize = VT.getScalarSizeInBits();
21988 if (OriginalMask.getBitWidth() != ScalarSize) {
21989 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
21990 return SDValue();
21993   // Quit if not convertible to a legal scalar or 128/256-bit vector.
21994 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
21995 return SDValue();
21997 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
21998 if (VT.isFloatingPoint())
21999 return SDValue();
22001 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22002 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22004 APInt Mask = OriginalMask;
22006 auto MaskBits = [&](SDValue Src) {
22007 if (Mask.isAllOnes())
22008 return Src;
22009 EVT SrcVT = Src.getValueType();
22010 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22011 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22014 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22015 if (VT.getSizeInBits() < 128) {
22016 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22017 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22018 if (IntVT != MVT::i64)
22019 return SDValue();
22020 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22021 MVT::i32, MVT::i32);
22022 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22023 MVT::i32, MVT::i32);
22024 SDValue Lo =
22025 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22026 SDValue Hi =
22027 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22028 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22029 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22030 DAG.getConstant(0, DL, MVT::i32));
22032 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22033 DAG.getBitcast(IntVT, MaskBits(LHS)),
22034 DAG.getBitcast(IntVT, MaskBits(RHS)));
22037 // Without PTEST, a masked v2i64 or-reduction is not faster than
22038 // scalarization.
22039 bool UseKORTEST = Subtarget.useAVX512Regs();
22040 bool UsePTEST = Subtarget.hasSSE41();
22041 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22042 return SDValue();
22044 // Split down to 128/256/512-bit vector.
22045 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22047 // If the input vector has vector elements wider than the target test size,
22048 // then cast to <X x i64> so it will safely split.
22049 if (ScalarSize > TestSize) {
22050 if (!Mask.isAllOnes())
22051 return SDValue();
22052 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22053 LHS = DAG.getBitcast(VT, LHS);
22054 RHS = DAG.getBitcast(VT, RHS);
22055 Mask = APInt::getAllOnes(64);
22058 if (VT.getSizeInBits() > TestSize) {
22059 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22060 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22061       // If this is ICMP(AND(LHS,MASK),MASK), reduce using AND splits.
22062 while (VT.getSizeInBits() > TestSize) {
22063 auto Split = DAG.SplitVector(LHS, DL);
22064 VT = Split.first.getValueType();
22065 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22067 RHS = DAG.getAllOnesConstant(DL, VT);
22068 } else if (!UsePTEST && !KnownRHS.isZero()) {
22069 // MOVMSK Special Case:
22070 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22071 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22072 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22073 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22074 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22075 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22076 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22077 V = DAG.getSExtOrTrunc(V, DL, VT);
22078 while (VT.getSizeInBits() > TestSize) {
22079 auto Split = DAG.SplitVector(V, DL);
22080 VT = Split.first.getValueType();
22081 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22083 V = DAG.getNOT(DL, V, VT);
22084 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22085 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22086 DAG.getConstant(0, DL, MVT::i32));
22087 } else {
22088 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22089 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22090 while (VT.getSizeInBits() > TestSize) {
22091 auto Split = DAG.SplitVector(V, DL);
22092 VT = Split.first.getValueType();
22093 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22095 LHS = V;
22096 RHS = DAG.getConstant(0, DL, VT);
22100 if (UseKORTEST && VT.is512BitVector()) {
22101 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22102 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22103 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22104 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22105 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22106 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22109 if (UsePTEST) {
22110 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22111 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22112 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22113 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22114 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22117 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22118 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22119 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22120 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22121 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22122 V = DAG.getNOT(DL, V, MaskVT);
22123 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22124 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22125 DAG.getConstant(0, DL, MVT::i32));
22128 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22129 // to CMP(MOVMSK(PCMPEQB(X,Y))).
22130 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22131 ISD::CondCode CC, const SDLoc &DL,
22132 const X86Subtarget &Subtarget,
22133 SelectionDAG &DAG,
22134 X86::CondCode &X86CC) {
22135 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22137 bool CmpNull = isNullConstant(RHS);
22138 bool CmpAllOnes = isAllOnesConstant(RHS);
22139 if (!CmpNull && !CmpAllOnes)
22140 return SDValue();
22142 SDValue Op = LHS;
22143 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22144 return SDValue();
22146 // Check whether we're masking/truncating an OR-reduction result, in which
22147 // case track the masked bits.
22148 // TODO: Add CmpAllOnes support.
22149 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22150 if (CmpNull) {
22151 switch (Op.getOpcode()) {
22152 case ISD::TRUNCATE: {
22153 SDValue Src = Op.getOperand(0);
22154 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22155 Op.getScalarValueSizeInBits());
22156 Op = Src;
22157 break;
22159 case ISD::AND: {
22160 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22161 Mask = Cst->getAPIntValue();
22162 Op = Op.getOperand(0);
22164 break;
22169 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22171 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22172 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22173 SmallVector<SDValue, 8> VecIns;
22174 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22175 EVT VT = VecIns[0].getValueType();
22176 assert(llvm::all_of(VecIns,
22177 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22178 "Reduction source vector mismatch");
22180 // Quit if not splittable to scalar/128/256/512-bit vector.
22181 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22182 return SDValue();
22184 // If more than one full vector is evaluated, AND/OR them first before
22185 // PTEST.
22186 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22187 Slot += 2, e += 1) {
22188 // Each iteration will AND/OR 2 nodes and append the result until there is
22189 // only 1 node left, i.e. the final value of all vectors.
22190 SDValue LHS = VecIns[Slot];
22191 SDValue RHS = VecIns[Slot + 1];
22192 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22195 return LowerVectorAllEqual(DL, VecIns.back(),
22196 CmpNull ? DAG.getConstant(0, DL, VT)
22197 : DAG.getAllOnesConstant(DL, VT),
22198 CC, Mask, Subtarget, DAG, X86CC);
22201 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22202 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22203 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22204 ISD::NodeType BinOp;
22205 if (SDValue Match =
22206 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22207 EVT MatchVT = Match.getValueType();
22208 return LowerVectorAllEqual(DL, Match,
22209 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22210 : DAG.getAllOnesConstant(DL, MatchVT),
22211 CC, Mask, Subtarget, DAG, X86CC);
22215 if (Mask.isAllOnes()) {
22216 assert(!Op.getValueType().isVector() &&
22217 "Illegal vector type for reduction pattern");
22218 SDValue Src = peekThroughBitcasts(Op);
22219 if (Src.getValueType().isFixedLengthVector() &&
22220 Src.getValueType().getScalarType() == MVT::i1) {
22221 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22222 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22223 if (Src.getOpcode() == ISD::SETCC) {
22224 SDValue LHS = Src.getOperand(0);
22225 SDValue RHS = Src.getOperand(1);
22226 EVT LHSVT = LHS.getValueType();
22227 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22228 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22229 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22230 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22231 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22232 X86CC);
22235 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22236 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22237 // Peek through truncation, mask the LSB and compare against zero/LSB.
22238 if (Src.getOpcode() == ISD::TRUNCATE) {
22239 SDValue Inner = Src.getOperand(0);
22240 EVT InnerVT = Inner.getValueType();
22241 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22242 unsigned BW = InnerVT.getScalarSizeInBits();
22243 APInt SrcMask = APInt(BW, 1);
22244 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22245 return LowerVectorAllEqual(DL, Inner,
22246 DAG.getConstant(Cmp, DL, InnerVT), CC,
22247 SrcMask, Subtarget, DAG, X86CC);
22253 return SDValue();
22256 /// Return true if \c Op has a use that doesn't just read flags.
22257 static bool hasNonFlagsUse(SDValue Op) {
22258 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22259 ++UI) {
22260 SDNode *User = *UI;
22261 unsigned UOpNo = UI.getOperandNo();
22262 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22263       // Look past the truncate.
22264 UOpNo = User->use_begin().getOperandNo();
22265 User = *User->use_begin();
22268 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22269 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22270 return true;
22272 return false;
22275 // Transform to an x86-specific ALU node with flags if there is a chance of
22276 // using an RMW op or only the flags are used. Otherwise, leave
22277 // the node alone and emit a 'cmp' or 'test' instruction.
22278 static bool isProfitableToUseFlagOp(SDValue Op) {
22279 for (SDNode *U : Op->uses())
22280 if (U->getOpcode() != ISD::CopyToReg &&
22281 U->getOpcode() != ISD::SETCC &&
22282 U->getOpcode() != ISD::STORE)
22283 return false;
22285 return true;
22288 /// Emit nodes that will be selected as "test Op0,Op0", or something
22289 /// equivalent.
22290 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22291 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22292 // CF and OF aren't always set the way we want. Determine which
22293 // of these we need.
22294 bool NeedCF = false;
22295 bool NeedOF = false;
22296 switch (X86CC) {
22297 default: break;
22298 case X86::COND_A: case X86::COND_AE:
22299 case X86::COND_B: case X86::COND_BE:
22300 NeedCF = true;
22301 break;
22302 case X86::COND_G: case X86::COND_GE:
22303 case X86::COND_L: case X86::COND_LE:
22304 case X86::COND_O: case X86::COND_NO: {
22305     // Check if we really need to set the Overflow flag.
22306     // If NoSignedWrap is present, setting it is not
22307     // actually needed.
22308 switch (Op->getOpcode()) {
22309 case ISD::ADD:
22310 case ISD::SUB:
22311 case ISD::MUL:
22312 case ISD::SHL:
22313 if (Op.getNode()->getFlags().hasNoSignedWrap())
22314 break;
22315 [[fallthrough]];
22316 default:
22317 NeedOF = true;
22318 break;
22320 break;
22323 // See if we can use the EFLAGS value from the operand instead of
22324 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22325 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22326 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22327 // Emit a CMP with 0, which is the TEST pattern.
22328 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22329 DAG.getConstant(0, dl, Op.getValueType()));
22331 unsigned Opcode = 0;
22332 unsigned NumOperands = 0;
22334 SDValue ArithOp = Op;
22336   // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
22337   // which may be the result of a cast. We use the variable 'Op', the
22338   // non-cast value, when we check for possible users.
22339 switch (ArithOp.getOpcode()) {
22340 case ISD::AND:
22341 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22342 // because a TEST instruction will be better.
22343 if (!hasNonFlagsUse(Op))
22344 break;
22346 [[fallthrough]];
22347 case ISD::ADD:
22348 case ISD::SUB:
22349 case ISD::OR:
22350 case ISD::XOR:
22351 if (!isProfitableToUseFlagOp(Op))
22352 break;
22354 // Otherwise use a regular EFLAGS-setting instruction.
22355 switch (ArithOp.getOpcode()) {
22356 default: llvm_unreachable("unexpected operator!");
22357 case ISD::ADD: Opcode = X86ISD::ADD; break;
22358 case ISD::SUB: Opcode = X86ISD::SUB; break;
22359 case ISD::XOR: Opcode = X86ISD::XOR; break;
22360 case ISD::AND: Opcode = X86ISD::AND; break;
22361 case ISD::OR: Opcode = X86ISD::OR; break;
22364 NumOperands = 2;
22365 break;
22366 case X86ISD::ADD:
22367 case X86ISD::SUB:
22368 case X86ISD::OR:
22369 case X86ISD::XOR:
22370 case X86ISD::AND:
22371 return SDValue(Op.getNode(), 1);
22372 case ISD::SSUBO:
22373 case ISD::USUBO: {
22374     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22375 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22376 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22377 Op->getOperand(1)).getValue(1);
22379 default:
22380 break;
22383 if (Opcode == 0) {
22384 // Emit a CMP with 0, which is the TEST pattern.
22385 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22386 DAG.getConstant(0, dl, Op.getValueType()));
22388 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22389 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22391 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22392 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22393 return SDValue(New.getNode(), 1);
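// Minimal sketch of the payoff: for
//   %s = sub i32 %a, %b
//   %c = icmp eq i32 %s, 0
// the compare reaches here as a CMP of %s against 0; converting the SUB into
// X86ISD::SUB and returning its flag result lets the SETCC/BRCOND read ZF
// directly from the subtraction, so no separate TEST/CMP is emitted.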
22396 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
22397 /// equivalent.
22398 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22399 const SDLoc &dl, SelectionDAG &DAG,
22400 const X86Subtarget &Subtarget) {
22401 if (isNullConstant(Op1))
22402 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22404 EVT CmpVT = Op0.getValueType();
22406 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22407 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22409   // Only promote the compare up to i32 if it is a 16-bit operation
22410   // with an immediate; 16-bit immediates are to be avoided.
22411 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22412 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22413 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22414 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22415     // Don't do this if the immediate can fit in 8 bits.
22416 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22417 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22418 unsigned ExtendOp =
22419 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22420 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22421 // For equality comparisons try to use SIGN_EXTEND if the input was
22422       // truncated from something with enough sign bits.
22423 if (Op0.getOpcode() == ISD::TRUNCATE) {
22424 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22425 ExtendOp = ISD::SIGN_EXTEND;
22426 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22427 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22428 ExtendOp = ISD::SIGN_EXTEND;
22432 CmpVT = MVT::i32;
22433 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22434 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22438 // Try to shrink i64 compares if the input has enough zero bits.
22439 // FIXME: Do this for non-constant compares for constant on LHS?
22440 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22441 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22442 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22443 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22444 CmpVT = MVT::i32;
22445 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22446 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22449 // 0-x == y --> x+y == 0
22450 // 0-x != y --> x+y != 0
22451 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22452 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22453 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22454 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22455 return Add.getValue(1);
22458 // x == 0-y --> x+y == 0
22459 // x != 0-y --> x+y != 0
22460 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22461 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22462 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22463 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22464 return Add.getValue(1);
22467 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22468 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22469 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22470 return Sub.getValue(1);
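// Note on the i16 promotion above: a compare with a 16-bit immediate needs
// the 0x66 operand-size prefix, and that length-changing prefix can stall
// the pre-decoder on a number of Intel cores. Roughly:
//   cmp $0x1234, %ax       ; 66 3D 34 12        (LCP penalty)
//   cmp $0x1234, %eax      ; 3D 34 12 00 00     (no penalty, after widening)
// so the operands are widened to i32 unless the immediate already fits in a
// signed 8-bit field, the target is Atom, or we are optimizing for size.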
22473 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22474 EVT VT) const {
22475 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22478 bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22479 SDNode *N, SDValue, SDValue IntPow2) const {
22480 if (N->getOpcode() == ISD::FDIV)
22481 return true;
22483 EVT FPVT = N->getValueType(0);
22484 EVT IntVT = IntPow2.getValueType();
22486 // This indicates a non-free bitcast.
22487 // TODO: This is probably overly conservative as we will need to scale the
22488 // integer vector anyways for the int->fp cast.
22489 if (FPVT.isVector() &&
22490 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22491 return false;
22493 return true;
22496 /// Check if replacement of SQRT with RSQRT should be disabled.
22497 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22498 EVT VT = Op.getValueType();
22500 // We don't need to replace SQRT with RSQRT for half type.
22501 if (VT.getScalarType() == MVT::f16)
22502 return true;
22504 // We never want to use both SQRT and RSQRT instructions for the same input.
22505 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22506 return false;
22508 if (VT.isVector())
22509 return Subtarget.hasFastVectorFSQRT();
22510 return Subtarget.hasFastScalarFSQRT();
22513 /// The minimum architected relative accuracy is 2^-12. We need one
22514 /// Newton-Raphson step to have a good float result (24 bits of precision).
22515 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22516 SelectionDAG &DAG, int Enabled,
22517 int &RefinementSteps,
22518 bool &UseOneConstNR,
22519 bool Reciprocal) const {
22520 SDLoc DL(Op);
22521 EVT VT = Op.getValueType();
22523 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22524 // It is likely not profitable to do this for f64 because a double-precision
22525 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22526 // instructions: convert to single, rsqrtss, convert back to double, refine
22527 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22528 // along with FMA, this could be a throughput win.
22529 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22530 // after legalize types.
22531 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22532 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22533 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22534 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22535 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22536 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22537 RefinementSteps = 1;
22539 UseOneConstNR = false;
22540 // There is no FSQRT for 512-bits, but there is RSQRT14.
22541 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22542 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22543 if (RefinementSteps == 0 && !Reciprocal)
22544 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22545 return Estimate;
22548 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22549 Subtarget.hasFP16()) {
22550 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22551 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22552 RefinementSteps = 0;
22554 if (VT == MVT::f16) {
22555 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22556 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22557 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22558 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22559 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22562 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22564 return SDValue();
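// For reference, the refinement applied to this estimate by the generic
// estimate-building code is one Newton-Raphson step on the reciprocal square
// root per requested iteration, roughly:
//   est' = est * (1.5 - 0.5 * x * est * est)
// which lifts the ~12-bit RSQRT/RSQRT14 estimate to near full f32 precision.
// The square root itself is then formed as x * (refined estimate), which is
// also why the RefinementSteps == 0 case above multiplies by Op directly.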
22567 /// The minimum architected relative accuracy is 2^-12. We need one
22568 /// Newton-Raphson step to have a good float result (24 bits of precision).
22569 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22570 int Enabled,
22571 int &RefinementSteps) const {
22572 SDLoc DL(Op);
22573 EVT VT = Op.getValueType();
22575 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22576 // It is likely not profitable to do this for f64 because a double-precision
22577 // reciprocal estimate with refinement on x86 prior to FMA requires
22578 // 15 instructions: convert to single, rcpss, convert back to double, refine
22579 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22580 // along with FMA, this could be a throughput win.
22582 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22583 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22584 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22585 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22586 // Enable estimate codegen with 1 refinement step for vector division.
22587 // Scalar division estimates are disabled because they break too much
22588 // real-world code. These defaults are intended to match GCC behavior.
22589 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22590 return SDValue();
22592 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22593 RefinementSteps = 1;
22595 // There is no FSQRT for 512-bits, but there is RCP14.
22596 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22597 return DAG.getNode(Opcode, DL, VT, Op);
22600 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22601 Subtarget.hasFP16()) {
22602 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22603 RefinementSteps = 0;
22605 if (VT == MVT::f16) {
22606 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22607 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22608 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22609 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22610 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22613 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22615 return SDValue();
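// Likewise, the generic refinement for a reciprocal estimate is one
// Newton-Raphson step per requested iteration, roughly:
//   est' = est + est * (1.0 - x * est)    (equivalently est * (2.0 - x * est))
// taking the ~12-bit RCP/RCP14 estimate to near full f32 precision.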
22618 /// If we have at least two divisions that use the same divisor, convert to
22619 /// multiplication by a reciprocal. This may need to be adjusted for a given
22620 /// CPU if a division's cost is not at least twice the cost of a multiplication.
22621 /// This is because we still need one division to calculate the reciprocal and
22622 /// then we need two multiplies by that reciprocal as replacements for the
22623 /// original divisions.
22624 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22625 return 2;
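// Worked example of this threshold (under the usual reciprocal fast-math
// conditions): given y = a / d and z = b / d, the combine rewrites them as
//   t = 1.0 / d;  y = a * t;  z = b * t;
// i.e. one division plus two multiplies replaces two divisions, which only
// pays off once at least two divisions share a divisor, hence returning 2.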
22628 SDValue
22629 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22630 SelectionDAG &DAG,
22631 SmallVectorImpl<SDNode *> &Created) const {
22632 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22633 if (isIntDivCheap(N->getValueType(0), Attr))
22634 return SDValue(N,0); // Lower SDIV as SDIV
22636 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22637 "Unexpected divisor!");
22639   // Only perform this transform if CMOV is supported; otherwise the select
22640   // below will become a branch.
22641 if (!Subtarget.canUseCMOV())
22642 return SDValue();
22644 // fold (sdiv X, pow2)
22645 EVT VT = N->getValueType(0);
22646 // FIXME: Support i8.
22647 if (VT != MVT::i16 && VT != MVT::i32 &&
22648 !(Subtarget.is64Bit() && VT == MVT::i64))
22649 return SDValue();
22651 // If the divisor is 2 or -2, the default expansion is better.
22652 if (Divisor == 2 ||
22653 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22654 return SDValue();
22656 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
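// Sketch of the CMOV-based expansion requested above, for a positive
// power-of-2 divisor 2^k (e.g. k = 3, i.e. sdiv by 8):
//   t   = x + (2^k - 1)        ; the bias only matters when x is negative
//   sel = (x < 0) ? t : x      ; selected with a CMOV rather than a branch
//   res = sel >> k             ; arithmetic shift, rounds toward zero
// A negated power-of-2 divisor additionally negates the result. Divisors of
// +/-2 keep the cheaper default expansion, as checked above.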
22659 /// Result of 'and' is compared against zero. Change to a BT node if possible.
22660 /// Returns the BT node and the condition code needed to use it.
22661 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22662 SelectionDAG &DAG, X86::CondCode &X86CC) {
22663 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22664 SDValue Op0 = And.getOperand(0);
22665 SDValue Op1 = And.getOperand(1);
22666 if (Op0.getOpcode() == ISD::TRUNCATE)
22667 Op0 = Op0.getOperand(0);
22668 if (Op1.getOpcode() == ISD::TRUNCATE)
22669 Op1 = Op1.getOperand(0);
22671 SDValue Src, BitNo;
22672 if (Op1.getOpcode() == ISD::SHL)
22673 std::swap(Op0, Op1);
22674 if (Op0.getOpcode() == ISD::SHL) {
22675 if (isOneConstant(Op0.getOperand(0))) {
22676 // If we looked past a truncate, check that it's only truncating away
22677 // known zeros.
22678 unsigned BitWidth = Op0.getValueSizeInBits();
22679 unsigned AndBitWidth = And.getValueSizeInBits();
22680 if (BitWidth > AndBitWidth) {
22681 KnownBits Known = DAG.computeKnownBits(Op0);
22682 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22683 return SDValue();
22685 Src = Op1;
22686 BitNo = Op0.getOperand(1);
22688 } else if (Op1.getOpcode() == ISD::Constant) {
22689 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22690 uint64_t AndRHSVal = AndRHS->getZExtValue();
22691 SDValue AndLHS = Op0;
22693 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22694 Src = AndLHS.getOperand(0);
22695 BitNo = AndLHS.getOperand(1);
22696 } else {
22697 // Use BT if the immediate can't be encoded in a TEST instruction or we
22698       // are optimizing for size and the immediate won't fit in a byte.
22699 bool OptForSize = DAG.shouldOptForSize();
22700 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22701 isPowerOf2_64(AndRHSVal)) {
22702 Src = AndLHS;
22703 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22704 Src.getValueType());
22709 // No patterns found, give up.
22710 if (!Src.getNode())
22711 return SDValue();
22713 // Remove any bit flip.
22714 if (isBitwiseNot(Src)) {
22715 Src = Src.getOperand(0);
22716 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
22719 // Attempt to create the X86ISD::BT node.
22720 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
22721 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22722 return BT;
22725 return SDValue();
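// Typical patterns this helper recognizes:
//   ((x >> n) & 1) != 0     -> BT x, n with condition COND_B  (CF set)
//   (x & (1 << n)) == 0     -> BT x, n with condition COND_AE (CF clear)
//   (x & 0x100000000) != 0  -> BT x, 32 when the power-of-2 mask cannot be
//                              encoded by a 32-bit TEST immediate
// A bitwise-not of the source is stripped first and the condition inverted.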
22728 // Check if a pre-AVX condition code can be handled by a single FCMP op.
22729 static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
22730 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
22733 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22734 /// CMPs.
22735 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22736 SDValue &Op1, bool &IsAlwaysSignaling) {
22737 unsigned SSECC;
22738 bool Swap = false;
22740 // SSE Condition code mapping:
22741 // 0 - EQ
22742 // 1 - LT
22743 // 2 - LE
22744 // 3 - UNORD
22745 // 4 - NEQ
22746 // 5 - NLT
22747 // 6 - NLE
22748 // 7 - ORD
22749 switch (SetCCOpcode) {
22750 default: llvm_unreachable("Unexpected SETCC condition");
22751 case ISD::SETOEQ:
22752 case ISD::SETEQ: SSECC = 0; break;
22753 case ISD::SETOGT:
22754 case ISD::SETGT: Swap = true; [[fallthrough]];
22755 case ISD::SETLT:
22756 case ISD::SETOLT: SSECC = 1; break;
22757 case ISD::SETOGE:
22758 case ISD::SETGE: Swap = true; [[fallthrough]];
22759 case ISD::SETLE:
22760 case ISD::SETOLE: SSECC = 2; break;
22761 case ISD::SETUO: SSECC = 3; break;
22762 case ISD::SETUNE:
22763 case ISD::SETNE: SSECC = 4; break;
22764 case ISD::SETULE: Swap = true; [[fallthrough]];
22765 case ISD::SETUGE: SSECC = 5; break;
22766 case ISD::SETULT: Swap = true; [[fallthrough]];
22767 case ISD::SETUGT: SSECC = 6; break;
22768 case ISD::SETO: SSECC = 7; break;
22769 case ISD::SETUEQ: SSECC = 8; break;
22770 case ISD::SETONE: SSECC = 12; break;
22772 if (Swap)
22773 std::swap(Op0, Op1);
22775 switch (SetCCOpcode) {
22776 default:
22777 IsAlwaysSignaling = true;
22778 break;
22779 case ISD::SETEQ:
22780 case ISD::SETOEQ:
22781 case ISD::SETUEQ:
22782 case ISD::SETNE:
22783 case ISD::SETONE:
22784 case ISD::SETUNE:
22785 case ISD::SETO:
22786 case ISD::SETUO:
22787 IsAlwaysSignaling = false;
22788 break;
22791 return SSECC;
22794 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22795 /// concatenate the result back.
22796 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22797 ISD::CondCode Cond, SelectionDAG &DAG,
22798 const SDLoc &dl) {
22799 assert(VT.isInteger() && VT == LHS.getValueType() &&
22800 VT == RHS.getValueType() && "Unsupported VTs!");
22802 SDValue CC = DAG.getCondCode(Cond);
22804 // Extract the LHS Lo/Hi vectors
22805 SDValue LHS1, LHS2;
22806 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22808 // Extract the RHS Lo/Hi vectors
22809 SDValue RHS1, RHS2;
22810 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22812 // Issue the operation on the smaller types and concatenate the result back
22813 EVT LoVT, HiVT;
22814 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22815 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22816 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22817 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22820 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22822 SDValue Op0 = Op.getOperand(0);
22823 SDValue Op1 = Op.getOperand(1);
22824 SDValue CC = Op.getOperand(2);
22825 MVT VT = Op.getSimpleValueType();
22826 SDLoc dl(Op);
22828 assert(VT.getVectorElementType() == MVT::i1 &&
22829 "Cannot set masked compare for this operation");
22831 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22833 // Prefer SETGT over SETLT.
22834 if (SetCCOpcode == ISD::SETLT) {
22835 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22836 std::swap(Op0, Op1);
22839 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22842 /// Given a buildvector constant, return a new vector constant with each element
22843 /// incremented or decremented. If incrementing or decrementing would result in
22844 /// unsigned overflow or underflow, or this is not a simple vector constant,
22845 /// return an empty value.
22846 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
22847 bool NSW) {
22848 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22849 if (!BV || !V.getValueType().isSimple())
22850 return SDValue();
22852 MVT VT = V.getSimpleValueType();
22853 MVT EltVT = VT.getVectorElementType();
22854 unsigned NumElts = VT.getVectorNumElements();
22855 SmallVector<SDValue, 8> NewVecC;
22856 SDLoc DL(V);
22857 for (unsigned i = 0; i < NumElts; ++i) {
22858 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22859 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22860 return SDValue();
22862 // Avoid overflow/underflow.
22863 const APInt &EltC = Elt->getAPIntValue();
22864 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
22865 return SDValue();
22866 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
22867 (!IsInc && EltC.isMinSignedValue())))
22868 return SDValue();
22870 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22873 return DAG.getBuildVector(VT, DL, NewVecC);
22876 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22877 /// Op0 u<= Op1:
22878 /// t = psubus Op0, Op1
22879 /// pcmpeq t, <0..0>
22880 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22881 ISD::CondCode Cond, const SDLoc &dl,
22882 const X86Subtarget &Subtarget,
22883 SelectionDAG &DAG) {
22884 if (!Subtarget.hasSSE2())
22885 return SDValue();
22887 MVT VET = VT.getVectorElementType();
22888 if (VET != MVT::i8 && VET != MVT::i16)
22889 return SDValue();
22891 switch (Cond) {
22892 default:
22893 return SDValue();
22894 case ISD::SETULT: {
22895     // If the comparison is against a constant, we can turn this into a
22896     // setule. With psubus, setule does not require a swap. This is
22897     // beneficial because the constant in the register is no longer
22898     // clobbered as the destination, so it can be hoisted out of a loop.
22899     // Only do this pre-AVX, since with AVX the vpcmp* forms are no longer destructive.
22900 if (Subtarget.hasAVX())
22901 return SDValue();
22902 SDValue ULEOp1 =
22903 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
22904 if (!ULEOp1)
22905 return SDValue();
22906 Op1 = ULEOp1;
22907 break;
22909 case ISD::SETUGT: {
22910 // If the comparison is against a constant, we can turn this into a setuge.
22911 // This is beneficial because materializing a constant 0 for the PCMPEQ is
22912 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
22913 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
22914 SDValue UGEOp1 =
22915 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
22916 if (!UGEOp1)
22917 return SDValue();
22918 Op1 = Op0;
22919 Op0 = UGEOp1;
22920 break;
22922 // Psubus is better than flip-sign because it requires no inversion.
22923 case ISD::SETUGE:
22924 std::swap(Op0, Op1);
22925 break;
22926 case ISD::SETULE:
22927 break;
22930 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
22931 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
22932 DAG.getConstant(0, dl, VT));
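// Scalar model of the PSUBUS rewrite above (illustrative only, under #if 0):
// unsigned saturating subtraction is zero exactly when the first operand is
// u<= the second, so one psubus plus one pcmpeq-with-zero implements the
// compare. The SETULT case relies on C != 0, which incDecVectorConstant
// guarantees by bailing out on underflow.
#if 0
#include <cstdint>
static uint8_t usubsat8(uint8_t A, uint8_t B) { return A > B ? uint8_t(A - B) : 0; }
static bool ule8(uint8_t A, uint8_t B) { return usubsat8(A, B) == 0; } // A u<= B
static bool ult8(uint8_t A, uint8_t C) {                               // A u< C, C != 0
  return usubsat8(A, uint8_t(C - 1)) == 0;                             // A u<= C-1
}
#endif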
22935 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
22936 SelectionDAG &DAG) {
22937 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22938 Op.getOpcode() == ISD::STRICT_FSETCCS;
22939 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22940 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22941 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
22942 MVT VT = Op->getSimpleValueType(0);
22943 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
22944 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
22945 SDLoc dl(Op);
22947 if (isFP) {
22948 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
22949 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
22950 if (isSoftF16(EltVT, Subtarget))
22951 return SDValue();
22953 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22954 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22956 // If we have a strict compare with a vXi1 result and the input is 128/256
22957 // bits we can't use a masked compare unless we have VLX. If we use a wider
22958 // compare like we do for non-strict, we might trigger spurious exceptions
22959     // from the upper elements. Instead emit an AVX compare and convert it to a mask.
22960 unsigned Opc;
22961 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
22962 (!IsStrict || Subtarget.hasVLX() ||
22963 Op0.getSimpleValueType().is512BitVector())) {
22964 #ifndef NDEBUG
22965 unsigned Num = VT.getVectorNumElements();
22966 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
22967 #endif
22968 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
22969 } else {
22970 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
22971 // The SSE/AVX packed FP comparison nodes are defined with a
22972 // floating-point vector result that matches the operand type. This allows
22973 // them to work with an SSE1 target (integer vector types are not legal).
22974 VT = Op0.getSimpleValueType();
22977 SDValue Cmp;
22978 bool IsAlwaysSignaling;
22979 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
22980 if (!Subtarget.hasAVX()) {
22981       // TODO: We could use the following steps to handle a quiet compare with
22982 // signaling encodings.
22983 // 1. Get ordered masks from a quiet ISD::SETO
22984 // 2. Use the masks to mask potential unordered elements in operand A, B
22985 // 3. Get the compare results of masked A, B
22986       // 4. Calculate the final result using the mask and the result from 3.
22987 // But currently, we just fall back to scalar operations.
22988 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
22989 return SDValue();
22991 // Insert an extra signaling instruction to raise exception.
22992 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
22993 SDValue SignalCmp = DAG.getNode(
22994 Opc, dl, {VT, MVT::Other},
22995 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
22996         // FIXME: It seems we need to update the flags of all new strict nodes.
22997         // Otherwise, mayRaiseFPException in MI will return false because
22998         // NoFPExcept defaults to false. However, other patches don't appear to
22999         // do this.
23000 SignalCmp->setFlags(Op->getFlags());
23001 Chain = SignalCmp.getValue(1);
23004 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23005 // emit two comparisons and a logic op to tie them together.
23006 if (!cheapX86FSETCC_SSE(Cond)) {
23007 // LLVM predicate is SETUEQ or SETONE.
23008 unsigned CC0, CC1;
23009 unsigned CombineOpc;
23010 if (Cond == ISD::SETUEQ) {
23011 CC0 = 3; // UNORD
23012 CC1 = 0; // EQ
23013 CombineOpc = X86ISD::FOR;
23014 } else {
23015 assert(Cond == ISD::SETONE);
23016 CC0 = 7; // ORD
23017 CC1 = 4; // NEQ
23018 CombineOpc = X86ISD::FAND;
23021 SDValue Cmp0, Cmp1;
23022 if (IsStrict) {
23023 Cmp0 = DAG.getNode(
23024 Opc, dl, {VT, MVT::Other},
23025 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23026 Cmp1 = DAG.getNode(
23027 Opc, dl, {VT, MVT::Other},
23028 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23029 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23030 Cmp1.getValue(1));
23031 } else {
23032 Cmp0 = DAG.getNode(
23033 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23034 Cmp1 = DAG.getNode(
23035 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23037 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23038 } else {
23039 if (IsStrict) {
23040 Cmp = DAG.getNode(
23041 Opc, dl, {VT, MVT::Other},
23042 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23043 Chain = Cmp.getValue(1);
23044 } else
23045 Cmp = DAG.getNode(
23046 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23048 } else {
23049 // Handle all other FP comparisons here.
23050 if (IsStrict) {
23051 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23052 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23053 Cmp = DAG.getNode(
23054 Opc, dl, {VT, MVT::Other},
23055 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23056 Chain = Cmp.getValue(1);
23057 } else
23058 Cmp = DAG.getNode(
23059 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23062 if (VT.getFixedSizeInBits() >
23063 Op.getSimpleValueType().getFixedSizeInBits()) {
23064 // We emitted a compare with an XMM/YMM result. Finish converting to a
23065 // mask register using a vptestm.
23066 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23067 Cmp = DAG.getBitcast(CastVT, Cmp);
23068 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23069 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23070 } else {
23071 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23072 // the result type of SETCC. The bitcast is expected to be optimized
23073 // away during combining/isel.
23074 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23077 if (IsStrict)
23078 return DAG.getMergeValues({Cmp, Chain}, dl);
23080 return Cmp;
23083 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23085 MVT VTOp0 = Op0.getSimpleValueType();
23086 (void)VTOp0;
23087 assert(VTOp0 == Op1.getSimpleValueType() &&
23088 "Expected operands with same type!");
23089 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23090 "Invalid number of packed elements for source and destination!");
23092 // The non-AVX512 code below works under the assumption that source and
23093 // destination types are the same.
23094 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23095 "Value types for source and destination must be the same!");
23097 // The result is boolean, but operands are int/float
23098 if (VT.getVectorElementType() == MVT::i1) {
23099     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23100     // but there is no compare instruction for i8 and i16 elements in KNL.
23101 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23102 "Unexpected operand type");
23103 return LowerIntVSETCC_AVX512(Op, DAG);
23106 // Lower using XOP integer comparisons.
23107 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23108 // Translate compare code to XOP PCOM compare mode.
23109 unsigned CmpMode = 0;
23110 switch (Cond) {
23111 default: llvm_unreachable("Unexpected SETCC condition");
23112 case ISD::SETULT:
23113 case ISD::SETLT: CmpMode = 0x00; break;
23114 case ISD::SETULE:
23115 case ISD::SETLE: CmpMode = 0x01; break;
23116 case ISD::SETUGT:
23117 case ISD::SETGT: CmpMode = 0x02; break;
23118 case ISD::SETUGE:
23119 case ISD::SETGE: CmpMode = 0x03; break;
23120 case ISD::SETEQ: CmpMode = 0x04; break;
23121 case ISD::SETNE: CmpMode = 0x05; break;
23124 // Are we comparing unsigned or signed integers?
23125 unsigned Opc =
23126 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23128 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23129 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23132 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23133 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23134 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23135 SDValue BC0 = peekThroughBitcasts(Op0);
23136 if (BC0.getOpcode() == ISD::AND) {
23137 APInt UndefElts;
23138 SmallVector<APInt, 64> EltBits;
23139 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23140 VT.getScalarSizeInBits(), UndefElts,
23141 EltBits, false, false)) {
23142 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23143 Cond = ISD::SETEQ;
23144 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23150 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23151 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23152 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23153 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23154 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23155 unsigned BitWidth = VT.getScalarSizeInBits();
23156 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23158 SDValue Result = Op0.getOperand(0);
23159 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23160 DAG.getConstant(ShiftAmt, dl, VT));
23161 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23162 DAG.getConstant(BitWidth - 1, dl, VT));
23163 return Result;
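// Scalar model of the rewrite above (illustrative only, under #if 0): when
// C == 1 << BitIdx, (X & C) == C is just "is bit BitIdx set", and shifting
// that bit into the sign position then arithmetic-shifting it back produces
// the all-ones/all-zeros mask a vector compare would return.
#if 0
#include <cstdint>
static int32_t cmpMaskViaShifts(int32_t X, unsigned BitIdx) {
  const unsigned BW = 32;
  int32_t Shl = int32_t(uint32_t(X) << (BW - 1 - BitIdx)); // bit -> sign bit
  return Shl >> (BW - 1);                                  // -1 if set, else 0
}
#endif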
23167 // Break 256-bit integer vector compare into smaller ones.
23168 if (VT.is256BitVector() && !Subtarget.hasInt256())
23169 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23171 // Break 512-bit integer vector compare into smaller ones.
23172 // TODO: Try harder to use VPCMPx + VPMOV2x?
23173 if (VT.is512BitVector())
23174 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23176 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23177 // not-of-PCMPEQ:
23178 // X != INT_MIN --> X >s INT_MIN
23179 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23180 // +X != 0 --> +X >s 0
23181 APInt ConstValue;
23182 if (Cond == ISD::SETNE &&
23183 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23184 if (ConstValue.isMinSignedValue())
23185 Cond = ISD::SETGT;
23186 else if (ConstValue.isMaxSignedValue())
23187 Cond = ISD::SETLT;
23188 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23189 Cond = ISD::SETGT;
23192 // If both operands are known non-negative, then an unsigned compare is the
23193 // same as a signed compare and there's no need to flip signbits.
23194 // TODO: We could check for more general simplifications here since we're
23195 // computing known bits.
23196 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23197 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23199 // Special case: Use min/max operations for unsigned compares.
23200 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23201 if (ISD::isUnsignedIntSetCC(Cond) &&
23202 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23203 TLI.isOperationLegal(ISD::UMIN, VT)) {
23204 // If we have a constant operand, increment/decrement it and change the
23205 // condition to avoid an invert.
23206 if (Cond == ISD::SETUGT) {
23207 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23208 if (SDValue UGTOp1 =
23209 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23210 Op1 = UGTOp1;
23211 Cond = ISD::SETUGE;
23214 if (Cond == ISD::SETULT) {
23215 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23216 if (SDValue ULTOp1 =
23217 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23218 Op1 = ULTOp1;
23219 Cond = ISD::SETULE;
23222 bool Invert = false;
23223 unsigned Opc;
23224 switch (Cond) {
23225 default: llvm_unreachable("Unexpected condition code");
23226 case ISD::SETUGT: Invert = true; [[fallthrough]];
23227 case ISD::SETULE: Opc = ISD::UMIN; break;
23228 case ISD::SETULT: Invert = true; [[fallthrough]];
23229 case ISD::SETUGE: Opc = ISD::UMAX; break;
23232 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23233 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23235 // If the logical-not of the result is required, perform that now.
23236 if (Invert)
23237 Result = DAG.getNOT(dl, Result, VT);
23239 return Result;
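// Scalar model of the min/max based unsigned compares above (illustrative
// only, under #if 0):
//   X u<= Y  <=>  X == min(X, Y)        X u>= Y  <=>  X == max(X, Y)
// and the strict forms are handled either by inverting the result or by
// nudging a constant operand (X u> C  <=>  X u>= C+1 when C+1 can't wrap,
// which incDecVectorConstant checks).
#if 0
#include <algorithm>
#include <cstdint>
static bool uleViaMin(uint32_t X, uint32_t Y) { return X == std::min(X, Y); }
static bool ugeViaMax(uint32_t X, uint32_t Y) { return X == std::max(X, Y); }
static bool ugtViaMin(uint32_t X, uint32_t Y) { return !uleViaMin(X, Y); }
#endif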
23242 // Try to use SUBUS and PCMPEQ.
23243 if (FlipSigns)
23244 if (SDValue V =
23245 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23246 return V;
23248 // We are handling one of the integer comparisons here. Since SSE only has
23249 // GT and EQ comparisons for integer, swapping operands and multiple
23250 // operations may be required for some comparisons.
23251 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23252 : X86ISD::PCMPGT;
23253 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23254 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23255 bool Invert = Cond == ISD::SETNE ||
23256 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23258 if (Swap)
23259 std::swap(Op0, Op1);
23261 // Check that the operation in question is available (most are plain SSE2,
23262 // but PCMPGTQ and PCMPEQQ have different requirements).
23263 if (VT == MVT::v2i64) {
23264 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23265 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23267 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23268 // the odd elements over the even elements.
23269 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23270 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23271 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23273 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23274 static const int MaskHi[] = { 1, 1, 3, 3 };
23275 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23277 return DAG.getBitcast(VT, Result);
23280 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23281 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23282 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23284 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23285 static const int MaskHi[] = { 1, 1, 3, 3 };
23286 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23288 return DAG.getBitcast(VT, Result);
23291 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23292 // bits of the inputs before performing those operations. The lower
23293 // compare is always unsigned.
23294 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23295 : 0x0000000080000000ULL,
23296 dl, MVT::v2i64);
23298 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23299 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23301 // Cast everything to the right type.
23302 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23303 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23305 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23306 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23307 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23309 // Create masks for only the low parts/high parts of the 64 bit integers.
23310 static const int MaskHi[] = { 1, 1, 3, 3 };
23311 static const int MaskLo[] = { 0, 0, 2, 2 };
23312 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23313 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23314 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23316 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23317 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23319 if (Invert)
23320 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23322 return DAG.getBitcast(VT, Result);
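// Scalar model of the PCMPGTQ emulation above (illustrative only, under
// #if 0): a 64-bit signed greater-than built from 32-bit pieces compares the
// high halves signed and the low halves unsigned, which is why the lowering
// always flips bit 31 of the low dwords (and bit 63 too for an unsigned
// 64-bit compare).
#if 0
#include <cstdint>
static bool sgt64From32(int64_t A, int64_t B) {
  int32_t HiA = int32_t(A >> 32), HiB = int32_t(B >> 32);
  uint32_t LoA = uint32_t(A), LoB = uint32_t(B);
  return (HiA > HiB) | ((HiA == HiB) & (LoA > LoB)); // matches the comment above
}
#endif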
23325 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23326 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23327 // pcmpeqd + pshufd + pand.
23328 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23330 // First cast everything to the right type.
23331 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23332 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23334 // Do the compare.
23335 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23337 // Make sure the lower and upper halves are both all-ones.
23338 static const int Mask[] = { 1, 0, 3, 2 };
23339 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23340 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23342 if (Invert)
23343 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23345 return DAG.getBitcast(VT, Result);
23349 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23350 // bits of the inputs before performing those operations.
23351 if (FlipSigns) {
23352 MVT EltVT = VT.getVectorElementType();
23353 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23354 VT);
23355 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23356 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23359 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23361 // If the logical-not of the result is required, perform that now.
23362 if (Invert)
23363 Result = DAG.getNOT(dl, Result, VT);
23365 return Result;
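// Scalar model of the FlipSigns step above (illustrative only, under #if 0):
// XORing both operands with the sign mask shifts the whole unsigned range by
// 2^(BW-1), so the signed PCMPGT that SSE provides computes the unsigned
// comparison.
#if 0
#include <cstdint>
static bool ugtViaSignedCmp(uint32_t X, uint32_t Y) {
  const uint32_t SignMask = 0x80000000u;
  return int32_t(X ^ SignMask) > int32_t(Y ^ SignMask);
}
#endif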
23368 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23369 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23370 const SDLoc &dl, SelectionDAG &DAG,
23371 const X86Subtarget &Subtarget,
23372 SDValue &X86CC) {
23373 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23375 // Must be a bitcast from vXi1.
23376 if (Op0.getOpcode() != ISD::BITCAST)
23377 return SDValue();
23379 Op0 = Op0.getOperand(0);
23380 MVT VT = Op0.getSimpleValueType();
23381 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23382 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23383 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23384 return SDValue();
23386 X86::CondCode X86Cond;
23387 if (isNullConstant(Op1)) {
23388 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23389 } else if (isAllOnesConstant(Op1)) {
23390 // C flag is set for all ones.
23391 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23392 } else
23393 return SDValue();
23395   // If the input is an AND, we can combine its operands into the KTEST.
23396 bool KTestable = false;
23397 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23398 KTestable = true;
23399 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23400 KTestable = true;
23401 if (!isNullConstant(Op1))
23402 KTestable = false;
23403 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23404 SDValue LHS = Op0.getOperand(0);
23405 SDValue RHS = Op0.getOperand(1);
23406 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23407 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23410   // If the input is an OR, we can combine its operands into the KORTEST.
23411 SDValue LHS = Op0;
23412 SDValue RHS = Op0;
23413 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23414 LHS = Op0.getOperand(0);
23415 RHS = Op0.getOperand(1);
23418 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23419 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23422 /// Emit flags for the given setcc condition and operands. Also returns the
23423 /// corresponding X86 condition code constant in X86CC.
23424 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23425 ISD::CondCode CC, const SDLoc &dl,
23426 SelectionDAG &DAG,
23427 SDValue &X86CC) const {
23428 // Equality Combines.
23429 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23430 X86::CondCode X86CondCode;
23432 // Optimize to BT if possible.
23433 // Lower (X & (1 << N)) == 0 to BT(X, N).
23434 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23435 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23436 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23437 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23438 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23439 return BT;
23443 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23444 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23445 X86CondCode)) {
23446 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23447 return CmpZ;
23450 // Try to lower using KORTEST or KTEST.
23451 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23452 return Test;
23454 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23455 // of these.
23456 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23457 // If the input is a setcc, then reuse the input setcc or use a new one
23458 // with the inverted condition.
23459 if (Op0.getOpcode() == X86ISD::SETCC) {
23460 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23462 X86CC = Op0.getOperand(0);
23463 if (Invert) {
23464 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23465 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23466 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23469 return Op0.getOperand(1);
23473     // Try to use the carry flag from the add in place of a separate CMP for:
23474 // (seteq (add X, -1), -1). Similar for setne.
23475 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23476 Op0.getOperand(1) == Op1) {
23477 if (isProfitableToUseFlagOp(Op0)) {
23478 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23480 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23481 Op0.getOperand(1));
23482 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23483 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23484 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23485 return SDValue(New.getNode(), 1);
23490 X86::CondCode CondCode =
23491 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23492 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23494 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23495 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23496 return EFLAGS;
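// Scalar model of the "carry flag from add" trick above (illustrative only,
// under #if 0): adding all-ones produces an unsigned carry exactly when
// X != 0, so (X + -1) == -1 (i.e. X == 0) is just "carry clear" (COND_AE)
// and the setne form is "carry set" (COND_B).
#if 0
#include <cstdint>
static bool addAllOnesCarries(uint32_t X) {
  uint32_t Sum = X + 0xFFFFFFFFu; // X - 1 with wraparound
  return Sum < X;                 // the carry; true <=> X != 0
}
#endif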
23499 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23501 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23502 Op.getOpcode() == ISD::STRICT_FSETCCS;
23503 MVT VT = Op->getSimpleValueType(0);
23505 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23507 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23508 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23509 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23510 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23511 SDLoc dl(Op);
23512 ISD::CondCode CC =
23513 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23515 if (isSoftF16(Op0.getValueType(), Subtarget))
23516 return SDValue();
23518 // Handle f128 first, since one possible outcome is a normal integer
23519 // comparison which gets handled by emitFlagsForSetcc.
23520 if (Op0.getValueType() == MVT::f128) {
23521 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23522 Op.getOpcode() == ISD::STRICT_FSETCCS);
23524 // If softenSetCCOperands returned a scalar, use it.
23525 if (!Op1.getNode()) {
23526 assert(Op0.getValueType() == Op.getValueType() &&
23527 "Unexpected setcc expansion!");
23528 if (IsStrict)
23529 return DAG.getMergeValues({Op0, Chain}, dl);
23530 return Op0;
23534 if (Op0.getSimpleValueType().isInteger()) {
23535     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
23536     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
23537     // this may translate to fewer uops depending on the uarch implementation. The
23538     // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23539     // canonicalize to that CondCode.
23540     // NOTE: Only do this if incrementing the constant doesn't increase the bit
23541     // encoding size - so it must either already be an i8 or i32 immediate, or it
23542     // shrinks down to that. We don't do this for any i64's to avoid additional
23543 // constant materializations.
23544 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23545 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23546 const APInt &Op1Val = Op1C->getAPIntValue();
23547 if (!Op1Val.isZero()) {
23548 // Ensure the constant+1 doesn't overflow.
23549 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23550 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23551 APInt Op1ValPlusOne = Op1Val + 1;
23552 if (Op1ValPlusOne.isSignedIntN(32) &&
23553 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23554 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23555 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23556 : ISD::CondCode::SETUGE;
23562 SDValue X86CC;
23563 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23564 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23565 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
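// Scalar model of the canonicalization above (illustrative only, under
// #if 0): as long as C+1 does not overflow, "X > C" and "X >= C+1" are the
// same predicate, and the GE form reads fewer EFLAGS bits; the lowering also
// requires C+1 to keep the same immediate encoding size.
#if 0
#include <cstdint>
static bool sgt(int32_t X, int32_t C) { return X > C; }
static bool sgeCanonical(int32_t X, int32_t C) { // requires C != INT32_MAX
  return X >= C + 1;                             // same truth table as X > C
}
#endif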
23568 // Handle floating point.
23569 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23570 if (CondCode == X86::COND_INVALID)
23571 return SDValue();
23573 SDValue EFLAGS;
23574 if (IsStrict) {
23575 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23576 EFLAGS =
23577 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23578 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23579 Chain = EFLAGS.getValue(1);
23580 } else {
23581 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23584 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23585 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23586 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23589 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23590 SDValue LHS = Op.getOperand(0);
23591 SDValue RHS = Op.getOperand(1);
23592 SDValue Carry = Op.getOperand(2);
23593 SDValue Cond = Op.getOperand(3);
23594 SDLoc DL(Op);
23596 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23597 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23599 // Recreate the carry if needed.
23600 EVT CarryVT = Carry.getValueType();
23601 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23602 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23604 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23605 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23606 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
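// Scalar model of what SETCCCARRY computes (illustrative only, under #if 0):
// the borrow of the low half of a wide subtraction feeds an SBB on the high
// half, so e.g. a 64-bit unsigned "less than" on 32-bit pieces is SUB(lo),
// SBB(hi), then a test of the final borrow.
#if 0
#include <cstdint>
static bool ult64Via32(uint32_t LHSLo, uint32_t LHSHi,
                       uint32_t RHSLo, uint32_t RHSHi) {
  bool Borrow = LHSLo < RHSLo;                        // borrow of the low SUB
  uint64_t HiDiff = uint64_t(LHSHi) - RHSHi - Borrow; // the SBB on the high half
  return (HiDiff >> 63) != 0;                         // final borrow <=> LHS u< RHS
}
#endif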
23609 // This function returns three things: the arithmetic computation itself
23610 // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23611 // flag and the condition code define the case in which the arithmetic
23612 // computation overflows.
23613 static std::pair<SDValue, SDValue>
23614 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23615 assert(Op.getResNo() == 0 && "Unexpected result number!");
23616 SDValue Value, Overflow;
23617 SDValue LHS = Op.getOperand(0);
23618 SDValue RHS = Op.getOperand(1);
23619 unsigned BaseOp = 0;
23620 SDLoc DL(Op);
23621 switch (Op.getOpcode()) {
23622 default: llvm_unreachable("Unknown ovf instruction!");
23623 case ISD::SADDO:
23624 BaseOp = X86ISD::ADD;
23625 Cond = X86::COND_O;
23626 break;
23627 case ISD::UADDO:
23628 BaseOp = X86ISD::ADD;
23629 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23630 break;
23631 case ISD::SSUBO:
23632 BaseOp = X86ISD::SUB;
23633 Cond = X86::COND_O;
23634 break;
23635 case ISD::USUBO:
23636 BaseOp = X86ISD::SUB;
23637 Cond = X86::COND_B;
23638 break;
23639 case ISD::SMULO:
23640 BaseOp = X86ISD::SMUL;
23641 Cond = X86::COND_O;
23642 break;
23643 case ISD::UMULO:
23644 BaseOp = X86ISD::UMUL;
23645 Cond = X86::COND_O;
23646 break;
23649 if (BaseOp) {
23650 // Also sets EFLAGS.
23651 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23652 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23653 Overflow = Value.getValue(1);
23656 return std::make_pair(Value, Overflow);
23659 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23660   // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
23661 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23662 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23663 // has only one use.
23664 SDLoc DL(Op);
23665 X86::CondCode Cond;
23666 SDValue Value, Overflow;
23667 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23669 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23670 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23671 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
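// Scalar model of the overflow conditions chosen in getX86XALUOOp
// (illustrative only, under #if 0): unsigned add overflow is the carry, i.e.
// the wrapped result is smaller than an operand; for "X + 1" the carry is
// equivalent to the result being zero, which is why UADDO with a constant 1
// can use COND_E instead of COND_B.
#if 0
#include <cstdint>
static bool uaddOverflow(uint32_t A, uint32_t B) { return uint32_t(A + B) < A; }
static bool uaddOneOverflow(uint32_t A) { return uint32_t(A + 1) == 0; }
#endif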
23674 /// Return true if opcode is a X86 logical comparison.
23675 static bool isX86LogicalCmp(SDValue Op) {
23676 unsigned Opc = Op.getOpcode();
23677 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23678 Opc == X86ISD::FCMP)
23679 return true;
23680 if (Op.getResNo() == 1 &&
23681 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23682 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23683 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23684 return true;
23686 return false;
23689 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23690 if (V.getOpcode() != ISD::TRUNCATE)
23691 return false;
23693 SDValue VOp0 = V.getOperand(0);
23694 unsigned InBits = VOp0.getValueSizeInBits();
23695 unsigned Bits = V.getValueSizeInBits();
23696 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23699 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23700 bool AddTest = true;
23701 SDValue Cond = Op.getOperand(0);
23702 SDValue Op1 = Op.getOperand(1);
23703 SDValue Op2 = Op.getOperand(2);
23704 SDLoc DL(Op);
23705 MVT VT = Op1.getSimpleValueType();
23706 SDValue CC;
23708 if (isSoftF16(VT, Subtarget)) {
23709 MVT NVT = VT.changeTypeToInteger();
23710 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
23711 DAG.getBitcast(NVT, Op1),
23712 DAG.getBitcast(NVT, Op2)));
23715 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23716 // are available or VBLENDV if AVX is available.
23717 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23718 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23719 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23720 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23721 bool IsAlwaysSignaling;
23722 unsigned SSECC =
23723 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23724 CondOp0, CondOp1, IsAlwaysSignaling);
23726 if (Subtarget.hasAVX512()) {
23727 SDValue Cmp =
23728 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23729 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23730 assert(!VT.isVector() && "Not a scalar type?");
23731 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23734 if (SSECC < 8 || Subtarget.hasAVX()) {
23735 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23736 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23738 // If we have AVX, we can use a variable vector select (VBLENDV) instead
23739 // of 3 logic instructions for size savings and potentially speed.
23740 // Unfortunately, there is no scalar form of VBLENDV.
23742 // If either operand is a +0.0 constant, don't try this. We can expect to
23743 // optimize away at least one of the logic instructions later in that
23744 // case, so that sequence would be faster than a variable blend.
23746 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23747 // uses XMM0 as the selection register. That may need just as many
23748 // instructions as the AND/ANDN/OR sequence due to register moves, so
23749 // don't bother.
23750 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23751 !isNullFPConstant(Op2)) {
23752 // Convert to vectors, do a VSELECT, and convert back to scalar.
23753 // All of the conversions should be optimized away.
23754 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23755 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23756 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23757 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23759 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23760 VCmp = DAG.getBitcast(VCmpVT, VCmp);
23762 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23764 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23765 VSel, DAG.getIntPtrConstant(0, DL));
23767 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23768 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23769 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23773 // AVX512 fallback is to lower selects of scalar floats to masked moves.
23774 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23775 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23776 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23779 if (Cond.getOpcode() == ISD::SETCC &&
23780 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
23781 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23782 Cond = NewCond;
23783 // If the condition was updated, it's possible that the operands of the
23784 // select were also updated (for example, EmitTest has a RAUW). Refresh
23785 // the local references to the select operands in case they got stale.
23786 Op1 = Op.getOperand(1);
23787 Op2 = Op.getOperand(2);
23791 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23792 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23793 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23794 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23795 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23796 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23797 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
23798 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
23799 if (Cond.getOpcode() == X86ISD::SETCC &&
23800 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23801 isNullConstant(Cond.getOperand(1).getOperand(1))) {
23802 SDValue Cmp = Cond.getOperand(1);
23803 SDValue CmpOp0 = Cmp.getOperand(0);
23804 unsigned CondCode = Cond.getConstantOperandVal(0);
23806     // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
23807     // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23808     // handling to keep the CMP with 0. This should be removed by
23809 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23810 // cttz_zero_undef.
23811 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23812 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23813 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23815 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
23816 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23817 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23818 // Keep Cmp.
23819 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23820 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23821 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23822 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23824 // 'X - 1' sets the carry flag if X == 0.
23825 // '0 - X' sets the carry flag if X != 0.
23826 // Convert the carry flag to a -1/0 mask with sbb:
23827 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
23828 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
23829 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
23830 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
23831 SDValue Sub;
23832 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
23833 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23834 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23835 } else {
23836 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
23837 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
23839 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
23840 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
23841 Sub.getValue(1));
23842 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
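// Scalar model of the SBB mask trick above (illustrative only, under #if 0):
// "0 - X" borrows exactly when X != 0, and SETCC_CARRY materializes that
// borrow as an all-ones or all-zero value, so the select becomes mask | Y
// with no cmov.
#if 0
#include <cstdint>
static uint32_t selectNegOneOrY(uint32_t X, uint32_t Y) {
  bool Borrow = X != 0;                      // carry of "0 - X"
  uint32_t Mask = Borrow ? 0xFFFFFFFFu : 0u; // what SETCC_CARRY produces
  return Mask | Y;                           // -1 if X != 0, else Y
}
#endif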
23843 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
23844 CmpOp0.getOpcode() == ISD::AND &&
23845 isOneConstant(CmpOp0.getOperand(1))) {
23846 SDValue Src1, Src2;
23847       // Returns true if Op2 is an XOR or OR operator and one of its operands
23848       // is equal to Op1, i.e. the pattern is
23849       // (a, a op b) or (b, a op b).
23850 auto isOrXorPattern = [&]() {
23851 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23852 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23853 Src1 =
23854 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23855 Src2 = Op1;
23856 return true;
23858 return false;
23861 if (isOrXorPattern()) {
23862 SDValue Neg;
23863 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23864         // We need a mask of all zeros or all ones with the same size as the
23865         // other operands.
23866 if (CmpSz > VT.getSizeInBits())
23867 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23868 else if (CmpSz < VT.getSizeInBits())
23869 Neg = DAG.getNode(ISD::AND, DL, VT,
23870 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23871 DAG.getConstant(1, DL, VT));
23872 else
23873 Neg = CmpOp0;
23874 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23875 Neg); // -(and (x, 0x1))
23876 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23877 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
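// Scalar model of the isOrXorPattern rewrite above (illustrative only, under
// #if 0): -(x & 1) is an all-ones mask when the low bit is set and zero
// otherwise, so
//   select ((x & 1) == 0), y, (z ^ y)  ==  ((-(x & 1)) & z) ^ y
// and the same identity holds with '|' in place of '^'.
#if 0
#include <cstdint>
static uint32_t selectXorViaMask(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t Mask = 0u - (X & 1u); // 0 or 0xFFFFFFFF
  return (Mask & Z) ^ Y;         // Y when the bit is clear, Z ^ Y when set
}
#endif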
23879 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
23880 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
23881 ((CondCode == X86::COND_S) || // smin(x, 0)
23882 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
23883 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
23885 // If the comparison is testing for a positive value, we have to invert
23886 // the sign bit mask, so only do that transform if the target has a
23887 // bitwise 'and not' instruction (the invert is free).
23888 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
23889 unsigned ShCt = VT.getSizeInBits() - 1;
23890 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
23891 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
23892 if (CondCode == X86::COND_G)
23893 Shift = DAG.getNOT(DL, Shift, VT);
23894 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
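// Scalar model of the sign-mask clamps above (illustrative only, under
// #if 0): an arithmetic shift by BW-1 broadcasts the sign bit into a 0/-1
// mask, so clamping against zero needs no cmov; the smax form needs the
// inverted mask, hence the ANDN requirement.
#if 0
#include <cstdint>
static int32_t sminWithZero(int32_t X) { return (X >> 31) & X; }  // x < 0 ? x : 0
static int32_t smaxWithZero(int32_t X) { return ~(X >> 31) & X; } // x > 0 ? x : 0
#endif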
23898 // Look past (and (setcc_carry (cmp ...)), 1).
23899 if (Cond.getOpcode() == ISD::AND &&
23900 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23901 isOneConstant(Cond.getOperand(1)))
23902 Cond = Cond.getOperand(0);
23904 // If condition flag is set by a X86ISD::CMP, then use it as the condition
23905 // setting operand in place of the X86ISD::SETCC.
23906 unsigned CondOpcode = Cond.getOpcode();
23907 if (CondOpcode == X86ISD::SETCC ||
23908 CondOpcode == X86ISD::SETCC_CARRY) {
23909 CC = Cond.getOperand(0);
23911 SDValue Cmp = Cond.getOperand(1);
23912 bool IllegalFPCMov = false;
23913 if (VT.isFloatingPoint() && !VT.isVector() &&
23914 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
23915 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23917 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23918 Cmp.getOpcode() == X86ISD::BT) { // FIXME
23919 Cond = Cmp;
23920 AddTest = false;
23922 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23923 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23924 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23925 SDValue Value;
23926 X86::CondCode X86Cond;
23927 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23929 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23930 AddTest = false;
23933 if (AddTest) {
23934 // Look past the truncate if the high bits are known zero.
23935 if (isTruncWithZeroHighBitsInput(Cond, DAG))
23936 Cond = Cond.getOperand(0);
23938 // We know the result of AND is compared against zero. Try to match
23939 // it to BT.
23940 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
23941 X86::CondCode X86CondCode;
23942 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
23943 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
23944 Cond = BT;
23945 AddTest = false;
23950 if (AddTest) {
23951 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
23952 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
23955 // a < b ? -1 : 0 -> RES = ~setcc_carry
23956 // a < b ? 0 : -1 -> RES = setcc_carry
23957 // a >= b ? -1 : 0 -> RES = setcc_carry
23958 // a >= b ? 0 : -1 -> RES = ~setcc_carry
23959 if (Cond.getOpcode() == X86ISD::SUB) {
23960 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
23962 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
23963 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23964 (isNullConstant(Op1) || isNullConstant(Op2))) {
23965 SDValue Res =
23966 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
23967 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
23968 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
23969 return DAG.getNOT(DL, Res, Res.getValueType());
23970 return Res;
23974   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
23975   // widen the cmov and push the truncate through. This avoids introducing a new
23976 // branch during isel and doesn't add any extensions.
23977 if (Op.getValueType() == MVT::i8 &&
23978 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
23979 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
23980 if (T1.getValueType() == T2.getValueType() &&
23981 // Exclude CopyFromReg to avoid partial register stalls.
23982 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
23983 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
23984 CC, Cond);
23985 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23989 // Or finally, promote i8 cmovs if we have CMOV,
23990 // or i16 cmovs if it won't prevent folding a load.
23991   // FIXME: we should not limit promotion of the i8 case to only when CMOV is
23992   // legal, but EmitLoweredSelect() cannot deal with these extensions
23993   // being inserted between two CMOVs (in the i16 case too, TBN).
23994 // https://bugs.llvm.org/show_bug.cgi?id=40974
23995 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
23996 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
23997 !X86::mayFoldLoad(Op2, Subtarget))) {
23998 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
23999 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24000 SDValue Ops[] = { Op2, Op1, CC, Cond };
24001 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24002 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24005 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24006 // condition is true.
24007 SDValue Ops[] = { Op2, Op1, CC, Cond };
24008 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24011 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24012 const X86Subtarget &Subtarget,
24013 SelectionDAG &DAG) {
24014 MVT VT = Op->getSimpleValueType(0);
24015 SDValue In = Op->getOperand(0);
24016 MVT InVT = In.getSimpleValueType();
24017 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24018 MVT VTElt = VT.getVectorElementType();
24019 SDLoc dl(Op);
24021 unsigned NumElts = VT.getVectorNumElements();
24023 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24024 MVT ExtVT = VT;
24025 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24026 // If v16i32 is to be avoided, we'll need to split and concatenate.
24027 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24028 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24030 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24033 // Widen to 512-bits if VLX is not supported.
24034 MVT WideVT = ExtVT;
24035 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24036 NumElts *= 512 / ExtVT.getSizeInBits();
24037 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24038 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24039 In, DAG.getIntPtrConstant(0, dl));
24040 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24043 SDValue V;
24044 MVT WideEltVT = WideVT.getVectorElementType();
24045 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24046 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24047 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24048 } else {
24049 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24050 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24051 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24054 // Truncate if we had to extend i16/i8 above.
24055 if (VT != ExtVT) {
24056 WideVT = MVT::getVectorVT(VTElt, NumElts);
24057 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24060 // Extract back to 128/256-bit if we widened.
24061 if (WideVT != VT)
24062 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24063 DAG.getIntPtrConstant(0, dl));
24065 return V;
24068 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24069 SelectionDAG &DAG) {
24070 SDValue In = Op->getOperand(0);
24071 MVT InVT = In.getSimpleValueType();
24073 if (InVT.getVectorElementType() == MVT::i1)
24074 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24076 assert(Subtarget.hasAVX() && "Expected AVX support");
24077 return LowerAVXExtend(Op, DAG, Subtarget);
24080 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24081 // For sign extend this needs to handle all vector sizes and SSE4.1 and
24082 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24083 // MVT::v64i8 when BWI is not supported, but AVX512 is.
24084 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24085 const X86Subtarget &Subtarget,
24086 SelectionDAG &DAG) {
24087 SDValue In = Op->getOperand(0);
24088 MVT VT = Op->getSimpleValueType(0);
24089 MVT InVT = In.getSimpleValueType();
24091 MVT SVT = VT.getVectorElementType();
24092 MVT InSVT = InVT.getVectorElementType();
24093 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24095 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24096 return SDValue();
24097 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24098 return SDValue();
24099 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24100 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24101 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24102 return SDValue();
24104 SDLoc dl(Op);
24105 unsigned Opc = Op.getOpcode();
24106 unsigned NumElts = VT.getVectorNumElements();
24108 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24109 // For 512-bit vectors, we need 128-bits or 256-bits.
24110 if (InVT.getSizeInBits() > 128) {
24111 // Input needs to be at least the same number of elements as output, and
24112 // at least 128-bits.
24113 int InSize = InSVT.getSizeInBits() * NumElts;
24114 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24115 InVT = In.getSimpleValueType();
24118   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24119   // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24120 // need to be handled here for 256/512-bit results.
24121 if (Subtarget.hasInt256()) {
24122 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24124 if (InVT.getVectorNumElements() != NumElts)
24125 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24127 // FIXME: Apparently we create inreg operations that could be regular
24128 // extends.
24129 unsigned ExtOpc =
24130 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24131 : ISD::ZERO_EXTEND;
24132 return DAG.getNode(ExtOpc, dl, VT, In);
24135 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24136 if (Subtarget.hasAVX()) {
24137 assert(VT.is256BitVector() && "256-bit vector expected");
24138 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24139 int HalfNumElts = HalfVT.getVectorNumElements();
24141 unsigned NumSrcElts = InVT.getVectorNumElements();
24142 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24143 for (int i = 0; i != HalfNumElts; ++i)
24144 HiMask[i] = HalfNumElts + i;
24146 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24147 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24148 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24149 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24152 // We should only get here for sign extend.
24153 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24154 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24155 unsigned InNumElts = InVT.getVectorNumElements();
24157 // If the source elements are already all-signbits, we don't need to extend,
24158 // just splat the elements.
24159 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24160 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24161 unsigned Scale = InNumElts / NumElts;
24162 SmallVector<int, 16> ShuffleMask;
24163 for (unsigned I = 0; I != NumElts; ++I)
24164 ShuffleMask.append(Scale, I);
24165 return DAG.getBitcast(VT,
24166 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24169 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24170 SDValue Curr = In;
24171 SDValue SignExt = Curr;
24173 // As SRAI is only available on i16/i32 types, we expand only up to i32
24174 // and handle i64 separately.
24175 if (InVT != MVT::v4i32) {
24176 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24178 unsigned DestWidth = DestVT.getScalarSizeInBits();
24179 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24180 unsigned DestElts = DestVT.getVectorNumElements();
24182 // Build a shuffle mask that takes each input element and places it in the
24183 // MSBs of the new element size.
24184 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24185 for (unsigned i = 0; i != DestElts; ++i)
24186 Mask[i * Scale + (Scale - 1)] = i;
24188 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24189 Curr = DAG.getBitcast(DestVT, Curr);
24191 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24192 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24193 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24196 if (VT == MVT::v2i64) {
24197 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24198 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24199 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24200 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24201 SignExt = DAG.getBitcast(VT, SignExt);
24204 return SignExt;
24207 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24208 SelectionDAG &DAG) {
24209 MVT VT = Op->getSimpleValueType(0);
24210 SDValue In = Op->getOperand(0);
24211 MVT InVT = In.getSimpleValueType();
24212 SDLoc dl(Op);
24214 if (InVT.getVectorElementType() == MVT::i1)
24215 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24217 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24218 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24219 "Expected same number of elements");
24220 assert((VT.getVectorElementType() == MVT::i16 ||
24221 VT.getVectorElementType() == MVT::i32 ||
24222 VT.getVectorElementType() == MVT::i64) &&
24223 "Unexpected element type");
24224 assert((InVT.getVectorElementType() == MVT::i8 ||
24225 InVT.getVectorElementType() == MVT::i16 ||
24226 InVT.getVectorElementType() == MVT::i32) &&
24227 "Unexpected element type");
24229 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24230 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24231 return splitVectorIntUnary(Op, DAG);
24234 if (Subtarget.hasInt256())
24235 return Op;
24237 // Optimize vectors in AVX mode:
24238 // Sign extend v8i16 to v8i32 and
24239 //             v4i32 to v4i64.
24241 // Divide the input vector into two parts; for v4i32 the high shuffle mask
24242 // will be {2, 3, -1, -1}. Use the vpmovsx instruction to extend
24243 // v4i32 -> v2i64 and v8i16 -> v4i32, then concat the halves back to the
24244 // original VT.
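// Sketch for VT = v8i32 and In = v8i16 (the v4i32 -> v4i64 case is analogous):
//   OpLo = (v4i32 sign_extend_vector_inreg (v8i16 In))
//   OpHi = (v4i32 sign_extend_vector_inreg (shuffle<4,5,6,7,u,u,u,u> In, In))
//   Result = (v8i32 concat_vectors OpLo, OpHi)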
24245 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24246 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24248 unsigned NumElems = InVT.getVectorNumElements();
24249 SmallVector<int,8> ShufMask(NumElems, -1);
24250 for (unsigned i = 0; i != NumElems/2; ++i)
24251 ShufMask[i] = i + NumElems/2;
24253 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24254 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24256 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24259 /// Change a vector store into a pair of half-size vector stores.
24260 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24261 SDValue StoredVal = Store->getValue();
24262 assert((StoredVal.getValueType().is256BitVector() ||
24263 StoredVal.getValueType().is512BitVector()) &&
24264 "Expecting 256/512-bit op");
24266 // Splitting volatile memory ops is not allowed unless the operation was not
24267 // legal to begin with. Assume the input store is legal (this transform is
24268 // only used for targets with AVX). Note: It is possible that we have an
24269 // illegal type like v2i128, and so we could allow splitting a volatile store
24270 // in that case if that is important.
24271 if (!Store->isSimple())
24272 return SDValue();
24274 SDLoc DL(Store);
24275 SDValue Value0, Value1;
24276 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24277 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24278 SDValue Ptr0 = Store->getBasePtr();
24279 SDValue Ptr1 =
24280 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24281 SDValue Ch0 =
24282 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24283 Store->getOriginalAlign(),
24284 Store->getMemOperand()->getFlags());
24285 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24286 Store->getPointerInfo().getWithOffset(HalfOffset),
24287 Store->getOriginalAlign(),
24288 Store->getMemOperand()->getFlags());
24289 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24292 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24293 /// type.
24294 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24295 SelectionDAG &DAG) {
24296 SDValue StoredVal = Store->getValue();
24297 assert(StoreVT.is128BitVector() &&
24298 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24299 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24301 // Splitting volatile memory ops is not allowed unless the operation was not
24302 // legal to begin with. We are assuming the input op is legal (this transform
24303 // is only used for targets with AVX).
24304 if (!Store->isSimple())
24305 return SDValue();
24307 MVT StoreSVT = StoreVT.getScalarType();
24308 unsigned NumElems = StoreVT.getVectorNumElements();
24309 unsigned ScalarSize = StoreSVT.getStoreSize();
24311 SDLoc DL(Store);
24312 SmallVector<SDValue, 4> Stores;
24313 for (unsigned i = 0; i != NumElems; ++i) {
24314 unsigned Offset = i * ScalarSize;
24315 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24316 TypeSize::Fixed(Offset), DL);
24317 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24318 DAG.getIntPtrConstant(i, DL));
24319 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24320 Store->getPointerInfo().getWithOffset(Offset),
24321 Store->getOriginalAlign(),
24322 Store->getMemOperand()->getFlags());
24323 Stores.push_back(Ch);
24325 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24328 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24329 SelectionDAG &DAG) {
24330 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24331 SDLoc dl(St);
24332 SDValue StoredVal = St->getValue();
24334 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24335 if (StoredVal.getValueType().isVector() &&
24336 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24337 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24338 assert(NumElts <= 8 && "Unexpected VT");
24339 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24340 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24341 "Expected AVX512F without AVX512DQI");
24343 // We must pad with zeros to ensure we store zeroes to any unused bits.
24344 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24345 DAG.getUNDEF(MVT::v16i1), StoredVal,
24346 DAG.getIntPtrConstant(0, dl));
24347 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24348 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24349 // Make sure we store zeros in the extra bits.
24350 if (NumElts < 8)
24351 StoredVal = DAG.getZeroExtendInReg(
24352 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24354 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24355 St->getPointerInfo(), St->getOriginalAlign(),
24356 St->getMemOperand()->getFlags());
24359 if (St->isTruncatingStore())
24360 return SDValue();
24362 // If this is a 256-bit store of concatenated ops, we are better off splitting
24363 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24364 // and each half can execute independently. Some cores would split the op into
24365 // halves anyway, so the concat (vinsertf128) is purely an extra op.
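// Sketch, assuming a v8i32 store of (concat_vectors (v4i32 A), (v4i32 B)):
//   store A, ptr
//   store B, ptr + 16
// rather than materializing the 256-bit concat just to store it.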
24366 MVT StoreVT = StoredVal.getSimpleValueType();
24367 if (StoreVT.is256BitVector() ||
24368 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24369 !Subtarget.hasBWI())) {
24370 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24371 return splitVectorStore(St, DAG);
24372 return SDValue();
24375 if (StoreVT.is32BitVector())
24376 return SDValue();
24378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24379 assert(StoreVT.is64BitVector() && "Unexpected VT");
24380 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24381 TargetLowering::TypeWidenVector &&
24382 "Unexpected type action!");
24384 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24385 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24386 DAG.getUNDEF(StoreVT));
24388 if (Subtarget.hasSSE2()) {
24389 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24390 // and store it.
24391 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24392 MVT CastVT = MVT::getVectorVT(StVT, 2);
24393 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24394 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24395 DAG.getIntPtrConstant(0, dl));
24397 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24398 St->getPointerInfo(), St->getOriginalAlign(),
24399 St->getMemOperand()->getFlags());
24401 assert(Subtarget.hasSSE1() && "Expected SSE");
24402 SDVTList Tys = DAG.getVTList(MVT::Other);
24403 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24404 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24405 St->getMemOperand());
24408 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
24409 // may emit an illegal shuffle but the expansion is still better than scalar
24410 // code. We generate sext/sext_invec for SEXTLOADs if they're available; otherwise
24411 // we'll emit a shuffle and an arithmetic shift.
24412 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24413 // TODO: It is possible to support ZExt by zeroing the undef values during
24414 // the shuffle phase or after the shuffle.
24415 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24416 SelectionDAG &DAG) {
24417 MVT RegVT = Op.getSimpleValueType();
24418 assert(RegVT.isVector() && "We only custom lower vector loads.");
24419 assert(RegVT.isInteger() &&
24420 "We only custom lower integer vector loads.");
24422 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24423 SDLoc dl(Ld);
24425 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24426 if (RegVT.getVectorElementType() == MVT::i1) {
24427 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24428 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24429 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24430 "Expected AVX512F without AVX512DQI");
24432 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24433 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24434 Ld->getMemOperand()->getFlags());
24436 // Replace chain users with the new chain.
24437 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24439 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24440 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24441 DAG.getBitcast(MVT::v16i1, Val),
24442 DAG.getIntPtrConstant(0, dl));
24443 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24446 return SDValue();
24449 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24450 /// each of which has no other use apart from the AND / OR.
24451 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24452 Opc = Op.getOpcode();
24453 if (Opc != ISD::OR && Opc != ISD::AND)
24454 return false;
24455 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24456 Op.getOperand(0).hasOneUse() &&
24457 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24458 Op.getOperand(1).hasOneUse());
24461 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24462 SDValue Chain = Op.getOperand(0);
24463 SDValue Cond = Op.getOperand(1);
24464 SDValue Dest = Op.getOperand(2);
24465 SDLoc dl(Op);
24467 // Bail out when we don't have native compare instructions.
24468 if (Cond.getOpcode() == ISD::SETCC &&
24469 Cond.getOperand(0).getValueType() != MVT::f128 &&
24470 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24471 SDValue LHS = Cond.getOperand(0);
24472 SDValue RHS = Cond.getOperand(1);
24473 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24475 // Special case for
24476 // setcc([su]{add,sub,mul}o == 0)
24477 // setcc([su]{add,sub,mul}o != 1)
24478 if (ISD::isOverflowIntrOpRes(LHS) &&
24479 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24480 (isNullConstant(RHS) || isOneConstant(RHS))) {
24481 SDValue Value, Overflow;
24482 X86::CondCode X86Cond;
24483 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24485 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24486 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24488 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24489 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24490 Overflow);
24493 if (LHS.getSimpleValueType().isInteger()) {
24494 SDValue CCVal;
24495 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24496 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24497 EFLAGS);
24500 if (CC == ISD::SETOEQ) {
24501 // For FCMP_OEQ, we can emit
24502 // two branches instead of an explicit AND instruction with a
24503 // separate test. However, we only do this if this block doesn't
24504 // have a fall-through edge, because this requires an explicit
24505 // jmp when the condition is false.
24506 if (Op.getNode()->hasOneUse()) {
24507 SDNode *User = *Op.getNode()->use_begin();
24508 // Look for an unconditional branch following this conditional branch.
24509 // We need this because we need to reverse the successors in order
24510 // to implement FCMP_OEQ.
24511 if (User->getOpcode() == ISD::BR) {
24512 SDValue FalseBB = User->getOperand(1);
24513 SDNode *NewBR =
24514 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24515 assert(NewBR == User);
24516 (void)NewBR;
24517 Dest = FalseBB;
24519 SDValue Cmp =
24520 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24521 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24522 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24523 CCVal, Cmp);
24524 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24525 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24526 Cmp);
24529 } else if (CC == ISD::SETUNE) {
24530 // For FCMP_UNE, we can emit
24531 // two branches instead of an explicit OR instruction with a
24532 // separate test.
24533 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24534 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24535 Chain =
24536 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24537 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24538 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539 Cmp);
24540 } else {
24541 X86::CondCode X86Cond =
24542 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24543 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24544 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24545 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546 Cmp);
24550 if (ISD::isOverflowIntrOpRes(Cond)) {
24551 SDValue Value, Overflow;
24552 X86::CondCode X86Cond;
24553 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24555 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24556 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24557 Overflow);
24560 // Look past the truncate if the high bits are known zero.
24561 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24562 Cond = Cond.getOperand(0);
24564 EVT CondVT = Cond.getValueType();
24566 // Add an AND with 1 if we don't already have one.
24567 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24568 Cond =
24569 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24571 SDValue LHS = Cond;
24572 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24574 SDValue CCVal;
24575 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24576 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24577 EFLAGS);
24580 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24581 // Calls to _alloca are needed to probe the stack when allocating more than 4k
24582 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
24583 // that the guard pages used by the OS virtual memory manager are allocated in
24584 // correct sequence.
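// Illustration: a 12K allocation must touch the stack at roughly SP-4K, SP-8K
// and SP-12K, in that order, so each guard page is committed before the next
// one is reached; jumping straight to SP-12K could skip a guard page and fault.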
24585 SDValue
24586 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24587 SelectionDAG &DAG) const {
24588 MachineFunction &MF = DAG.getMachineFunction();
24589 bool SplitStack = MF.shouldSplitStack();
24590 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24591 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24592 SplitStack || EmitStackProbeCall;
24593 SDLoc dl(Op);
24595 // Get the inputs.
24596 SDNode *Node = Op.getNode();
24597 SDValue Chain = Op.getOperand(0);
24598 SDValue Size = Op.getOperand(1);
24599 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24600 EVT VT = Node->getValueType(0);
24602 // Chain the dynamic stack allocation so that it doesn't modify the stack
24603 // pointer when other instructions are using the stack.
24604 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24606 bool Is64Bit = Subtarget.is64Bit();
24607 MVT SPTy = getPointerTy(DAG.getDataLayout());
24609 SDValue Result;
24610 if (!Lower) {
24611 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24612 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24613 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24614 " not tell us which reg is the stack pointer!");
24616 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24617 const Align StackAlign = TFI.getStackAlign();
24618 if (hasInlineStackProbe(MF)) {
24619 MachineRegisterInfo &MRI = MF.getRegInfo();
24621 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24622 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24623 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24624 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24625 DAG.getRegister(Vreg, SPTy));
24626 } else {
24627 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24628 Chain = SP.getValue(1);
24629 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24631 if (Alignment && *Alignment > StackAlign)
24632 Result =
24633 DAG.getNode(ISD::AND, dl, VT, Result,
24634 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24635 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24636 } else if (SplitStack) {
24637 MachineRegisterInfo &MRI = MF.getRegInfo();
24639 if (Is64Bit) {
24640 // The 64-bit implementation of segmented stacks needs to clobber both r10
24641 // and r11. This makes it impossible to use it along with nested parameters.
24642 const Function &F = MF.getFunction();
24643 for (const auto &A : F.args()) {
24644 if (A.hasNestAttr())
24645 report_fatal_error("Cannot use segmented stacks with functions that "
24646 "have nested arguments.");
24650 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24651 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24652 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24653 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24654 DAG.getRegister(Vreg, SPTy));
24655 } else {
24656 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24657 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24658 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24660 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24661 Register SPReg = RegInfo->getStackRegister();
24662 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24663 Chain = SP.getValue(1);
24665 if (Alignment) {
24666 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24667 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24668 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24671 Result = SP;
24674 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24676 SDValue Ops[2] = {Result, Chain};
24677 return DAG.getMergeValues(Ops, dl);
24680 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24681 MachineFunction &MF = DAG.getMachineFunction();
24682 auto PtrVT = getPointerTy(MF.getDataLayout());
24683 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24685 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24686 SDLoc DL(Op);
24688 if (!Subtarget.is64Bit() ||
24689 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24690 // vastart just stores the address of the VarArgsFrameIndex slot into the
24691 // memory location argument.
24692 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24693 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24694 MachinePointerInfo(SV));
24697 // __va_list_tag:
24698 // gp_offset (0 - 6 * 8)
24699 // fp_offset (48 - 48 + 8 * 16)
24700 // overflow_arg_area (point to parameters coming in memory).
24701 // reg_save_area
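// For reference, the SysV AMD64 va_list element is laid out roughly as:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };
// matching the four stores below at offsets 0, 4, 8 and 16 (12 for x32).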
24702 SmallVector<SDValue, 8> MemOps;
24703 SDValue FIN = Op.getOperand(1);
24704 // Store gp_offset
24705 SDValue Store = DAG.getStore(
24706 Op.getOperand(0), DL,
24707 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24708 MachinePointerInfo(SV));
24709 MemOps.push_back(Store);
24711 // Store fp_offset
24712 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24713 Store = DAG.getStore(
24714 Op.getOperand(0), DL,
24715 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24716 MachinePointerInfo(SV, 4));
24717 MemOps.push_back(Store);
24719 // Store ptr to overflow_arg_area
24720 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24721 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24722 Store =
24723 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24724 MemOps.push_back(Store);
24726 // Store ptr to reg_save_area.
24727 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24728 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24729 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24730 Store = DAG.getStore(
24731 Op.getOperand(0), DL, RSFIN, FIN,
24732 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24733 MemOps.push_back(Store);
24734 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24737 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24738 assert(Subtarget.is64Bit() &&
24739 "LowerVAARG only handles 64-bit va_arg!");
24740 assert(Op.getNumOperands() == 4);
24742 MachineFunction &MF = DAG.getMachineFunction();
24743 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24744 // The Win64 ABI uses char* instead of a structure.
24745 return DAG.expandVAArg(Op.getNode());
24747 SDValue Chain = Op.getOperand(0);
24748 SDValue SrcPtr = Op.getOperand(1);
24749 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24750 unsigned Align = Op.getConstantOperandVal(3);
24751 SDLoc dl(Op);
24753 EVT ArgVT = Op.getNode()->getValueType(0);
24754 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24755 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24756 uint8_t ArgMode;
24758 // Decide which area this value should be read from.
24759 // TODO: Implement the AMD64 ABI in its entirety. This simple
24760 // selection mechanism works only for the basic types.
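// Illustrative mapping for the cases handled here: float/double values are
// read from the XMM save area via fp_offset (ArgMode = 2), while integers and
// pointers are read from the GPR save area via gp_offset (ArgMode = 1).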
24761 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24762 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24763 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24764 } else {
24765 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24766 "Unhandled argument type in LowerVAARG");
24767 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24770 if (ArgMode == 2) {
24771 // Make sure using fp_offset makes sense.
24772 assert(!Subtarget.useSoftFloat() &&
24773 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24774 Subtarget.hasSSE1());
24777 // Insert VAARG node into the DAG
24778 // VAARG returns two values: Variable Argument Address, Chain
24779 SDValue InstOps[] = {Chain, SrcPtr,
24780 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24781 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24782 DAG.getTargetConstant(Align, dl, MVT::i32)};
24783 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24784 SDValue VAARG = DAG.getMemIntrinsicNode(
24785 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24786 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24787 /*Alignment=*/std::nullopt,
24788 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24789 Chain = VAARG.getValue(1);
24791 // Load the next argument and return it
24792 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24795 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24796 SelectionDAG &DAG) {
24797 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24798 // where a va_list is still an i8*.
24799 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24800 if (Subtarget.isCallingConvWin64(
24801 DAG.getMachineFunction().getFunction().getCallingConv()))
24802 // Probably a Win64 va_copy.
24803 return DAG.expandVACopy(Op.getNode());
24805 SDValue Chain = Op.getOperand(0);
24806 SDValue DstPtr = Op.getOperand(1);
24807 SDValue SrcPtr = Op.getOperand(2);
24808 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24809 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24810 SDLoc DL(Op);
24812 return DAG.getMemcpy(
24813 Chain, DL, DstPtr, SrcPtr,
24814 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24815 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24816 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24819 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24820 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24821 switch (Opc) {
24822 case ISD::SHL:
24823 case X86ISD::VSHL:
24824 case X86ISD::VSHLI:
24825 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24826 case ISD::SRL:
24827 case X86ISD::VSRL:
24828 case X86ISD::VSRLI:
24829 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24830 case ISD::SRA:
24831 case X86ISD::VSRA:
24832 case X86ISD::VSRAI:
24833 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24835 llvm_unreachable("Unknown target vector shift node");
24838 /// Handle vector element shifts where the shift amount is a constant.
24839 /// Takes immediate version of shift as input.
24840 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24841 SDValue SrcOp, uint64_t ShiftAmt,
24842 SelectionDAG &DAG) {
24843 MVT ElementType = VT.getVectorElementType();
24845 // Bitcast the source vector to the output type; this is mainly necessary for
24846 // vXi8/vXi64 shifts.
24847 if (VT != SrcOp.getSimpleValueType())
24848 SrcOp = DAG.getBitcast(VT, SrcOp);
24850 // Fold this packed shift into its first operand if ShiftAmt is 0.
24851 if (ShiftAmt == 0)
24852 return SrcOp;
24854 // Check for ShiftAmt >= element width
24855 if (ShiftAmt >= ElementType.getSizeInBits()) {
24856 if (Opc == X86ISD::VSRAI)
24857 ShiftAmt = ElementType.getSizeInBits() - 1;
24858 else
24859 return DAG.getConstant(0, dl, VT);
24862 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24863 && "Unknown target vector shift-by-constant node");
24865 // Fold this packed vector shift into a build vector if SrcOp is a
24866 // vector of Constants or UNDEFs.
24867 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24868 unsigned ShiftOpc;
24869 switch (Opc) {
24870 default: llvm_unreachable("Unknown opcode!");
24871 case X86ISD::VSHLI:
24872 ShiftOpc = ISD::SHL;
24873 break;
24874 case X86ISD::VSRLI:
24875 ShiftOpc = ISD::SRL;
24876 break;
24877 case X86ISD::VSRAI:
24878 ShiftOpc = ISD::SRA;
24879 break;
24882 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
24883 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
24884 return C;
24887 return DAG.getNode(Opc, dl, VT, SrcOp,
24888 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24891 /// Handle vector element shifts by a splat shift amount
24892 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24893 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
24894 const X86Subtarget &Subtarget,
24895 SelectionDAG &DAG) {
24896 MVT AmtVT = ShAmt.getSimpleValueType();
24897 assert(AmtVT.isVector() && "Vector shift type mismatch");
24898 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
24899 "Illegal vector splat index");
24901 // Move the splat element to the bottom element.
24902 if (ShAmtIdx != 0) {
24903 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
24904 Mask[0] = ShAmtIdx;
24905 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
24908 // Peek through any zext node if we can get back to a 128-bit source.
24909 if (AmtVT.getScalarSizeInBits() == 64 &&
24910 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
24911 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
24912 ShAmt.getOperand(0).getValueType().isSimple() &&
24913 ShAmt.getOperand(0).getValueType().is128BitVector()) {
24914 ShAmt = ShAmt.getOperand(0);
24915 AmtVT = ShAmt.getSimpleValueType();
24918 // See if we can mask off the upper elements using the existing source node.
24919 // The shift uses the entire lower 64-bits of the amount vector, so no need to
24920 // do this for vXi64 types.
24921 bool IsMasked = false;
24922 if (AmtVT.getScalarSizeInBits() < 64) {
24923 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
24924 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
24925 // If the shift amount has come from a scalar, then zero-extend the scalar
24926 // before moving to the vector.
24927 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
24928 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
24929 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
24930 AmtVT = MVT::v4i32;
24931 IsMasked = true;
24932 } else if (ShAmt.getOpcode() == ISD::AND) {
24933 // See if the shift amount is already masked (e.g. for rotation modulo);
24934 // if so, we can zero-extend it by setting all the other mask elements to
24935 // zero.
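// Sketch, assuming ShAmt = (and X, <31,31,31,31>) from a modulo-32 rotate:
//   MaskElts = <-1,0,0,0>, the folded constant becomes <31,0,0,0>, and the
//   rebuilt (and X, <31,0,0,0>) already has zeros above element 0.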
24936 SmallVector<SDValue> MaskElts(
24937 AmtVT.getVectorNumElements(),
24938 DAG.getConstant(0, dl, AmtVT.getScalarType()));
24939 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
24940 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
24941 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
24942 {ShAmt.getOperand(1), Mask}))) {
24943 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
24944 IsMasked = true;
24949 // Extract if the shift amount vector is larger than 128-bits.
24950 if (AmtVT.getSizeInBits() > 128) {
24951 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
24952 AmtVT = ShAmt.getSimpleValueType();
24955 // Zero-extend bottom element to v2i64 vector type, either by extension or
24956 // shuffle masking.
24957 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
24958 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
24959 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
24960 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
24961 } else if (Subtarget.hasSSE41()) {
24962 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24963 MVT::v2i64, ShAmt);
24964 } else {
24965 SDValue ByteShift = DAG.getTargetConstant(
24966 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
24967 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
24968 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24969 ByteShift);
24970 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24971 ByteShift);
24975 // Change opcode to non-immediate version.
24976 Opc = getTargetVShiftUniformOpcode(Opc, true);
24978 // The return type has to be a 128-bit type with the same element
24979 // type as the input type.
24980 MVT EltVT = VT.getVectorElementType();
24981 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
24983 ShAmt = DAG.getBitcast(ShVT, ShAmt);
24984 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
24987 /// Return Mask with the necessary casting or extending
24988 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
24989 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
24990 const X86Subtarget &Subtarget, SelectionDAG &DAG,
24991 const SDLoc &dl) {
24993 if (isAllOnesConstant(Mask))
24994 return DAG.getConstant(1, dl, MaskVT);
24995 if (X86::isZeroNode(Mask))
24996 return DAG.getConstant(0, dl, MaskVT);
24998 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25000 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25001 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25002 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25003 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25004 SDValue Lo, Hi;
25005 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25006 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25007 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25008 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25009 } else {
25010 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25011 Mask.getSimpleValueType().getSizeInBits());
25012 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
25013 // are extracted by EXTRACT_SUBVECTOR.
25014 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25015 DAG.getBitcast(BitcastVT, Mask),
25016 DAG.getIntPtrConstant(0, dl));
25020 /// Return (and \p Op, \p Mask) for compare instructions or
25021 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25022 /// necessary casting or extending for \p Mask when lowering masking intrinsics
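// Sketch, assuming a masked v16f32 add with mask M and pass-through P:
//   getVectorMaskingNode((fadd A, B), M, P, ...)
//     -> (vselect (v16i1 M), (fadd A, B), P)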
25023 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25024 SDValue PreservedSrc,
25025 const X86Subtarget &Subtarget,
25026 SelectionDAG &DAG) {
25027 MVT VT = Op.getSimpleValueType();
25028 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25029 unsigned OpcodeSelect = ISD::VSELECT;
25030 SDLoc dl(Op);
25032 if (isAllOnesConstant(Mask))
25033 return Op;
25035 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25037 if (PreservedSrc.isUndef())
25038 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25039 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25042 /// Creates an SDNode for a predicated scalar operation.
25043 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25044 /// The mask comes in as MVT::i8 and should be transformed
25045 /// to MVT::v1i1 while lowering masking intrinsics.
25046 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25047 /// "X86select" instead of "vselect". We just can't create the "vselect" node
25048 /// for a scalar instruction.
25049 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25050 SDValue PreservedSrc,
25051 const X86Subtarget &Subtarget,
25052 SelectionDAG &DAG) {
25054 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25055 if (MaskConst->getZExtValue() & 0x1)
25056 return Op;
25058 MVT VT = Op.getSimpleValueType();
25059 SDLoc dl(Op);
25061 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25062 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25063 DAG.getBitcast(MVT::v8i1, Mask),
25064 DAG.getIntPtrConstant(0, dl));
25065 if (Op.getOpcode() == X86ISD::FSETCCM ||
25066 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25067 Op.getOpcode() == X86ISD::VFPCLASSS)
25068 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25070 if (PreservedSrc.isUndef())
25071 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25072 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25075 static int getSEHRegistrationNodeSize(const Function *Fn) {
25076 if (!Fn->hasPersonalityFn())
25077 report_fatal_error(
25078 "querying registration node size for function without personality");
25079 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25080 // WinEHStatePass for the full struct definition.
25081 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25082 case EHPersonality::MSVC_X86SEH: return 24;
25083 case EHPersonality::MSVC_CXX: return 16;
25084 default: break;
25086 report_fatal_error(
25087 "can only recover FP for 32-bit MSVC EH personality functions");
25090 /// When the MSVC runtime transfers control to us, either to an outlined
25091 /// function or when returning to a parent frame after catching an exception, we
25092 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25093 /// Here's the math:
25094 /// RegNodeBase = EntryEBP - RegNodeSize
25095 /// ParentFP = RegNodeBase - ParentFrameOffset
25096 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
25097 /// subtracting the offset (negative on x86) takes us back to the parent FP.
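/// Worked example with purely illustrative numbers (32-bit MSVC C++ EH, so
/// RegNodeSize = 16): if EntryEBP = 0x1000 and ParentFrameOffset = -0x40, then
///   RegNodeBase = 0x1000 - 16      = 0x0FF0
///   ParentFP    = 0x0FF0 - (-0x40) = 0x1030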
25098 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25099 SDValue EntryEBP) {
25100 MachineFunction &MF = DAG.getMachineFunction();
25101 SDLoc dl;
25103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25104 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25106 // It's possible that the parent function no longer has a personality function
25107 // if the exceptional code was optimized away, in which case we just return
25108 // the incoming EBP.
25109 if (!Fn->hasPersonalityFn())
25110 return EntryEBP;
25112 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25113 // registration, or the .set_setframe offset.
25114 MCSymbol *OffsetSym =
25115 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25116 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25117 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25118 SDValue ParentFrameOffset =
25119 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25121 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25122 // prologue to RBP in the parent function.
25123 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25124 if (Subtarget.is64Bit())
25125 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25127 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25128 // RegNodeBase = EntryEBP - RegNodeSize
25129 // ParentFP = RegNodeBase - ParentFrameOffset
25130 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25131 DAG.getConstant(RegNodeSize, dl, PtrVT));
25132 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25135 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25136 SelectionDAG &DAG) const {
25137 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25138 auto isRoundModeCurDirection = [](SDValue Rnd) {
25139 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25140 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25142 return false;
25144 auto isRoundModeSAE = [](SDValue Rnd) {
25145 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25146 unsigned RC = C->getZExtValue();
25147 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25148 // Clear the NO_EXC bit and check remaining bits.
25149 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25150 // As a convenience we allow either no other bits set, or explicitly
25151 // the current-direction rounding mode.
25152 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25156 return false;
25158 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25159 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25160 RC = C->getZExtValue();
25161 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25162 // Clear the NO_EXC bit and check remaining bits.
25163 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25164 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25165 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25166 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25167 RC == X86::STATIC_ROUNDING::TO_ZERO;
25171 return false;
25174 SDLoc dl(Op);
25175 unsigned IntNo = Op.getConstantOperandVal(0);
25176 MVT VT = Op.getSimpleValueType();
25177 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25179 // Propagate flags from original node to transformed node(s).
25180 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25182 if (IntrData) {
25183 switch(IntrData->Type) {
25184 case INTR_TYPE_1OP: {
25185 // We specify 2 possible opcodes for intrinsics with rounding modes.
25186 // First, we check if the intrinsic may have non-default rounding mode,
25187 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25188 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25189 if (IntrWithRoundingModeOpcode != 0) {
25190 SDValue Rnd = Op.getOperand(2);
25191 unsigned RC = 0;
25192 if (isRoundModeSAEToX(Rnd, RC))
25193 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25194 Op.getOperand(1),
25195 DAG.getTargetConstant(RC, dl, MVT::i32));
25196 if (!isRoundModeCurDirection(Rnd))
25197 return SDValue();
25199 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25200 Op.getOperand(1));
25202 case INTR_TYPE_1OP_SAE: {
25203 SDValue Sae = Op.getOperand(2);
25205 unsigned Opc;
25206 if (isRoundModeCurDirection(Sae))
25207 Opc = IntrData->Opc0;
25208 else if (isRoundModeSAE(Sae))
25209 Opc = IntrData->Opc1;
25210 else
25211 return SDValue();
25213 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25215 case INTR_TYPE_2OP: {
25216 SDValue Src2 = Op.getOperand(2);
25218 // We specify 2 possible opcodes for intrinsics with rounding modes.
25219 // First, we check if the intrinsic may have non-default rounding mode,
25220 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25221 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25222 if (IntrWithRoundingModeOpcode != 0) {
25223 SDValue Rnd = Op.getOperand(3);
25224 unsigned RC = 0;
25225 if (isRoundModeSAEToX(Rnd, RC))
25226 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25227 Op.getOperand(1), Src2,
25228 DAG.getTargetConstant(RC, dl, MVT::i32));
25229 if (!isRoundModeCurDirection(Rnd))
25230 return SDValue();
25233 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25234 Op.getOperand(1), Src2);
25236 case INTR_TYPE_2OP_SAE: {
25237 SDValue Sae = Op.getOperand(3);
25239 unsigned Opc;
25240 if (isRoundModeCurDirection(Sae))
25241 Opc = IntrData->Opc0;
25242 else if (isRoundModeSAE(Sae))
25243 Opc = IntrData->Opc1;
25244 else
25245 return SDValue();
25247 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25248 Op.getOperand(2));
25250 case INTR_TYPE_3OP:
25251 case INTR_TYPE_3OP_IMM8: {
25252 SDValue Src1 = Op.getOperand(1);
25253 SDValue Src2 = Op.getOperand(2);
25254 SDValue Src3 = Op.getOperand(3);
25256 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25257 Src3.getValueType() != MVT::i8) {
25258 Src3 = DAG.getTargetConstant(
25259 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25262 // We specify 2 possible opcodes for intrinsics with rounding modes.
25263 // First, we check if the intrinsic may have non-default rounding mode,
25264 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25265 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25266 if (IntrWithRoundingModeOpcode != 0) {
25267 SDValue Rnd = Op.getOperand(4);
25268 unsigned RC = 0;
25269 if (isRoundModeSAEToX(Rnd, RC))
25270 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25271 Src1, Src2, Src3,
25272 DAG.getTargetConstant(RC, dl, MVT::i32));
25273 if (!isRoundModeCurDirection(Rnd))
25274 return SDValue();
25277 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25278 {Src1, Src2, Src3});
25280 case INTR_TYPE_4OP_IMM8: {
25281 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25282 SDValue Src4 = Op.getOperand(4);
25283 if (Src4.getValueType() != MVT::i8) {
25284 Src4 = DAG.getTargetConstant(
25285 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25288 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25289 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25290 Src4);
25292 case INTR_TYPE_1OP_MASK: {
25293 SDValue Src = Op.getOperand(1);
25294 SDValue PassThru = Op.getOperand(2);
25295 SDValue Mask = Op.getOperand(3);
25296 // We add rounding mode to the Node when
25297 // - RC Opcode is specified and
25298 // - RC is not "current direction".
25299 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25300 if (IntrWithRoundingModeOpcode != 0) {
25301 SDValue Rnd = Op.getOperand(4);
25302 unsigned RC = 0;
25303 if (isRoundModeSAEToX(Rnd, RC))
25304 return getVectorMaskingNode(
25305 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25306 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25307 Mask, PassThru, Subtarget, DAG);
25308 if (!isRoundModeCurDirection(Rnd))
25309 return SDValue();
25311 return getVectorMaskingNode(
25312 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25313 Subtarget, DAG);
25315 case INTR_TYPE_1OP_MASK_SAE: {
25316 SDValue Src = Op.getOperand(1);
25317 SDValue PassThru = Op.getOperand(2);
25318 SDValue Mask = Op.getOperand(3);
25319 SDValue Rnd = Op.getOperand(4);
25321 unsigned Opc;
25322 if (isRoundModeCurDirection(Rnd))
25323 Opc = IntrData->Opc0;
25324 else if (isRoundModeSAE(Rnd))
25325 Opc = IntrData->Opc1;
25326 else
25327 return SDValue();
25329 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25330 Subtarget, DAG);
25332 case INTR_TYPE_SCALAR_MASK: {
25333 SDValue Src1 = Op.getOperand(1);
25334 SDValue Src2 = Op.getOperand(2);
25335 SDValue passThru = Op.getOperand(3);
25336 SDValue Mask = Op.getOperand(4);
25337 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25338 // There are 2 kinds of intrinsics in this group:
25339 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25340 // (2) With rounding mode and sae - 7 operands.
25341 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25342 if (Op.getNumOperands() == (5U + HasRounding)) {
25343 if (HasRounding) {
25344 SDValue Rnd = Op.getOperand(5);
25345 unsigned RC = 0;
25346 if (isRoundModeSAEToX(Rnd, RC))
25347 return getScalarMaskingNode(
25348 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25349 DAG.getTargetConstant(RC, dl, MVT::i32)),
25350 Mask, passThru, Subtarget, DAG);
25351 if (!isRoundModeCurDirection(Rnd))
25352 return SDValue();
25354 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25355 Src2),
25356 Mask, passThru, Subtarget, DAG);
25359 assert(Op.getNumOperands() == (6U + HasRounding) &&
25360 "Unexpected intrinsic form");
25361 SDValue RoundingMode = Op.getOperand(5);
25362 unsigned Opc = IntrData->Opc0;
25363 if (HasRounding) {
25364 SDValue Sae = Op.getOperand(6);
25365 if (isRoundModeSAE(Sae))
25366 Opc = IntrWithRoundingModeOpcode;
25367 else if (!isRoundModeCurDirection(Sae))
25368 return SDValue();
25370 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25371 Src2, RoundingMode),
25372 Mask, passThru, Subtarget, DAG);
25374 case INTR_TYPE_SCALAR_MASK_RND: {
25375 SDValue Src1 = Op.getOperand(1);
25376 SDValue Src2 = Op.getOperand(2);
25377 SDValue passThru = Op.getOperand(3);
25378 SDValue Mask = Op.getOperand(4);
25379 SDValue Rnd = Op.getOperand(5);
25381 SDValue NewOp;
25382 unsigned RC = 0;
25383 if (isRoundModeCurDirection(Rnd))
25384 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25385 else if (isRoundModeSAEToX(Rnd, RC))
25386 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25387 DAG.getTargetConstant(RC, dl, MVT::i32));
25388 else
25389 return SDValue();
25391 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25393 case INTR_TYPE_SCALAR_MASK_SAE: {
25394 SDValue Src1 = Op.getOperand(1);
25395 SDValue Src2 = Op.getOperand(2);
25396 SDValue passThru = Op.getOperand(3);
25397 SDValue Mask = Op.getOperand(4);
25398 SDValue Sae = Op.getOperand(5);
25399 unsigned Opc;
25400 if (isRoundModeCurDirection(Sae))
25401 Opc = IntrData->Opc0;
25402 else if (isRoundModeSAE(Sae))
25403 Opc = IntrData->Opc1;
25404 else
25405 return SDValue();
25407 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25408 Mask, passThru, Subtarget, DAG);
25410 case INTR_TYPE_2OP_MASK: {
25411 SDValue Src1 = Op.getOperand(1);
25412 SDValue Src2 = Op.getOperand(2);
25413 SDValue PassThru = Op.getOperand(3);
25414 SDValue Mask = Op.getOperand(4);
25415 SDValue NewOp;
25416 if (IntrData->Opc1 != 0) {
25417 SDValue Rnd = Op.getOperand(5);
25418 unsigned RC = 0;
25419 if (isRoundModeSAEToX(Rnd, RC))
25420 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25421 DAG.getTargetConstant(RC, dl, MVT::i32));
25422 else if (!isRoundModeCurDirection(Rnd))
25423 return SDValue();
25425 if (!NewOp)
25426 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25427 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25429 case INTR_TYPE_2OP_MASK_SAE: {
25430 SDValue Src1 = Op.getOperand(1);
25431 SDValue Src2 = Op.getOperand(2);
25432 SDValue PassThru = Op.getOperand(3);
25433 SDValue Mask = Op.getOperand(4);
25435 unsigned Opc = IntrData->Opc0;
25436 if (IntrData->Opc1 != 0) {
25437 SDValue Sae = Op.getOperand(5);
25438 if (isRoundModeSAE(Sae))
25439 Opc = IntrData->Opc1;
25440 else if (!isRoundModeCurDirection(Sae))
25441 return SDValue();
25444 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25445 Mask, PassThru, Subtarget, DAG);
25447 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25448 SDValue Src1 = Op.getOperand(1);
25449 SDValue Src2 = Op.getOperand(2);
25450 SDValue Src3 = Op.getOperand(3);
25451 SDValue PassThru = Op.getOperand(4);
25452 SDValue Mask = Op.getOperand(5);
25453 SDValue Sae = Op.getOperand(6);
25454 unsigned Opc;
25455 if (isRoundModeCurDirection(Sae))
25456 Opc = IntrData->Opc0;
25457 else if (isRoundModeSAE(Sae))
25458 Opc = IntrData->Opc1;
25459 else
25460 return SDValue();
25462 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25463 Mask, PassThru, Subtarget, DAG);
25465 case INTR_TYPE_3OP_MASK_SAE: {
25466 SDValue Src1 = Op.getOperand(1);
25467 SDValue Src2 = Op.getOperand(2);
25468 SDValue Src3 = Op.getOperand(3);
25469 SDValue PassThru = Op.getOperand(4);
25470 SDValue Mask = Op.getOperand(5);
25472 unsigned Opc = IntrData->Opc0;
25473 if (IntrData->Opc1 != 0) {
25474 SDValue Sae = Op.getOperand(6);
25475 if (isRoundModeSAE(Sae))
25476 Opc = IntrData->Opc1;
25477 else if (!isRoundModeCurDirection(Sae))
25478 return SDValue();
25480 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25481 Mask, PassThru, Subtarget, DAG);
25483 case BLENDV: {
25484 SDValue Src1 = Op.getOperand(1);
25485 SDValue Src2 = Op.getOperand(2);
25486 SDValue Src3 = Op.getOperand(3);
25488 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25489 Src3 = DAG.getBitcast(MaskVT, Src3);
25491 // Reverse the operands to match VSELECT order.
25492 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25494 case VPERM_2OP : {
25495 SDValue Src1 = Op.getOperand(1);
25496 SDValue Src2 = Op.getOperand(2);
25498 // Swap Src1 and Src2 in the node creation
25499 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25501 case CFMA_OP_MASKZ:
25502 case CFMA_OP_MASK: {
25503 SDValue Src1 = Op.getOperand(1);
25504 SDValue Src2 = Op.getOperand(2);
25505 SDValue Src3 = Op.getOperand(3);
25506 SDValue Mask = Op.getOperand(4);
25507 MVT VT = Op.getSimpleValueType();
25509 SDValue PassThru = Src3;
25510 if (IntrData->Type == CFMA_OP_MASKZ)
25511 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25513 // We add rounding mode to the Node when
25514 // - RC Opcode is specified and
25515 // - RC is not "current direction".
25516 SDValue NewOp;
25517 if (IntrData->Opc1 != 0) {
25518 SDValue Rnd = Op.getOperand(5);
25519 unsigned RC = 0;
25520 if (isRoundModeSAEToX(Rnd, RC))
25521 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25522 DAG.getTargetConstant(RC, dl, MVT::i32));
25523 else if (!isRoundModeCurDirection(Rnd))
25524 return SDValue();
25526 if (!NewOp)
25527 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25528 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25530 case IFMA_OP:
25531 // NOTE: We need to swizzle the operands to pass the multiply operands
25532 // first.
25533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25534 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25535 case FPCLASSS: {
25536 SDValue Src1 = Op.getOperand(1);
25537 SDValue Imm = Op.getOperand(2);
25538 SDValue Mask = Op.getOperand(3);
25539 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25540 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25541 Subtarget, DAG);
25542 // Need to fill with zeros to ensure the bitcast will produce zeroes
25543 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25544 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25545 DAG.getConstant(0, dl, MVT::v8i1),
25546 FPclassMask, DAG.getIntPtrConstant(0, dl));
25547 return DAG.getBitcast(MVT::i8, Ins);
25550 case CMP_MASK_CC: {
25551 MVT MaskVT = Op.getSimpleValueType();
25552 SDValue CC = Op.getOperand(3);
25553 SDValue Mask = Op.getOperand(4);
25554 // We specify 2 possible opcodes for intrinsics with rounding modes.
25555 // First, we check if the intrinsic may have non-default rounding mode,
25556 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25557 if (IntrData->Opc1 != 0) {
25558 SDValue Sae = Op.getOperand(5);
25559 if (isRoundModeSAE(Sae))
25560 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25561 Op.getOperand(2), CC, Mask, Sae);
25562 if (!isRoundModeCurDirection(Sae))
25563 return SDValue();
25565 // Default rounding mode.
25566 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25567 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25569 case CMP_MASK_SCALAR_CC: {
25570 SDValue Src1 = Op.getOperand(1);
25571 SDValue Src2 = Op.getOperand(2);
25572 SDValue CC = Op.getOperand(3);
25573 SDValue Mask = Op.getOperand(4);
25575 SDValue Cmp;
25576 if (IntrData->Opc1 != 0) {
25577 SDValue Sae = Op.getOperand(5);
25578 if (isRoundModeSAE(Sae))
25579 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25580 else if (!isRoundModeCurDirection(Sae))
25581 return SDValue();
25583 // Default rounding mode.
25584 if (!Cmp.getNode())
25585 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25587 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25588 Subtarget, DAG);
25589 // Need to fill with zeros to ensure the bitcast will produce zeroes
25590 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25591 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25592 DAG.getConstant(0, dl, MVT::v8i1),
25593 CmpMask, DAG.getIntPtrConstant(0, dl));
25594 return DAG.getBitcast(MVT::i8, Ins);
25596 case COMI: { // Comparison intrinsics
25597 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25598 SDValue LHS = Op.getOperand(1);
25599 SDValue RHS = Op.getOperand(2);
25600 // Some conditions require the operands to be swapped.
25601 if (CC == ISD::SETLT || CC == ISD::SETLE)
25602 std::swap(LHS, RHS);
25604 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25605 SDValue SetCC;
25606 switch (CC) {
25607 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25608 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25609 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25610 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25611 break;
25613 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25614 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25615 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25616 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25617 break;
25619 case ISD::SETGT: // (CF = 0 and ZF = 0)
25620 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25621 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25622 break;
25624 case ISD::SETGE: // CF = 0
25625 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25626 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25627 break;
25628 default:
25629 llvm_unreachable("Unexpected illegal condition!");
25631 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25633 case COMI_RM: { // Comparison intrinsics with Sae
25634 SDValue LHS = Op.getOperand(1);
25635 SDValue RHS = Op.getOperand(2);
25636 unsigned CondVal = Op.getConstantOperandVal(3);
25637 SDValue Sae = Op.getOperand(4);
25639 SDValue FCmp;
25640 if (isRoundModeCurDirection(Sae))
25641 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25642 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25643 else if (isRoundModeSAE(Sae))
25644 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25645 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25646 else
25647 return SDValue();
25648 // Need to fill with zeros to ensure the bitcast will produce zeroes
25649 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25650 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25651 DAG.getConstant(0, dl, MVT::v16i1),
25652 FCmp, DAG.getIntPtrConstant(0, dl));
25653 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25654 DAG.getBitcast(MVT::i16, Ins));
25656 case VSHIFT: {
25657 SDValue SrcOp = Op.getOperand(1);
25658 SDValue ShAmt = Op.getOperand(2);
25659 assert(ShAmt.getValueType() == MVT::i32 &&
25660 "Unexpected VSHIFT amount type");
25662 // Catch shift-by-constant.
25663 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25664 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25665 Op.getSimpleValueType(), SrcOp,
25666 CShAmt->getZExtValue(), DAG);
25668 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25669 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25670 SrcOp, ShAmt, 0, Subtarget, DAG);
25672 case COMPRESS_EXPAND_IN_REG: {
25673 SDValue Mask = Op.getOperand(3);
25674 SDValue DataToCompress = Op.getOperand(1);
25675 SDValue PassThru = Op.getOperand(2);
25676 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25677 return Op.getOperand(1);
25679 // Avoid false dependency.
25680 if (PassThru.isUndef())
25681 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25683 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25684 Mask);
25686 case FIXUPIMM:
25687 case FIXUPIMM_MASKZ: {
25688 SDValue Src1 = Op.getOperand(1);
25689 SDValue Src2 = Op.getOperand(2);
25690 SDValue Src3 = Op.getOperand(3);
25691 SDValue Imm = Op.getOperand(4);
25692 SDValue Mask = Op.getOperand(5);
25693 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25694 ? Src1
25695 : getZeroVector(VT, Subtarget, DAG, dl);
25697 unsigned Opc = IntrData->Opc0;
25698 if (IntrData->Opc1 != 0) {
25699 SDValue Sae = Op.getOperand(6);
25700 if (isRoundModeSAE(Sae))
25701 Opc = IntrData->Opc1;
25702 else if (!isRoundModeCurDirection(Sae))
25703 return SDValue();
25706 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25708 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25709 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25711 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25713 case ROUNDP: {
25714 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25715 // Clear the upper bits of the rounding immediate so that the legacy
25716 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
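// (Illustrative: VRNDSCALE interprets imm[7:4] as a 2^-M scaling factor, so
// e.g. a legacy immediate of 0x1B is reduced to 0x0B here; the low four
// bits, which select the rounding behaviour, are preserved.)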
25717 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25718 SDValue RoundingMode =
25719 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25720 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25721 Op.getOperand(1), RoundingMode);
25723 case ROUNDS: {
25724 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25725 // Clear the upper bits of the rounding immediate so that the legacy
25726 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25727 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25728 SDValue RoundingMode =
25729 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25730 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25731 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25733 case BEXTRI: {
25734 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25736 uint64_t Imm = Op.getConstantOperandVal(2);
25737 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25738 Op.getValueType());
25739 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740 Op.getOperand(1), Control);
25742 // ADC/SBB
25743 case ADX: {
25744 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25745 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25747 SDValue Res;
25748 // If the carry in is zero, then we should just use ADD/SUB instead of
25749 // ADC/SBB.
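// (Otherwise, CF is materialized by adding 0xFF to the i8 carry-in: any
// nonzero carry wraps the 8-bit add and sets CF, a zero carry does not.
// That regenerated CF then feeds the ADC/SBB node built in the else branch
// below.)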
25750 if (isNullConstant(Op.getOperand(1))) {
25751 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25752 Op.getOperand(3));
25753 } else {
25754 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25755 DAG.getConstant(-1, dl, MVT::i8));
25756 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25757 Op.getOperand(3), GenCF.getValue(1));
25759 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25760 SDValue Results[] = { SetCC, Res };
25761 return DAG.getMergeValues(Results, dl);
25763 case CVTPD2PS_MASK:
25764 case CVTPD2DQ_MASK:
25765 case CVTQQ2PS_MASK:
25766 case TRUNCATE_TO_REG: {
25767 SDValue Src = Op.getOperand(1);
25768 SDValue PassThru = Op.getOperand(2);
25769 SDValue Mask = Op.getOperand(3);
25771 if (isAllOnesConstant(Mask))
25772 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25774 MVT SrcVT = Src.getSimpleValueType();
25775 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25776 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25777 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25778 {Src, PassThru, Mask});
25780 case CVTPS2PH_MASK: {
25781 SDValue Src = Op.getOperand(1);
25782 SDValue Rnd = Op.getOperand(2);
25783 SDValue PassThru = Op.getOperand(3);
25784 SDValue Mask = Op.getOperand(4);
25786 unsigned RC = 0;
25787 unsigned Opc = IntrData->Opc0;
25788 bool SAE = Src.getValueType().is512BitVector() &&
25789 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
25790 if (SAE) {
25791 Opc = X86ISD::CVTPS2PH_SAE;
25792 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
25795 if (isAllOnesConstant(Mask))
25796 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
25798 if (SAE)
25799 Opc = X86ISD::MCVTPS2PH_SAE;
25800 else
25801 Opc = IntrData->Opc1;
25802 MVT SrcVT = Src.getSimpleValueType();
25803 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25804 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25805 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
25807 case CVTNEPS2BF16_MASK: {
25808 SDValue Src = Op.getOperand(1);
25809 SDValue PassThru = Op.getOperand(2);
25810 SDValue Mask = Op.getOperand(3);
25812 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25813 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25815 // Break false dependency.
25816 if (PassThru.isUndef())
25817 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25819 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25820 Mask);
25822 default:
25823 break;
25827 switch (IntNo) {
25828 default: return SDValue(); // Don't custom lower most intrinsics.
25830 // ptest and testp intrinsics. The intrinsics these come from are designed to
25831 // return an integer value, not just an instruction, so lower them to the
25832 // ptest or testp pattern plus a setcc for the result.
25833 case Intrinsic::x86_avx512_ktestc_b:
25834 case Intrinsic::x86_avx512_ktestc_w:
25835 case Intrinsic::x86_avx512_ktestc_d:
25836 case Intrinsic::x86_avx512_ktestc_q:
25837 case Intrinsic::x86_avx512_ktestz_b:
25838 case Intrinsic::x86_avx512_ktestz_w:
25839 case Intrinsic::x86_avx512_ktestz_d:
25840 case Intrinsic::x86_avx512_ktestz_q:
25841 case Intrinsic::x86_sse41_ptestz:
25842 case Intrinsic::x86_sse41_ptestc:
25843 case Intrinsic::x86_sse41_ptestnzc:
25844 case Intrinsic::x86_avx_ptestz_256:
25845 case Intrinsic::x86_avx_ptestc_256:
25846 case Intrinsic::x86_avx_ptestnzc_256:
25847 case Intrinsic::x86_avx_vtestz_ps:
25848 case Intrinsic::x86_avx_vtestc_ps:
25849 case Intrinsic::x86_avx_vtestnzc_ps:
25850 case Intrinsic::x86_avx_vtestz_pd:
25851 case Intrinsic::x86_avx_vtestc_pd:
25852 case Intrinsic::x86_avx_vtestnzc_pd:
25853 case Intrinsic::x86_avx_vtestz_ps_256:
25854 case Intrinsic::x86_avx_vtestc_ps_256:
25855 case Intrinsic::x86_avx_vtestnzc_ps_256:
25856 case Intrinsic::x86_avx_vtestz_pd_256:
25857 case Intrinsic::x86_avx_vtestc_pd_256:
25858 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25859 unsigned TestOpc = X86ISD::PTEST;
25860 X86::CondCode X86CC;
25861 switch (IntNo) {
25862 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25863 case Intrinsic::x86_avx512_ktestc_b:
25864 case Intrinsic::x86_avx512_ktestc_w:
25865 case Intrinsic::x86_avx512_ktestc_d:
25866 case Intrinsic::x86_avx512_ktestc_q:
25867 // CF = 1
25868 TestOpc = X86ISD::KTEST;
25869 X86CC = X86::COND_B;
25870 break;
25871 case Intrinsic::x86_avx512_ktestz_b:
25872 case Intrinsic::x86_avx512_ktestz_w:
25873 case Intrinsic::x86_avx512_ktestz_d:
25874 case Intrinsic::x86_avx512_ktestz_q:
25875 TestOpc = X86ISD::KTEST;
25876 X86CC = X86::COND_E;
25877 break;
25878 case Intrinsic::x86_avx_vtestz_ps:
25879 case Intrinsic::x86_avx_vtestz_pd:
25880 case Intrinsic::x86_avx_vtestz_ps_256:
25881 case Intrinsic::x86_avx_vtestz_pd_256:
25882 TestOpc = X86ISD::TESTP;
25883 [[fallthrough]];
25884 case Intrinsic::x86_sse41_ptestz:
25885 case Intrinsic::x86_avx_ptestz_256:
25886 // ZF = 1
25887 X86CC = X86::COND_E;
25888 break;
25889 case Intrinsic::x86_avx_vtestc_ps:
25890 case Intrinsic::x86_avx_vtestc_pd:
25891 case Intrinsic::x86_avx_vtestc_ps_256:
25892 case Intrinsic::x86_avx_vtestc_pd_256:
25893 TestOpc = X86ISD::TESTP;
25894 [[fallthrough]];
25895 case Intrinsic::x86_sse41_ptestc:
25896 case Intrinsic::x86_avx_ptestc_256:
25897 // CF = 1
25898 X86CC = X86::COND_B;
25899 break;
25900 case Intrinsic::x86_avx_vtestnzc_ps:
25901 case Intrinsic::x86_avx_vtestnzc_pd:
25902 case Intrinsic::x86_avx_vtestnzc_ps_256:
25903 case Intrinsic::x86_avx_vtestnzc_pd_256:
25904 TestOpc = X86ISD::TESTP;
25905 [[fallthrough]];
25906 case Intrinsic::x86_sse41_ptestnzc:
25907 case Intrinsic::x86_avx_ptestnzc_256:
25908 // ZF and CF = 0
25909 X86CC = X86::COND_A;
25910 break;
25913 SDValue LHS = Op.getOperand(1);
25914 SDValue RHS = Op.getOperand(2);
25915 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25916 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25917 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25920 case Intrinsic::x86_sse42_pcmpistria128:
25921 case Intrinsic::x86_sse42_pcmpestria128:
25922 case Intrinsic::x86_sse42_pcmpistric128:
25923 case Intrinsic::x86_sse42_pcmpestric128:
25924 case Intrinsic::x86_sse42_pcmpistrio128:
25925 case Intrinsic::x86_sse42_pcmpestrio128:
25926 case Intrinsic::x86_sse42_pcmpistris128:
25927 case Intrinsic::x86_sse42_pcmpestris128:
25928 case Intrinsic::x86_sse42_pcmpistriz128:
25929 case Intrinsic::x86_sse42_pcmpestriz128: {
25930 unsigned Opcode;
25931 X86::CondCode X86CC;
25932 switch (IntNo) {
25933 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25934 case Intrinsic::x86_sse42_pcmpistria128:
25935 Opcode = X86ISD::PCMPISTR;
25936 X86CC = X86::COND_A;
25937 break;
25938 case Intrinsic::x86_sse42_pcmpestria128:
25939 Opcode = X86ISD::PCMPESTR;
25940 X86CC = X86::COND_A;
25941 break;
25942 case Intrinsic::x86_sse42_pcmpistric128:
25943 Opcode = X86ISD::PCMPISTR;
25944 X86CC = X86::COND_B;
25945 break;
25946 case Intrinsic::x86_sse42_pcmpestric128:
25947 Opcode = X86ISD::PCMPESTR;
25948 X86CC = X86::COND_B;
25949 break;
25950 case Intrinsic::x86_sse42_pcmpistrio128:
25951 Opcode = X86ISD::PCMPISTR;
25952 X86CC = X86::COND_O;
25953 break;
25954 case Intrinsic::x86_sse42_pcmpestrio128:
25955 Opcode = X86ISD::PCMPESTR;
25956 X86CC = X86::COND_O;
25957 break;
25958 case Intrinsic::x86_sse42_pcmpistris128:
25959 Opcode = X86ISD::PCMPISTR;
25960 X86CC = X86::COND_S;
25961 break;
25962 case Intrinsic::x86_sse42_pcmpestris128:
25963 Opcode = X86ISD::PCMPESTR;
25964 X86CC = X86::COND_S;
25965 break;
25966 case Intrinsic::x86_sse42_pcmpistriz128:
25967 Opcode = X86ISD::PCMPISTR;
25968 X86CC = X86::COND_E;
25969 break;
25970 case Intrinsic::x86_sse42_pcmpestriz128:
25971 Opcode = X86ISD::PCMPESTR;
25972 X86CC = X86::COND_E;
25973 break;
25975 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
25976 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25977 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25978 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25979 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25982 case Intrinsic::x86_sse42_pcmpistri128:
25983 case Intrinsic::x86_sse42_pcmpestri128: {
25984 unsigned Opcode;
25985 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25986 Opcode = X86ISD::PCMPISTR;
25987 else
25988 Opcode = X86ISD::PCMPESTR;
25990 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
25991 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25992 return DAG.getNode(Opcode, dl, VTs, NewOps);
25995 case Intrinsic::x86_sse42_pcmpistrm128:
25996 case Intrinsic::x86_sse42_pcmpestrm128: {
25997 unsigned Opcode;
25998 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
25999 Opcode = X86ISD::PCMPISTR;
26000 else
26001 Opcode = X86ISD::PCMPESTR;
26003 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26004 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26005 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26008 case Intrinsic::eh_sjlj_lsda: {
26009 MachineFunction &MF = DAG.getMachineFunction();
26010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26011 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26012 auto &Context = MF.getMMI().getContext();
26013 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26014 Twine(MF.getFunctionNumber()));
26015 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26016 DAG.getMCSymbol(S, PtrVT));
26019 case Intrinsic::x86_seh_lsda: {
26020 // Compute the symbol for the LSDA. We know it'll get emitted later.
26021 MachineFunction &MF = DAG.getMachineFunction();
26022 SDValue Op1 = Op.getOperand(1);
26023 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26024 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26025 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26027 // Generate a simple absolute symbol reference. This intrinsic is only
26028 // supported on 32-bit Windows, which isn't PIC.
26029 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26030 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26033 case Intrinsic::eh_recoverfp: {
26034 SDValue FnOp = Op.getOperand(1);
26035 SDValue IncomingFPOp = Op.getOperand(2);
26036 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26037 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26038 if (!Fn)
26039 report_fatal_error(
26040 "llvm.eh.recoverfp must take a function as the first argument");
26041 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26044 case Intrinsic::localaddress: {
26045 // Returns one of the stack, base, or frame pointer registers, depending on
26046 // which is used to reference local variables.
26047 MachineFunction &MF = DAG.getMachineFunction();
26048 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26049 unsigned Reg;
26050 if (RegInfo->hasBasePointer(MF))
26051 Reg = RegInfo->getBaseRegister();
26052 else { // Handles the SP or FP case.
26053 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26054 if (CantUseFP)
26055 Reg = RegInfo->getPtrSizedStackRegister(MF);
26056 else
26057 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26059 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26061 case Intrinsic::x86_avx512_vp2intersect_q_512:
26062 case Intrinsic::x86_avx512_vp2intersect_q_256:
26063 case Intrinsic::x86_avx512_vp2intersect_q_128:
26064 case Intrinsic::x86_avx512_vp2intersect_d_512:
26065 case Intrinsic::x86_avx512_vp2intersect_d_256:
26066 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26067 MVT MaskVT = Op.getSimpleValueType();
26069 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26070 SDLoc DL(Op);
26072 SDValue Operation =
26073 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26074 Op->getOperand(1), Op->getOperand(2));
26076 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26077 MaskVT, Operation);
26078 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26079 MaskVT, Operation);
26080 return DAG.getMergeValues({Result0, Result1}, DL);
26082 case Intrinsic::x86_mmx_pslli_w:
26083 case Intrinsic::x86_mmx_pslli_d:
26084 case Intrinsic::x86_mmx_pslli_q:
26085 case Intrinsic::x86_mmx_psrli_w:
26086 case Intrinsic::x86_mmx_psrli_d:
26087 case Intrinsic::x86_mmx_psrli_q:
26088 case Intrinsic::x86_mmx_psrai_w:
26089 case Intrinsic::x86_mmx_psrai_d: {
26090 SDLoc DL(Op);
26091 SDValue ShAmt = Op.getOperand(2);
26092 // If the argument is a constant, convert it to a target constant.
26093 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26094 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
26095 // to 8 bits, which could bring them back into bounds.
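// (For example, a shift amount of 300 would otherwise become 300 & 255 = 44,
// an in-range shift; clamping to 255 keeps it out of range so the result is
// still fully shifted out.)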
26096 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26097 if (ShiftAmount == 0)
26098 return Op.getOperand(1);
26100 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26101 Op.getOperand(0), Op.getOperand(1),
26102 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26105 unsigned NewIntrinsic;
26106 switch (IntNo) {
26107 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26108 case Intrinsic::x86_mmx_pslli_w:
26109 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26110 break;
26111 case Intrinsic::x86_mmx_pslli_d:
26112 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26113 break;
26114 case Intrinsic::x86_mmx_pslli_q:
26115 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26116 break;
26117 case Intrinsic::x86_mmx_psrli_w:
26118 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26119 break;
26120 case Intrinsic::x86_mmx_psrli_d:
26121 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26122 break;
26123 case Intrinsic::x86_mmx_psrli_q:
26124 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26125 break;
26126 case Intrinsic::x86_mmx_psrai_w:
26127 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26128 break;
26129 case Intrinsic::x86_mmx_psrai_d:
26130 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26131 break;
26134 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
26135 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26136 // MMX register.
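// (Roughly: the i32 amount is moved into the low half of an MMX register via
// MMX_MOVW2D, and the pslli/psrli/psrai form is rewritten to the
// corresponding psll/psrl/psra intrinsic that takes its count in an MMX
// register.)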
26137 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26139 DAG.getTargetConstant(NewIntrinsic, DL,
26140 getPointerTy(DAG.getDataLayout())),
26141 Op.getOperand(1), ShAmt);
26143 case Intrinsic::thread_pointer: {
26144 if (Subtarget.isTargetELF()) {
26145 SDLoc dl(Op);
26146 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26147 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
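// (Illustrative: this builds a load of address 0 in the FS/GS address space,
// which prints as e.g. "movq %fs:0, %rax" on 64-bit targets; the segment is
// conveyed by the address-space-qualified null pointer below.)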
26148 Value *Ptr = Constant::getNullValue(PointerType::get(
26149 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26150 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26151 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
26153 report_fatal_error(
26154 "Target OS doesn't support __builtin_thread_pointer() yet.");
26159 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26160 SDValue Src, SDValue Mask, SDValue Base,
26161 SDValue Index, SDValue ScaleOp, SDValue Chain,
26162 const X86Subtarget &Subtarget) {
26163 SDLoc dl(Op);
26164 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26165 // Scale must be constant.
26166 if (!C)
26167 return SDValue();
26168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26169 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26170 TLI.getPointerTy(DAG.getDataLayout()));
26171 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26172 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26173 // If source is undef or we know it won't be used, use a zero vector
26174 // to break register dependency.
26175 // TODO: use undef instead and let BreakFalseDeps deal with it?
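// (A zeroed source avoids reading whatever value last occupied the
// destination register, which would otherwise make the gather wait on an
// unrelated earlier producer of that register.)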
26176 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26177 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26179 // Cast mask to an integer type.
26180 Mask = DAG.getBitcast(MaskVT, Mask);
26182 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26184 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26185 SDValue Res =
26186 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26187 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26188 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26191 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26192 SDValue Src, SDValue Mask, SDValue Base,
26193 SDValue Index, SDValue ScaleOp, SDValue Chain,
26194 const X86Subtarget &Subtarget) {
26195 MVT VT = Op.getSimpleValueType();
26196 SDLoc dl(Op);
26197 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26198 // Scale must be constant.
26199 if (!C)
26200 return SDValue();
26201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26202 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26203 TLI.getPointerTy(DAG.getDataLayout()));
26204 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26205 VT.getVectorNumElements());
26206 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26208 // We support two versions of the gather intrinsics: one with a scalar mask
26209 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26210 if (Mask.getValueType() != MaskVT)
26211 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26213 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26214 // If source is undef or we know it won't be used, use a zero vector
26215 // to break register dependency.
26216 // TODO: use undef instead and let BreakFalseDeps deal with it?
26217 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26218 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26220 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26222 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26223 SDValue Res =
26224 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26225 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26226 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26229 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26230 SDValue Src, SDValue Mask, SDValue Base,
26231 SDValue Index, SDValue ScaleOp, SDValue Chain,
26232 const X86Subtarget &Subtarget) {
26233 SDLoc dl(Op);
26234 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26235 // Scale must be constant.
26236 if (!C)
26237 return SDValue();
26238 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26239 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26240 TLI.getPointerTy(DAG.getDataLayout()));
26241 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26242 Src.getSimpleValueType().getVectorNumElements());
26243 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26245 // We support two versions of the scatter intrinsics: one with a scalar mask
26246 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26247 if (Mask.getValueType() != MaskVT)
26248 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26250 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26252 SDVTList VTs = DAG.getVTList(MVT::Other);
26253 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26254 SDValue Res =
26255 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26256 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26257 return Res;
26260 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26261 SDValue Mask, SDValue Base, SDValue Index,
26262 SDValue ScaleOp, SDValue Chain,
26263 const X86Subtarget &Subtarget) {
26264 SDLoc dl(Op);
26265 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26266 // Scale must be constant.
26267 if (!C)
26268 return SDValue();
26269 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26270 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26271 TLI.getPointerTy(DAG.getDataLayout()));
26272 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26273 SDValue Segment = DAG.getRegister(0, MVT::i32);
26274 MVT MaskVT =
26275 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26278 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26279 return SDValue(Res, 0);
26282 /// Handles the lowering of builtin intrinsics with a chain that return their
26283 /// value into registers EDX:EAX.
26284 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26285 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26286 /// TargetOpcode.
26287 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
26288 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26289 /// EDX:EAX).
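/// For example, RDTSC is expanded through this helper with SrcReg == 0,
/// while RDPMC/RDPRU/XGETBV pass X86::ECX so that the selector operand is
/// copied into ECX before the target opcode executes (see the callers
/// below).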
26290 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26291 SelectionDAG &DAG,
26292 unsigned TargetOpcode,
26293 unsigned SrcReg,
26294 const X86Subtarget &Subtarget,
26295 SmallVectorImpl<SDValue> &Results) {
26296 SDValue Chain = N->getOperand(0);
26297 SDValue Glue;
26299 if (SrcReg) {
26300 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26301 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26302 Glue = Chain.getValue(1);
26305 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26306 SDValue N1Ops[] = {Chain, Glue};
26307 SDNode *N1 = DAG.getMachineNode(
26308 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26309 Chain = SDValue(N1, 0);
26311 // The expanded instruction returns its result in registers EDX:EAX; read it out.
26312 SDValue LO, HI;
26313 if (Subtarget.is64Bit()) {
26314 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26315 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26316 LO.getValue(2));
26317 } else {
26318 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26319 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26320 LO.getValue(2));
26322 Chain = HI.getValue(1);
26323 Glue = HI.getValue(2);
26325 if (Subtarget.is64Bit()) {
26326 // Merge the two 32-bit values into a 64-bit one.
26327 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26328 DAG.getConstant(32, DL, MVT::i8));
26329 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26330 Results.push_back(Chain);
26331 return Glue;
26334 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26335 SDValue Ops[] = { LO, HI };
26336 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26337 Results.push_back(Pair);
26338 Results.push_back(Chain);
26339 return Glue;
26342 /// Handles the lowering of builtin intrinsics that read the time stamp counter
26343 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26344 /// READCYCLECOUNTER nodes.
26345 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26346 SelectionDAG &DAG,
26347 const X86Subtarget &Subtarget,
26348 SmallVectorImpl<SDValue> &Results) {
26349 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26350 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26351 // and the EAX register is loaded with the low-order 32 bits.
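// (The helper then reassembles the 64-bit value: on 64-bit targets as
// (RDX << 32) | RAX, on 32-bit targets as a BUILD_PAIR of EAX and EDX.)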
26352 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26353 /* NoRegister */0, Subtarget,
26354 Results);
26355 if (Opcode != X86::RDTSCP)
26356 return;
26358 SDValue Chain = Results[1];
26359 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26360 // the ECX register. Add 'ecx' explicitly to the chain.
26361 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26362 Results[1] = ecx;
26363 Results.push_back(ecx.getValue(1));
26366 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26367 SelectionDAG &DAG) {
26368 SmallVector<SDValue, 3> Results;
26369 SDLoc DL(Op);
26370 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26371 Results);
26372 return DAG.getMergeValues(Results, DL);
26375 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26376 MachineFunction &MF = DAG.getMachineFunction();
26377 SDValue Chain = Op.getOperand(0);
26378 SDValue RegNode = Op.getOperand(2);
26379 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26380 if (!EHInfo)
26381 report_fatal_error("EH registrations only live in functions using WinEH");
26383 // Cast the operand to an alloca, and remember the frame index.
26384 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26385 if (!FINode)
26386 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26387 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26389 // Return the chain operand without making any DAG nodes.
26390 return Chain;
26393 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26394 MachineFunction &MF = DAG.getMachineFunction();
26395 SDValue Chain = Op.getOperand(0);
26396 SDValue EHGuard = Op.getOperand(2);
26397 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26398 if (!EHInfo)
26399 report_fatal_error("EHGuard only lives in functions using WinEH");
26401 // Cast the operand to an alloca, and remember the frame index.
26402 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26403 if (!FINode)
26404 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26405 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26407 // Return the chain operand without making any DAG nodes.
26408 return Chain;
26411 /// Emit Truncating Store with signed or unsigned saturation.
26412 static SDValue
26413 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26414 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26415 SelectionDAG &DAG) {
26416 SDVTList VTs = DAG.getVTList(MVT::Other);
26417 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26418 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26419 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26420 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26423 /// Emit Masked Truncating Store with signed or unsigned saturation.
26424 static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26425 const SDLoc &DL,
26426 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26427 MachineMemOperand *MMO, SelectionDAG &DAG) {
26428 SDVTList VTs = DAG.getVTList(MVT::Other);
26429 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26430 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26431 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26434 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26435 SelectionDAG &DAG) {
26436 unsigned IntNo = Op.getConstantOperandVal(1);
26437 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26438 if (!IntrData) {
26439 switch (IntNo) {
26441 case Intrinsic::swift_async_context_addr: {
26442 SDLoc dl(Op);
26443 auto &MF = DAG.getMachineFunction();
26444 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26445 if (Subtarget.is64Bit()) {
26446 MF.getFrameInfo().setFrameAddressIsTaken(true);
26447 X86FI->setHasSwiftAsyncContext(true);
26448 SDValue Chain = Op->getOperand(0);
26449 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26450 SDValue Result =
26451 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26452 DAG.getTargetConstant(8, dl, MVT::i32)),
26453 0);
26454 // Return { result, chain }.
26455 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26456 CopyRBP.getValue(1));
26457 } else {
26458 // 32-bit, so there is no special extended frame; create or reuse an
26459 // existing stack slot.
26460 if (!X86FI->getSwiftAsyncContextFrameIdx())
26461 X86FI->setSwiftAsyncContextFrameIdx(
26462 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26463 SDValue Result =
26464 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26465 // Return { result, chain }.
26466 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26467 Op->getOperand(0));
26471 case llvm::Intrinsic::x86_seh_ehregnode:
26472 return MarkEHRegistrationNode(Op, DAG);
26473 case llvm::Intrinsic::x86_seh_ehguard:
26474 return MarkEHGuard(Op, DAG);
26475 case llvm::Intrinsic::x86_rdpkru: {
26476 SDLoc dl(Op);
26477 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26478 // Create a RDPKRU node and pass 0 to the ECX parameter.
26479 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26480 DAG.getConstant(0, dl, MVT::i32));
26482 case llvm::Intrinsic::x86_wrpkru: {
26483 SDLoc dl(Op);
26484 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26485 // to the EDX and ECX parameters.
26486 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26487 Op.getOperand(0), Op.getOperand(2),
26488 DAG.getConstant(0, dl, MVT::i32),
26489 DAG.getConstant(0, dl, MVT::i32));
26491 case llvm::Intrinsic::asan_check_memaccess: {
26492 // Mark this as adjustsStack because it will be lowered to a call.
26493 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
26494 // Don't do anything here; we will expand these intrinsics out later.
26495 return Op;
26497 case llvm::Intrinsic::x86_flags_read_u32:
26498 case llvm::Intrinsic::x86_flags_read_u64:
26499 case llvm::Intrinsic::x86_flags_write_u32:
26500 case llvm::Intrinsic::x86_flags_write_u64: {
26501 // We need a frame pointer because this will get lowered to a PUSH/POP
26502 // sequence.
26503 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26504 MFI.setHasCopyImplyingStackAdjustment(true);
26505 // Don't do anything here; we will expand these intrinsics out later
26506 // during FinalizeISel in EmitInstrWithCustomInserter.
26507 return Op;
26509 case Intrinsic::x86_lwpins32:
26510 case Intrinsic::x86_lwpins64:
26511 case Intrinsic::x86_umwait:
26512 case Intrinsic::x86_tpause: {
26513 SDLoc dl(Op);
26514 SDValue Chain = Op->getOperand(0);
26515 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26516 unsigned Opcode;
26518 switch (IntNo) {
26519 default: llvm_unreachable("Impossible intrinsic");
26520 case Intrinsic::x86_umwait:
26521 Opcode = X86ISD::UMWAIT;
26522 break;
26523 case Intrinsic::x86_tpause:
26524 Opcode = X86ISD::TPAUSE;
26525 break;
26526 case Intrinsic::x86_lwpins32:
26527 case Intrinsic::x86_lwpins64:
26528 Opcode = X86ISD::LWPINS;
26529 break;
26532 SDValue Operation =
26533 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26534 Op->getOperand(3), Op->getOperand(4));
26535 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26536 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537 Operation.getValue(1));
26539 case Intrinsic::x86_enqcmd:
26540 case Intrinsic::x86_enqcmds: {
26541 SDLoc dl(Op);
26542 SDValue Chain = Op.getOperand(0);
26543 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26544 unsigned Opcode;
26545 switch (IntNo) {
26546 default: llvm_unreachable("Impossible intrinsic!");
26547 case Intrinsic::x86_enqcmd:
26548 Opcode = X86ISD::ENQCMD;
26549 break;
26550 case Intrinsic::x86_enqcmds:
26551 Opcode = X86ISD::ENQCMDS;
26552 break;
26554 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26555 Op.getOperand(3));
26556 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26557 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26558 Operation.getValue(1));
26560 case Intrinsic::x86_aesenc128kl:
26561 case Intrinsic::x86_aesdec128kl:
26562 case Intrinsic::x86_aesenc256kl:
26563 case Intrinsic::x86_aesdec256kl: {
26564 SDLoc DL(Op);
26565 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26566 SDValue Chain = Op.getOperand(0);
26567 unsigned Opcode;
26569 switch (IntNo) {
26570 default: llvm_unreachable("Impossible intrinsic");
26571 case Intrinsic::x86_aesenc128kl:
26572 Opcode = X86ISD::AESENC128KL;
26573 break;
26574 case Intrinsic::x86_aesdec128kl:
26575 Opcode = X86ISD::AESDEC128KL;
26576 break;
26577 case Intrinsic::x86_aesenc256kl:
26578 Opcode = X86ISD::AESENC256KL;
26579 break;
26580 case Intrinsic::x86_aesdec256kl:
26581 Opcode = X86ISD::AESDEC256KL;
26582 break;
26585 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26586 MachineMemOperand *MMO = MemIntr->getMemOperand();
26587 EVT MemVT = MemIntr->getMemoryVT();
26588 SDValue Operation = DAG.getMemIntrinsicNode(
26589 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26590 MMO);
26591 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26593 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26594 {ZF, Operation.getValue(0), Operation.getValue(2)});
26596 case Intrinsic::x86_aesencwide128kl:
26597 case Intrinsic::x86_aesdecwide128kl:
26598 case Intrinsic::x86_aesencwide256kl:
26599 case Intrinsic::x86_aesdecwide256kl: {
26600 SDLoc DL(Op);
26601 SDVTList VTs = DAG.getVTList(
26602 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26603 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26604 SDValue Chain = Op.getOperand(0);
26605 unsigned Opcode;
26607 switch (IntNo) {
26608 default: llvm_unreachable("Impossible intrinsic");
26609 case Intrinsic::x86_aesencwide128kl:
26610 Opcode = X86ISD::AESENCWIDE128KL;
26611 break;
26612 case Intrinsic::x86_aesdecwide128kl:
26613 Opcode = X86ISD::AESDECWIDE128KL;
26614 break;
26615 case Intrinsic::x86_aesencwide256kl:
26616 Opcode = X86ISD::AESENCWIDE256KL;
26617 break;
26618 case Intrinsic::x86_aesdecwide256kl:
26619 Opcode = X86ISD::AESDECWIDE256KL;
26620 break;
26623 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26624 MachineMemOperand *MMO = MemIntr->getMemOperand();
26625 EVT MemVT = MemIntr->getMemoryVT();
26626 SDValue Operation = DAG.getMemIntrinsicNode(
26627 Opcode, DL, VTs,
26628 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26629 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26630 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26631 MemVT, MMO);
26632 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26634 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26635 {ZF, Operation.getValue(1), Operation.getValue(2),
26636 Operation.getValue(3), Operation.getValue(4),
26637 Operation.getValue(5), Operation.getValue(6),
26638 Operation.getValue(7), Operation.getValue(8),
26639 Operation.getValue(9)});
26641 case Intrinsic::x86_testui: {
26642 SDLoc dl(Op);
26643 SDValue Chain = Op.getOperand(0);
26644 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26645 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26646 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26647 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26648 Operation.getValue(1));
26650 case Intrinsic::x86_atomic_bts_rm:
26651 case Intrinsic::x86_atomic_btc_rm:
26652 case Intrinsic::x86_atomic_btr_rm: {
26653 SDLoc DL(Op);
26654 MVT VT = Op.getSimpleValueType();
26655 SDValue Chain = Op.getOperand(0);
26656 SDValue Op1 = Op.getOperand(2);
26657 SDValue Op2 = Op.getOperand(3);
26658 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26659 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26660 : X86ISD::LBTR_RM;
26661 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26662 SDValue Res =
26663 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26664 {Chain, Op1, Op2}, VT, MMO);
26665 Chain = Res.getValue(1);
26666 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26667 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26669 case Intrinsic::x86_atomic_bts:
26670 case Intrinsic::x86_atomic_btc:
26671 case Intrinsic::x86_atomic_btr: {
26672 SDLoc DL(Op);
26673 MVT VT = Op.getSimpleValueType();
26674 SDValue Chain = Op.getOperand(0);
26675 SDValue Op1 = Op.getOperand(2);
26676 SDValue Op2 = Op.getOperand(3);
26677 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26678 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26679 : X86ISD::LBTR;
26680 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26681 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26682 SDValue Res =
26683 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26684 {Chain, Op1, Op2, Size}, VT, MMO);
26685 Chain = Res.getValue(1);
26686 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26687 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
26688 if (Imm)
26689 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
26690 DAG.getShiftAmountConstant(Imm, VT, DL));
26691 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26693 case Intrinsic::x86_cmpccxadd32:
26694 case Intrinsic::x86_cmpccxadd64: {
26695 SDLoc DL(Op);
26696 SDValue Chain = Op.getOperand(0);
26697 SDValue Addr = Op.getOperand(2);
26698 SDValue Src1 = Op.getOperand(3);
26699 SDValue Src2 = Op.getOperand(4);
26700 SDValue CC = Op.getOperand(5);
26701 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26702 SDValue Operation = DAG.getMemIntrinsicNode(
26703 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
26704 MVT::i32, MMO);
26705 return Operation;
26707 case Intrinsic::x86_aadd32:
26708 case Intrinsic::x86_aadd64:
26709 case Intrinsic::x86_aand32:
26710 case Intrinsic::x86_aand64:
26711 case Intrinsic::x86_aor32:
26712 case Intrinsic::x86_aor64:
26713 case Intrinsic::x86_axor32:
26714 case Intrinsic::x86_axor64: {
26715 SDLoc DL(Op);
26716 SDValue Chain = Op.getOperand(0);
26717 SDValue Op1 = Op.getOperand(2);
26718 SDValue Op2 = Op.getOperand(3);
26719 MVT VT = Op2.getSimpleValueType();
26720 unsigned Opc = 0;
26721 switch (IntNo) {
26722 default:
26723 llvm_unreachable("Unknown Intrinsic");
26724 case Intrinsic::x86_aadd32:
26725 case Intrinsic::x86_aadd64:
26726 Opc = X86ISD::AADD;
26727 break;
26728 case Intrinsic::x86_aand32:
26729 case Intrinsic::x86_aand64:
26730 Opc = X86ISD::AAND;
26731 break;
26732 case Intrinsic::x86_aor32:
26733 case Intrinsic::x86_aor64:
26734 Opc = X86ISD::AOR;
26735 break;
26736 case Intrinsic::x86_axor32:
26737 case Intrinsic::x86_axor64:
26738 Opc = X86ISD::AXOR;
26739 break;
26741 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
26742 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
26743 {Chain, Op1, Op2}, VT, MMO);
26745 case Intrinsic::x86_atomic_add_cc:
26746 case Intrinsic::x86_atomic_sub_cc:
26747 case Intrinsic::x86_atomic_or_cc:
26748 case Intrinsic::x86_atomic_and_cc:
26749 case Intrinsic::x86_atomic_xor_cc: {
26750 SDLoc DL(Op);
26751 SDValue Chain = Op.getOperand(0);
26752 SDValue Op1 = Op.getOperand(2);
26753 SDValue Op2 = Op.getOperand(3);
26754 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
26755 MVT VT = Op2.getSimpleValueType();
26756 unsigned Opc = 0;
26757 switch (IntNo) {
26758 default:
26759 llvm_unreachable("Unknown Intrinsic");
26760 case Intrinsic::x86_atomic_add_cc:
26761 Opc = X86ISD::LADD;
26762 break;
26763 case Intrinsic::x86_atomic_sub_cc:
26764 Opc = X86ISD::LSUB;
26765 break;
26766 case Intrinsic::x86_atomic_or_cc:
26767 Opc = X86ISD::LOR;
26768 break;
26769 case Intrinsic::x86_atomic_and_cc:
26770 Opc = X86ISD::LAND;
26771 break;
26772 case Intrinsic::x86_atomic_xor_cc:
26773 Opc = X86ISD::LXOR;
26774 break;
26776 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26777 SDValue LockArith =
26778 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26779 {Chain, Op1, Op2}, VT, MMO);
26780 Chain = LockArith.getValue(1);
26781 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
26784 return SDValue();
26787 SDLoc dl(Op);
26788 switch(IntrData->Type) {
26789 default: llvm_unreachable("Unknown Intrinsic Type");
26790 case RDSEED:
26791 case RDRAND: {
26792 // Emit the node with the right value type.
26793 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26794 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26796 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26797 // Otherwise return the value from Rand, which is always 0, cast to i32.
26798 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26799 DAG.getConstant(1, dl, Op->getValueType(1)),
26800 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26801 SDValue(Result.getNode(), 1)};
26802 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26804 // Return { result, isValid, chain }.
26805 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26806 SDValue(Result.getNode(), 2));
26808 case GATHER_AVX2: {
26809 SDValue Chain = Op.getOperand(0);
26810 SDValue Src = Op.getOperand(2);
26811 SDValue Base = Op.getOperand(3);
26812 SDValue Index = Op.getOperand(4);
26813 SDValue Mask = Op.getOperand(5);
26814 SDValue Scale = Op.getOperand(6);
26815 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26816 Scale, Chain, Subtarget);
26818 case GATHER: {
26819 // gather(v1, mask, index, base, scale);
26820 SDValue Chain = Op.getOperand(0);
26821 SDValue Src = Op.getOperand(2);
26822 SDValue Base = Op.getOperand(3);
26823 SDValue Index = Op.getOperand(4);
26824 SDValue Mask = Op.getOperand(5);
26825 SDValue Scale = Op.getOperand(6);
26826 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26827 Chain, Subtarget);
26829 case SCATTER: {
26830 // scatter(base, mask, index, v1, scale);
26831 SDValue Chain = Op.getOperand(0);
26832 SDValue Base = Op.getOperand(2);
26833 SDValue Mask = Op.getOperand(3);
26834 SDValue Index = Op.getOperand(4);
26835 SDValue Src = Op.getOperand(5);
26836 SDValue Scale = Op.getOperand(6);
26837 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26838 Scale, Chain, Subtarget);
26840 case PREFETCH: {
26841 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26842 assert((HintVal == 2 || HintVal == 3) &&
26843 "Wrong prefetch hint in intrinsic: should be 2 or 3");
26844 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26845 SDValue Chain = Op.getOperand(0);
26846 SDValue Mask = Op.getOperand(2);
26847 SDValue Index = Op.getOperand(3);
26848 SDValue Base = Op.getOperand(4);
26849 SDValue Scale = Op.getOperand(5);
26850 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26851 Subtarget);
26853 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26854 case RDTSC: {
26855 SmallVector<SDValue, 2> Results;
26856 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26857 Results);
26858 return DAG.getMergeValues(Results, dl);
26860 // Read Performance Monitoring Counters.
26861 case RDPMC:
26862 // Read Processor Register.
26863 case RDPRU:
26864 // Get Extended Control Register.
26865 case XGETBV: {
26866 SmallVector<SDValue, 2> Results;
26868 // RDPMC uses ECX to select the index of the performance counter to read.
26869 // RDPRU uses ECX to select the processor register to read.
26870 // XGETBV uses ECX to select the index of the XCR register to return.
26871 // The result is stored into registers EDX:EAX.
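// (For example, llvm.x86.xgetbv with an ECX selector of 0 reads XCR0; the
// EDX:EAX halves are merged by expandIntrinsicWChainHelper above.)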
26872 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26873 Subtarget, Results);
26874 return DAG.getMergeValues(Results, dl);
26876 // XTEST intrinsics.
26877 case XTEST: {
26878 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26879 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26881 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26882 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26883 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26884 Ret, SDValue(InTrans.getNode(), 1));
26886 case TRUNCATE_TO_MEM_VI8:
26887 case TRUNCATE_TO_MEM_VI16:
26888 case TRUNCATE_TO_MEM_VI32: {
26889 SDValue Mask = Op.getOperand(4);
26890 SDValue DataToTruncate = Op.getOperand(3);
26891 SDValue Addr = Op.getOperand(2);
26892 SDValue Chain = Op.getOperand(0);
26894 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26895 assert(MemIntr && "Expected MemIntrinsicSDNode!");
26897 EVT MemVT = MemIntr->getMemoryVT();
26899 uint16_t TruncationOp = IntrData->Opc0;
26900 switch (TruncationOp) {
26901 case X86ISD::VTRUNC: {
26902 if (isAllOnesConstant(Mask)) // return just a truncate store
26903 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26904 MemIntr->getMemOperand());
26906 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26907 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26908 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26910 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26911 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26912 true /* truncating */);
26914 case X86ISD::VTRUNCUS:
26915 case X86ISD::VTRUNCS: {
26916 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26917 if (isAllOnesConstant(Mask))
26918 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26919 MemIntr->getMemOperand(), DAG);
26921 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26922 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26924 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26925 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26927 default:
26928 llvm_unreachable("Unsupported truncstore intrinsic");
26934 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26935 SelectionDAG &DAG) const {
26936 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26937 MFI.setReturnAddressIsTaken(true);
26939 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26940 return SDValue();
26942 unsigned Depth = Op.getConstantOperandVal(0);
26943 SDLoc dl(Op);
26944 EVT PtrVT = getPointerTy(DAG.getDataLayout());
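// (Sketch of the code below: for Depth > 0 we chase the frame-pointer chain
// via FRAMEADDR and load the return address one slot above the recovered
// frame pointer; Depth == 0 reads straight from the return-address frame
// index.)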
26946 if (Depth > 0) {
26947 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26948 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26949 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26950 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26951 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26952 MachinePointerInfo());
26955 // Just load the return address.
26956 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26957 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26958 MachinePointerInfo());
26961 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26962 SelectionDAG &DAG) const {
26963 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26964 return getReturnAddressFrameIndex(DAG);
26967 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26968 MachineFunction &MF = DAG.getMachineFunction();
26969 MachineFrameInfo &MFI = MF.getFrameInfo();
26970 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26971 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26972 EVT VT = Op.getValueType();
26974 MFI.setFrameAddressIsTaken(true);
26976 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26977 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
26978 // is not possible to crawl up the stack without looking at the unwind codes
26979 // simultaneously.
26980 int FrameAddrIndex = FuncInfo->getFAIndex();
26981 if (!FrameAddrIndex) {
26982 // Set up a frame object for the return address.
26983 unsigned SlotSize = RegInfo->getSlotSize();
26984 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26985 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26986 FuncInfo->setFAIndex(FrameAddrIndex);
26988 return DAG.getFrameIndex(FrameAddrIndex, VT);
26991 unsigned FrameReg =
26992 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26993 SDLoc dl(Op); // FIXME probably not meaningful
26994 unsigned Depth = Op.getConstantOperandVal(0);
26995 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26996 (FrameReg == X86::EBP && VT == MVT::i32)) &&
26997 "Invalid Frame Register!");
26998 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26999 while (Depth--)
27000 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27001 MachinePointerInfo());
27002 return FrameAddr;
27005 // FIXME? Maybe this could be a TableGen attribute on some registers and
27006 // this table could be generated automatically from RegInfo.
27007 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27008 const MachineFunction &MF) const {
27009 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27011 Register Reg = StringSwitch<unsigned>(RegName)
27012 .Case("esp", X86::ESP)
27013 .Case("rsp", X86::RSP)
27014 .Case("ebp", X86::EBP)
27015 .Case("rbp", X86::RBP)
27016 .Default(0);
27018 if (Reg == X86::EBP || Reg == X86::RBP) {
27019 if (!TFI.hasFP(MF))
27020 report_fatal_error("register " + StringRef(RegName) +
27021 " is allocatable: function has no frame pointer");
27022 #ifndef NDEBUG
27023 else {
27024 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27025 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27026 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27027 "Invalid Frame Register!");
27029 #endif
27032 if (Reg)
27033 return Reg;
27035 report_fatal_error("Invalid register name global variable");
27038 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27039 SelectionDAG &DAG) const {
27040 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27041 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27044 Register X86TargetLowering::getExceptionPointerRegister(
27045 const Constant *PersonalityFn) const {
27046 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27047 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27049 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27052 Register X86TargetLowering::getExceptionSelectorRegister(
27053 const Constant *PersonalityFn) const {
27054 // Funclet personalities don't use selectors (the runtime does the selection).
27055 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27056 return X86::NoRegister;
27057 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27060 bool X86TargetLowering::needsFixedCatchObjects() const {
27061 return Subtarget.isTargetWin64();
27064 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27065 SDValue Chain = Op.getOperand(0);
27066 SDValue Offset = Op.getOperand(1);
27067 SDValue Handler = Op.getOperand(2);
27068 SDLoc dl (Op);
27070 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27071 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27072 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27073 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27074 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27075 "Invalid Frame Register!");
27076 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27077 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27079 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27080 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27081 dl));
27082 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27083 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27084 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27086 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27087 DAG.getRegister(StoreAddrReg, PtrVT));
27090 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27091 SelectionDAG &DAG) const {
27092 SDLoc DL(Op);
27093 // If the subtarget is not 64-bit, we may need the global base register
27094 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
27095 // Therefore, ask for the GlobalBaseReg now, so that the pass
27096 // inserts the code for us in case we need it.
27097 // Otherwise, we would end up referencing a virtual register
27098 // that is never defined.
27099 if (!Subtarget.is64Bit()) {
27100 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27101 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27103 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27104 DAG.getVTList(MVT::i32, MVT::Other),
27105 Op.getOperand(0), Op.getOperand(1));
27108 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27109 SelectionDAG &DAG) const {
27110 SDLoc DL(Op);
27111 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27112 Op.getOperand(0), Op.getOperand(1));
27115 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27116 SelectionDAG &DAG) const {
27117 SDLoc DL(Op);
27118 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27119 Op.getOperand(0));
27122 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27123 return Op.getOperand(0);
27126 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27127 SelectionDAG &DAG) const {
27128 SDValue Root = Op.getOperand(0);
27129 SDValue Trmp = Op.getOperand(1); // trampoline
27130 SDValue FPtr = Op.getOperand(2); // nested function
27131 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27132 SDLoc dl (Op);
27134 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27135 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27137 if (Subtarget.is64Bit()) {
27138 SDValue OutChains[6];
27140 // Large code-model.
27141 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27142 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27144 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27145 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27147 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
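// The stores below assemble the following byte sequence in the trampoline
// (offsets in bytes):
//    0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
//   20: 49 FF E3             jmpq   *%r11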
27149 // Load the pointer to the nested function into R11.
27150 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27151 SDValue Addr = Trmp;
27152 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27153 Addr, MachinePointerInfo(TrmpAddr));
27155 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27156 DAG.getConstant(2, dl, MVT::i64));
27157 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27158 MachinePointerInfo(TrmpAddr, 2), Align(2));
27160 // Load the 'nest' parameter value into R10.
27161 // R10 is specified in X86CallingConv.td
27162 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27163 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27164 DAG.getConstant(10, dl, MVT::i64));
27165 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27166 Addr, MachinePointerInfo(TrmpAddr, 10));
27168 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27169 DAG.getConstant(12, dl, MVT::i64));
27170 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27171 MachinePointerInfo(TrmpAddr, 12), Align(2));
27173 // Jump to the nested function.
27174 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27175 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27176 DAG.getConstant(20, dl, MVT::i64));
27177 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27178 Addr, MachinePointerInfo(TrmpAddr, 20));
27180 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27181 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27182 DAG.getConstant(22, dl, MVT::i64));
27183 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27184 Addr, MachinePointerInfo(TrmpAddr, 22));
27186 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27187 } else {
27188 const Function *Func =
27189 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27190 CallingConv::ID CC = Func->getCallingConv();
27191 unsigned NestReg;
27193 switch (CC) {
27194 default:
27195 llvm_unreachable("Unsupported calling convention");
27196 case CallingConv::C:
27197 case CallingConv::X86_StdCall: {
27198 // Pass 'nest' parameter in ECX.
27199 // Must be kept in sync with X86CallingConv.td
27200 NestReg = X86::ECX;
27202 // Check that ECX wasn't needed by an 'inreg' parameter.
27203 FunctionType *FTy = Func->getFunctionType();
27204 const AttributeList &Attrs = Func->getAttributes();
27206 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27207 unsigned InRegCount = 0;
27208 unsigned Idx = 0;
27210 for (FunctionType::param_iterator I = FTy->param_begin(),
27211 E = FTy->param_end(); I != E; ++I, ++Idx)
27212 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27213 const DataLayout &DL = DAG.getDataLayout();
27214 // FIXME: should only count parameters that are lowered to integers.
27215 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27218 if (InRegCount > 2) {
27219 report_fatal_error("Nest register in use - reduce number of inreg"
27220 " parameters!");
27223 break;
27225 case CallingConv::X86_FastCall:
27226 case CallingConv::X86_ThisCall:
27227 case CallingConv::Fast:
27228 case CallingConv::Tail:
27229 case CallingConv::SwiftTail:
27230 // Pass 'nest' parameter in EAX.
27231 // Must be kept in sync with X86CallingConv.td
27232 NestReg = X86::EAX;
27233 break;
27236 SDValue OutChains[4];
27237 SDValue Addr, Disp;
27239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27240 DAG.getConstant(10, dl, MVT::i32));
27241 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
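// The resulting 10-byte trampoline is:
//   0: B8+r <Nest:imm32>   movl $Nest, %ecx or %eax
//   5: E9   <Disp:imm32>   jmp  FPtr  (rel32 from the end of the jmp at Trmp+10)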
27243 // This is storing the opcode for MOV32ri.
27244 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27245 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27246 OutChains[0] =
27247 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27248 Trmp, MachinePointerInfo(TrmpAddr));
27250 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27251 DAG.getConstant(1, dl, MVT::i32));
27252 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27253 MachinePointerInfo(TrmpAddr, 1), Align(1));
27255 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27256 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27257 DAG.getConstant(5, dl, MVT::i32));
27258 OutChains[2] =
27259 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27260 MachinePointerInfo(TrmpAddr, 5), Align(1));
27262 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27263 DAG.getConstant(6, dl, MVT::i32));
27264 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27265 MachinePointerInfo(TrmpAddr, 6), Align(1));
27267 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27271 SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27272 SelectionDAG &DAG) const {
27273 /*
27274 The rounding mode is in bits 11:10 of FPSR, and has the following
27275 settings:
27276 00 Round to nearest
27277 01 Round to -inf
27278 10 Round to +inf
27279 11 Round to 0
27281 GET_ROUNDING, on the other hand, expects the following:
27282 -1 Undefined
27283 0 Round to 0
27284 1 Round to nearest
27285 2 Round to +inf
27286 3 Round to -inf
27288 To perform the conversion, we use a packed lookup table of the four 2-bit
27289 values that we can index by FPSR[11:10]
27290 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27292 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27293 */
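// For example, FPSR[11:10] == 0b01 (round toward -inf) gives
//   (0x2d >> ((0x0400 & 0xc00) >> 9)) & 3 == (0x2d >> 2) & 3 == 3,
// which is GET_ROUNDING's encoding for "round to -inf".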
27295 MachineFunction &MF = DAG.getMachineFunction();
27296 MVT VT = Op.getSimpleValueType();
27297 SDLoc DL(Op);
27299 // Save FP Control Word to stack slot
27300 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27301 SDValue StackSlot =
27302 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27304 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27306 SDValue Chain = Op.getOperand(0);
27307 SDValue Ops[] = {Chain, StackSlot};
27308 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27309 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27310 Align(2), MachineMemOperand::MOStore);
27312 // Load FP Control Word from stack slot
27313 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27314 Chain = CWD.getValue(1);
27316 // Mask and turn the control bits into a shift for the lookup table.
27317 SDValue Shift =
27318 DAG.getNode(ISD::SRL, DL, MVT::i16,
27319 DAG.getNode(ISD::AND, DL, MVT::i16,
27320 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27321 DAG.getConstant(9, DL, MVT::i8));
27322 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27324 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27325 SDValue RetVal =
27326 DAG.getNode(ISD::AND, DL, MVT::i32,
27327 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27328 DAG.getConstant(3, DL, MVT::i32));
27330 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27332 return DAG.getMergeValues({RetVal, Chain}, DL);
27335 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27336 SelectionDAG &DAG) const {
27337 MachineFunction &MF = DAG.getMachineFunction();
27338 SDLoc DL(Op);
27339 SDValue Chain = Op.getNode()->getOperand(0);
27341 // FP control word may be set only from data in memory. So we need to allocate
27342 // stack space to save/load FP control word.
27343 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27344 SDValue StackSlot =
27345 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27346 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27347 MachineMemOperand *MMO =
27348 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27350 // Store FP control word into memory.
27351 SDValue Ops[] = {Chain, StackSlot};
27352 Chain = DAG.getMemIntrinsicNode(
27353 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27355 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27356 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27357 Chain = CWD.getValue(1);
27358 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27359 DAG.getConstant(0xf3ff, DL, MVT::i16));
27361 // Calculate new rounding mode.
27362 SDValue NewRM = Op.getNode()->getOperand(1);
27363 SDValue RMBits;
27364 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27365 uint64_t RM = CVal->getZExtValue();
27366 int FieldVal;
27367 switch (static_cast<RoundingMode>(RM)) {
27368 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27369 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27370 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27371 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27372 default:
27373 llvm_unreachable("rounding mode is not supported by X86 hardware");
27375 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27376 } else {
27377 // Need to convert argument into bits of control word:
27378 // 0 Round to 0 -> 11
27379 // 1 Round to nearest -> 00
27380 // 2 Round to +inf -> 10
27381 // 3 Round to -inf -> 01
27382 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27383 // To make the conversion, put all these values into a value 0xc9 and shift
27384 // it left depending on the rounding mode:
27385 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27386 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27387 // ...
27388 // (0xc9 << (2 * NewRM + 4)) & 0xc00
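// For example, NewRM == 2 (round to +inf) gives a shift of 2 * 2 + 4 = 8 and
// (0xc9 << 8) & 0xc00 == 0x800, i.e. rounding-control bits 11:10 == 10b
// (X86::rmUpward).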
27389 SDValue ShiftValue =
27390 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27391 DAG.getNode(ISD::ADD, DL, MVT::i32,
27392 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27393 DAG.getConstant(1, DL, MVT::i8)),
27394 DAG.getConstant(4, DL, MVT::i32)));
27395 SDValue Shifted =
27396 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27397 ShiftValue);
27398 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27399 DAG.getConstant(0xc00, DL, MVT::i16));
27402 // Update rounding mode bits and store the new FP Control Word into stack.
27403 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27404 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27406 // Load FP control word from the slot.
27407 SDValue OpsLD[] = {Chain, StackSlot};
27408 MachineMemOperand *MMOL =
27409 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27410 Chain = DAG.getMemIntrinsicNode(
27411 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27413 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27414 // same way but in bits 14:13.
27415 if (Subtarget.hasSSE1()) {
27416 // Store MXCSR into memory.
27417 Chain = DAG.getNode(
27418 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27419 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27420 StackSlot);
27422 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27423 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27424 Chain = CWD.getValue(1);
27425 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27426 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27428 // Shift X87 RM bits from 11:10 to 14:13.
27429 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27430 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27431 DAG.getConstant(3, DL, MVT::i8));
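// E.g. X86::rmUpward (0x800) becomes 0x4000, which sets MXCSR RC bits 14:13
// to 10b (round toward +inf).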
27433 // Update rounding mode bits and store the new FP Control Word into stack.
27434 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27435 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27437 // Load MXCSR from the slot.
27438 Chain = DAG.getNode(
27439 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27440 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27441 StackSlot);
27444 return Chain;
27447 const unsigned X87StateSize = 28;
27448 const unsigned FPStateSize = 32;
27449 [[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
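// FNSTENV/FLDENV operate on a 28-byte x87 environment image; the 32-byte FP
// environment used here appends MXCSR right after it, at offset X87StateSize.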
27451 SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27452 SelectionDAG &DAG) const {
27453 MachineFunction &MF = DAG.getMachineFunction();
27454 SDLoc DL(Op);
27455 SDValue Chain = Op->getOperand(0);
27456 SDValue Ptr = Op->getOperand(1);
27457 auto *Node = cast<FPStateAccessSDNode>(Op);
27458 EVT MemVT = Node->getMemoryVT();
27459 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27460 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27462 // Get x87 state, if it is present.
27463 if (Subtarget.hasX87()) {
27464 Chain =
27465 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27466 {Chain, Ptr}, MemVT, MMO);
27468 // FNSTENV changes the exception mask, so load back the stored environment.
27469 MachineMemOperand::Flags NewFlags =
27470 MachineMemOperand::MOLoad |
27471 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27472 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27473 Chain =
27474 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27475 {Chain, Ptr}, MemVT, MMO);
27478 // If target supports SSE, get MXCSR as well.
27479 if (Subtarget.hasSSE1()) {
27480 // Get pointer to the MXCSR location in memory.
27481 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27482 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27483 DAG.getConstant(X87StateSize, DL, PtrVT));
27484 // Store MXCSR into memory.
27485 Chain = DAG.getNode(
27486 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27487 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27488 MXCSRAddr);
27491 return Chain;
27494 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
27495 EVT MemVT, MachineMemOperand *MMO,
27496 SelectionDAG &DAG,
27497 const X86Subtarget &Subtarget) {
27498 // Set x87 state, if it is present.
27499 if (Subtarget.hasX87())
27500 Chain =
27501 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27502 {Chain, Ptr}, MemVT, MMO);
27503 // If target supports SSE, set MXCSR as well.
27504 if (Subtarget.hasSSE1()) {
27505 // Get pointer to the MXCSR location in memory.
27506 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27507 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27508 DAG.getConstant(X87StateSize, DL, PtrVT));
27509 // Load MXCSR from memory.
27510 Chain = DAG.getNode(
27511 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27512 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27513 MXCSRAddr);
27515 return Chain;
27518 SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27519 SelectionDAG &DAG) const {
27520 SDLoc DL(Op);
27521 SDValue Chain = Op->getOperand(0);
27522 SDValue Ptr = Op->getOperand(1);
27523 auto *Node = cast<FPStateAccessSDNode>(Op);
27524 EVT MemVT = Node->getMemoryVT();
27525 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27526 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27527 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27530 SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27531 SelectionDAG &DAG) const {
27532 MachineFunction &MF = DAG.getMachineFunction();
27533 SDLoc DL(Op);
27534 SDValue Chain = Op.getNode()->getOperand(0);
27536 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27537 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27538 SmallVector<Constant *, 8> FPEnvVals;
27540 // x87 FPU Control Word: mask all floating-point exceptions and set rounding to
27541 // nearest. FPU precision is set to 53 bits on Windows and to 64 bits otherwise
27542 // for compatibility with glibc.
27543 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27544 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27545 Constant *Zero = ConstantInt::get(ItemTy, 0);
27546 for (unsigned I = 0; I < 6; ++I)
27547 FPEnvVals.push_back(Zero);
27549 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
27550 // all exception flags, and set DAZ and FTZ to 0.
27551 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
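// The constant-pool image is 8 x i32: { X87CW, 0, 0, 0, 0, 0, 0, 0x1F80 }.
// Its first 28 bytes are the x87 environment loaded by FLDENV and the last
// word is the MXCSR value loaded at offset X87StateSize.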
27552 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27553 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27554 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27555 MachinePointerInfo MPI =
27556 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27557 MachineMemOperand *MMO = MF.getMachineMemOperand(
27558 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
27560 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27563 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27565 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
27566 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27567 // split the vector, perform the operation on its Lo and Hi parts and
27568 // concatenate the results.
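// For example, for an i8 element x = 0x10: lzcnt(zext32(x)) == 27 and
// 27 - (32 - 8) == 3, which equals ctlz of the original i8 value.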
27569 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27570 const X86Subtarget &Subtarget) {
27571 assert(Op.getOpcode() == ISD::CTLZ);
27572 SDLoc dl(Op);
27573 MVT VT = Op.getSimpleValueType();
27574 MVT EltVT = VT.getVectorElementType();
27575 unsigned NumElems = VT.getVectorNumElements();
27577 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27578 "Unsupported element type");
27580 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27581 if (NumElems > 16 ||
27582 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27583 return splitVectorIntUnary(Op, DAG);
27585 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27586 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27587 "Unsupported value type for operation");
27589 // Use the natively supported vector instruction vplzcntd.
27590 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27591 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27592 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27593 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27595 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27598 // Lower CTLZ using a PSHUFB lookup table implementation.
27599 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27600 const X86Subtarget &Subtarget,
27601 SelectionDAG &DAG) {
27602 MVT VT = Op.getSimpleValueType();
27603 int NumElts = VT.getVectorNumElements();
27604 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27605 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27607 // Per-nibble leading zero PSHUFB lookup table.
27608 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27609 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27610 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27611 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27613 SmallVector<SDValue, 64> LUTVec;
27614 for (int i = 0; i < NumBytes; ++i)
27615 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27616 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27618 // Begin by bitcasting the input to a byte vector, then split those bytes
27619 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27620 // If the hi input nibble is zero then we add both results together, otherwise
27621 // we just take the hi result (by masking the lo result to zero before the
27622 // add).
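// For example, byte 0x07 has a zero hi nibble, so the result is
// LUT[0] + LUT[7] == 4 + 1 == 5 == ctlz(0x07); byte 0x1a has a non-zero hi
// nibble, so only the hi result LUT[1] == 3 == ctlz(0x1a) is kept.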
27623 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27624 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27626 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27627 SDValue Lo = Op0;
27628 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27629 SDValue HiZ;
27630 if (CurrVT.is512BitVector()) {
27631 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27632 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27633 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27634 } else {
27635 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27638 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27639 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27640 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27641 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27643 // Merge the result back from vXi8 to VT, working on the lo/hi halves
27644 // of the current vector width in the same way we did for the nibbles.
27645 // If the upper half of the input element is zero then add the halves'
27646 // leading zero counts together, otherwise just use the upper half's.
27647 // Double the width of the result until we are at target width.
27648 while (CurrVT != VT) {
27649 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27650 int CurrNumElts = CurrVT.getVectorNumElements();
27651 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27652 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27653 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27655 // Check if the upper half of the input element is zero.
27656 if (CurrVT.is512BitVector()) {
27657 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27658 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27659 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27660 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27661 } else {
27662 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27663 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27665 HiZ = DAG.getBitcast(NextVT, HiZ);
27667 // Move the upper/lower halves to the lower bits as we'll be extending to
27668 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27669 // together.
27670 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27671 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27672 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27673 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27674 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27675 CurrVT = NextVT;
27678 return Res;
27681 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27682 const X86Subtarget &Subtarget,
27683 SelectionDAG &DAG) {
27684 MVT VT = Op.getSimpleValueType();
27686 if (Subtarget.hasCDI() &&
27687 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27688 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27689 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27691 // Decompose 256-bit ops into smaller 128-bit ops.
27692 if (VT.is256BitVector() && !Subtarget.hasInt256())
27693 return splitVectorIntUnary(Op, DAG);
27695 // Decompose 512-bit ops into smaller 256-bit ops.
27696 if (VT.is512BitVector() && !Subtarget.hasBWI())
27697 return splitVectorIntUnary(Op, DAG);
27699 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27700 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27703 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27704 SelectionDAG &DAG) {
27705 MVT VT = Op.getSimpleValueType();
27706 MVT OpVT = VT;
27707 unsigned NumBits = VT.getSizeInBits();
27708 SDLoc dl(Op);
27709 unsigned Opc = Op.getOpcode();
27711 if (VT.isVector())
27712 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27714 Op = Op.getOperand(0);
27715 if (VT == MVT::i8) {
27716 // Zero extend to i32 since there is no i8 bsr instruction.
27717 OpVT = MVT::i32;
27718 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27721 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27722 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27723 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27725 if (Opc == ISD::CTLZ) {
27726 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27727 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27728 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27729 Op.getValue(1)};
27730 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
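// 2 * NumBits - 1 is chosen so that the final XOR with NumBits - 1 below maps
// a zero source to NumBits, e.g. for i32: 63 ^ 31 == 32.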
27733 // Finally xor with NumBits-1.
27734 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27735 DAG.getConstant(NumBits - 1, dl, OpVT));
27737 if (VT == MVT::i8)
27738 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27739 return Op;
27742 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27743 SelectionDAG &DAG) {
27744 MVT VT = Op.getSimpleValueType();
27745 unsigned NumBits = VT.getScalarSizeInBits();
27746 SDValue N0 = Op.getOperand(0);
27747 SDLoc dl(Op);
27749 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27750 "Only scalar CTTZ requires custom lowering");
27752 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27753 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27754 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27756 // If src is known never zero we can skip the CMOV.
27757 if (DAG.isKnownNeverZero(N0))
27758 return Op;
27760 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27761 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27762 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27763 Op.getValue(1)};
27764 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27767 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27768 const X86Subtarget &Subtarget) {
27769 MVT VT = Op.getSimpleValueType();
27770 if (VT == MVT::i16 || VT == MVT::i32)
27771 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27773 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27774 return splitVectorIntBinary(Op, DAG);
27776 assert(Op.getSimpleValueType().is256BitVector() &&
27777 Op.getSimpleValueType().isInteger() &&
27778 "Only handle AVX 256-bit vector integer operation");
27779 return splitVectorIntBinary(Op, DAG);
27782 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27783 const X86Subtarget &Subtarget) {
27784 MVT VT = Op.getSimpleValueType();
27785 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27786 unsigned Opcode = Op.getOpcode();
27787 SDLoc DL(Op);
27789 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27790 (VT.is256BitVector() && !Subtarget.hasInt256())) {
27791 assert(Op.getSimpleValueType().isInteger() &&
27792 "Only handle AVX vector integer operation");
27793 return splitVectorIntBinary(Op, DAG);
27796 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27798 EVT SetCCResultType =
27799 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27801 unsigned BitWidth = VT.getScalarSizeInBits();
27802 if (Opcode == ISD::USUBSAT) {
27803 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
27804 // Handle a special-case with a bit-hack instead of cmp+select:
27805 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
27806 // If the target can use VPTERNLOG, DAGToDAG will match this as
27807 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
27808 // "broadcast" constant load.
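// For example, with i8 elements and Y == 0x80: X = 0x90 gives
// (0x90 ^ 0x80) & 0xff == 0x10, while X = 0x30 gives (0x30 ^ 0x80) & 0x00 == 0,
// matching usubsat(X, 0x80).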
27809 ConstantSDNode *C = isConstOrConstSplat(Y, true);
27810 if (C && C->getAPIntValue().isSignMask()) {
27811 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
27812 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
27813 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
27814 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
27815 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
27818 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
27819 // usubsat X, Y --> (X >u Y) ? X - Y : 0
27820 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27821 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27822 // TODO: Move this to DAGCombiner?
27823 if (SetCCResultType == VT &&
27824 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27825 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27826 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27830 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
27831 (!VT.isVector() || VT == MVT::v2i64)) {
27832 APInt MinVal = APInt::getSignedMinValue(BitWidth);
27833 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
27834 SDValue Zero = DAG.getConstant(0, DL, VT);
27835 SDValue Result =
27836 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
27837 DAG.getVTList(VT, SetCCResultType), X, Y);
27838 SDValue SumDiff = Result.getValue(0);
27839 SDValue Overflow = Result.getValue(1);
27840 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
27841 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
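// On overflow the wrapped SumDiff has the opposite sign of the true result, so
// a negative SumDiff selects SatMax (and vice versa); e.g. i8 saddsat(100, 50)
// wraps to -106 and is replaced by 127.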
27842 SDValue SumNeg =
27843 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
27844 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
27845 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
27848 // Use default expansion.
27849 return SDValue();
27852 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27853 SelectionDAG &DAG) {
27854 MVT VT = Op.getSimpleValueType();
27855 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27856 // Since X86 does not have CMOV for 8-bit integer, we don't convert
27857 // 8-bit integer abs to NEG and CMOV.
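// abs(x) is lowered as Neg = 0 - x plus a CMOV that keeps Neg when the
// subtraction leaves the sign flag clear (x <= 0) and keeps x otherwise.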
27858 SDLoc DL(Op);
27859 SDValue N0 = Op.getOperand(0);
27860 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27861 DAG.getConstant(0, DL, VT), N0);
27862 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
27863 SDValue(Neg.getNode(), 1)};
27864 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27867 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27868 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27869 SDLoc DL(Op);
27870 SDValue Src = Op.getOperand(0);
27871 SDValue Sub =
27872 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27873 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27876 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27877 assert(VT.isInteger() &&
27878 "Only handle AVX 256-bit vector integer operation");
27879 return splitVectorIntUnary(Op, DAG);
27882 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27883 return splitVectorIntUnary(Op, DAG);
27885 // Default to expand.
27886 return SDValue();
27889 static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
27890 SelectionDAG &DAG) {
27891 MVT VT = Op.getSimpleValueType();
27893 // For AVX1 cases, split to use legal ops.
27894 if (VT.is256BitVector() && !Subtarget.hasInt256())
27895 return splitVectorIntBinary(Op, DAG);
27897 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27898 return splitVectorIntBinary(Op, DAG);
27900 // Default to expand.
27901 return SDValue();
27904 static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
27905 SelectionDAG &DAG) {
27906 MVT VT = Op.getSimpleValueType();
27908 // For AVX1 cases, split to use legal ops.
27909 if (VT.is256BitVector() && !Subtarget.hasInt256())
27910 return splitVectorIntBinary(Op, DAG);
27912 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27913 return splitVectorIntBinary(Op, DAG);
27915 // Default to expand.
27916 return SDValue();
27919 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
27920 SelectionDAG &DAG) {
27921 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
27922 "Expected FMAXIMUM or FMINIMUM opcode");
27923 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27924 EVT VT = Op.getValueType();
27925 SDValue X = Op.getOperand(0);
27926 SDValue Y = Op.getOperand(1);
27927 SDLoc DL(Op);
27928 uint64_t SizeInBits = VT.getScalarSizeInBits();
27929 APInt PreferredZero = APInt::getZero(SizeInBits);
27930 APInt OppositeZero = PreferredZero;
27931 EVT IVT = VT.changeTypeToInteger();
27932 X86ISD::NodeType MinMaxOp;
27933 if (Op.getOpcode() == ISD::FMAXIMUM) {
27934 MinMaxOp = X86ISD::FMAX;
27935 OppositeZero.setSignBit();
27936 } else {
27937 PreferredZero.setSignBit();
27938 MinMaxOp = X86ISD::FMIN;
27940 EVT SetCCType =
27941 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27943 // The tables below show the expected result of Max in cases of NaN and
27944 // signed zeros.
27946 //              Y                                Y
27947 //          Num     xNaN                    +0       -0
27948 //         -----------------             -----------------
27949 //    Num  |  Max  |   Y   |        +0   |  +0   |  +0   |
27950 //  X      -----------------     X       -----------------
27951 //   xNaN  |   X   |  X/Y  |        -0   |  +0   |  -0   |
27952 //         -----------------             -----------------
27954 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
27955 // reordering.
27957 // We check if any of the operands is NaN and return NaN. Then we check if any
27958 // of the operands is zero or negative zero (for fmaximum and fminimum
27959 // respectively) to ensure the correct zero is returned.
27960 auto MatchesZero = [](SDValue Op, APInt Zero) {
27961 Op = peekThroughBitcasts(Op);
27962 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
27963 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
27964 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
27965 return CstOp->getAPIntValue() == Zero;
27966 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
27967 Op->getOpcode() == ISD::SPLAT_VECTOR) {
27968 for (const SDValue &OpVal : Op->op_values()) {
27969 if (OpVal.isUndef())
27970 continue;
27971 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
27972 if (!CstOp)
27973 return false;
27974 if (!CstOp->getValueAPF().isZero())
27975 continue;
27976 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
27977 return false;
27979 return true;
27981 return false;
27984 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
27985 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
27986 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
27987 Op->getFlags().hasNoSignedZeros() ||
27988 DAG.isKnownNeverZeroFloat(X) ||
27989 DAG.isKnownNeverZeroFloat(Y);
27990 SDValue NewX, NewY;
27991 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
27992 MatchesZero(X, OppositeZero)) {
27993 // Operands are already in right order or order does not matter.
27994 NewX = X;
27995 NewY = Y;
27996 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
27997 NewX = Y;
27998 NewY = X;
27999 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28000 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28001 if (IsXNeverNaN)
28002 std::swap(X, Y);
28003 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28004 // to an xmm register.
28005 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28006 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28007 // Bits of classes:
28008 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28009 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28010 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28011 DL, MVT::i32);
28012 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28013 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28014 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28015 DAG.getIntPtrConstant(0, DL));
28016 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28017 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28018 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28019 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28020 } else {
28021 SDValue IsXSigned;
28022 if (Subtarget.is64Bit() || VT != MVT::f64) {
28023 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28024 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28025 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28026 } else {
28027 assert(VT == MVT::f64);
28028 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28029 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28030 DAG.getIntPtrConstant(0, DL));
28031 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28032 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28033 DAG.getIntPtrConstant(1, DL));
28034 Hi = DAG.getBitcast(MVT::i32, Hi);
28035 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28036 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28037 *DAG.getContext(), MVT::i32);
28038 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28040 if (MinMaxOp == X86ISD::FMAX) {
28041 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28042 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28043 } else {
28044 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28045 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28049 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28050 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28052 // If we did not reorder the operands for signed-zero handling, but we do need
28053 // to handle NaN and we know that the second operand is not NaN, then put it in
28054 // the first operand so we will not need to post-process NaN after the max/min.
28055 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28056 std::swap(NewX, NewY);
28058 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28060 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28061 return MinMax;
28063 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28064 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28067 static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28068 SelectionDAG &DAG) {
28069 MVT VT = Op.getSimpleValueType();
28071 // For AVX1 cases, split to use legal ops.
28072 if (VT.is256BitVector() && !Subtarget.hasInt256())
28073 return splitVectorIntBinary(Op, DAG);
28075 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28076 return splitVectorIntBinary(Op, DAG);
28078 SDLoc dl(Op);
28079 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28080 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28082 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28083 if (VT.isScalarInteger()) {
28084 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28085 MVT WideVT = MVT::getIntegerVT(WideBits);
28086 if (TLI.isTypeLegal(WideVT)) {
28087 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28088 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28089 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28090 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28091 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28092 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28093 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28094 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28098 // TODO: Move to TargetLowering expandABD().
28099 if (!Subtarget.hasSSE41() &&
28100 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28101 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28102 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28103 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
28104 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28105 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28106 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28107 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28110 // Default to expand.
28111 return SDValue();
28114 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28115 SelectionDAG &DAG) {
28116 SDLoc dl(Op);
28117 MVT VT = Op.getSimpleValueType();
28119 // Decompose 256-bit ops into 128-bit ops.
28120 if (VT.is256BitVector() && !Subtarget.hasInt256())
28121 return splitVectorIntBinary(Op, DAG);
28123 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28124 return splitVectorIntBinary(Op, DAG);
28126 SDValue A = Op.getOperand(0);
28127 SDValue B = Op.getOperand(1);
28129 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28130 // vector pairs, multiply and truncate.
28131 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28132 unsigned NumElts = VT.getVectorNumElements();
28134 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28135 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28136 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28137 return DAG.getNode(
28138 ISD::TRUNCATE, dl, VT,
28139 DAG.getNode(ISD::MUL, dl, ExVT,
28140 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28141 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28144 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28146 // Extract the lo/hi parts to any extend to i16.
28147 // We're only going to keep the low byte of each result element of the
28148 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28149 // input element.
28150 SDValue Undef = DAG.getUNDEF(VT);
28151 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28152 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28154 SDValue BLo, BHi;
28155 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28156 // If the RHS is a constant, manually unpackl/unpackh.
28157 SmallVector<SDValue, 16> LoOps, HiOps;
28158 for (unsigned i = 0; i != NumElts; i += 16) {
28159 for (unsigned j = 0; j != 8; ++j) {
28160 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28161 MVT::i16));
28162 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28163 MVT::i16));
28167 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28168 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28169 } else {
28170 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28171 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28174 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28175 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28176 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28177 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28180 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28181 if (VT == MVT::v4i32) {
28182 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28183 "Should not custom lower when pmulld is available!");
28185 // Extract the odd parts.
28186 static const int UnpackMask[] = { 1, -1, 3, -1 };
28187 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28188 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28190 // Multiply the even parts.
28191 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28192 DAG.getBitcast(MVT::v2i64, A),
28193 DAG.getBitcast(MVT::v2i64, B));
28194 // Now multiply odd parts.
28195 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28196 DAG.getBitcast(MVT::v2i64, Aodds),
28197 DAG.getBitcast(MVT::v2i64, Bodds));
28199 Evens = DAG.getBitcast(VT, Evens);
28200 Odds = DAG.getBitcast(VT, Odds);
28202 // Merge the two vectors back together with a shuffle. This expands into 2
28203 // shuffles.
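// ShufMask {0, 4, 2, 6} picks the low i32 half of each i64 product, giving
// <a*e | b*f | c*g | d*h> (only the low 32 bits of each product are needed).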
28204 static const int ShufMask[] = { 0, 4, 2, 6 };
28205 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28208 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28209 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28210 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28212 // Ahi = psrlqi(a, 32);
28213 // Bhi = psrlqi(b, 32);
28215 // AloBlo = pmuludq(a, b);
28216 // AloBhi = pmuludq(a, Bhi);
28217 // AhiBlo = pmuludq(Ahi, b);
28219 // Hi = psllqi(AloBhi + AhiBlo, 32);
28220 // return AloBlo + Hi;
28221 KnownBits AKnown = DAG.computeKnownBits(A);
28222 KnownBits BKnown = DAG.computeKnownBits(B);
28224 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28225 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28226 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28228 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28229 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28230 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28232 SDValue Zero = DAG.getConstant(0, dl, VT);
28234 // Only multiply lo/hi halves that aren't known to be zero.
28235 SDValue AloBlo = Zero;
28236 if (!ALoIsZero && !BLoIsZero)
28237 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28239 SDValue AloBhi = Zero;
28240 if (!ALoIsZero && !BHiIsZero) {
28241 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28242 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28245 SDValue AhiBlo = Zero;
28246 if (!AHiIsZero && !BLoIsZero) {
28247 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28248 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28251 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28252 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28254 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28257 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28258 MVT VT, bool IsSigned,
28259 const X86Subtarget &Subtarget,
28260 SelectionDAG &DAG,
28261 SDValue *Low = nullptr) {
28262 unsigned NumElts = VT.getVectorNumElements();
28264 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28265 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28266 // lane results back together.
28268 // We'll take different approaches for signed and unsigned.
28269 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28270 // and use pmullw to calculate the full 16-bit product.
28271 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28272 // shift them left into the upper byte of each word. This allows us to use
28273 // pmulhw to calculate the full 16-bit product. This trick means we don't
28274 // need to sign extend the bytes to use pmullw.
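// Concretely, (A << 8) * (B << 8) == (A * B) << 16, so the high 16 bits that
// pmulhw returns are exactly the full signed 16-bit product A * B.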
28276 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28277 SDValue Zero = DAG.getConstant(0, dl, VT);
28279 SDValue ALo, AHi;
28280 if (IsSigned) {
28281 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28282 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28283 } else {
28284 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28285 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28288 SDValue BLo, BHi;
28289 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28290 // If the RHS is a constant, manually unpackl/unpackh and extend.
28291 SmallVector<SDValue, 16> LoOps, HiOps;
28292 for (unsigned i = 0; i != NumElts; i += 16) {
28293 for (unsigned j = 0; j != 8; ++j) {
28294 SDValue LoOp = B.getOperand(i + j);
28295 SDValue HiOp = B.getOperand(i + j + 8);
28297 if (IsSigned) {
28298 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28299 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28300 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28301 DAG.getConstant(8, dl, MVT::i16));
28302 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28303 DAG.getConstant(8, dl, MVT::i16));
28304 } else {
28305 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28306 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28309 LoOps.push_back(LoOp);
28310 HiOps.push_back(HiOp);
28314 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28315 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28316 } else if (IsSigned) {
28317 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28318 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28319 } else {
28320 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28321 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28324 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28325 // pack back to vXi8.
28326 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28327 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28328 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28330 if (Low)
28331 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28333 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28336 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28337 SelectionDAG &DAG) {
28338 SDLoc dl(Op);
28339 MVT VT = Op.getSimpleValueType();
28340 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28341 unsigned NumElts = VT.getVectorNumElements();
28342 SDValue A = Op.getOperand(0);
28343 SDValue B = Op.getOperand(1);
28345 // Decompose 256-bit ops into 128-bit ops.
28346 if (VT.is256BitVector() && !Subtarget.hasInt256())
28347 return splitVectorIntBinary(Op, DAG);
28349 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28350 return splitVectorIntBinary(Op, DAG);
28352 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28353 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28354 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28355 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28357 // PMULxD operations multiply each even value (starting at 0) of LHS with
28358 // the related value of RHS and produce a widened result.
28359 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28360 // => <2 x i64> <ae|cg>
28362 // In other words, to have all the results, we need to perform two PMULxD:
28363 // 1. one with the even values.
28364 // 2. one with the odd values.
28365 // To achieve #2, we need to place the odd values at an even position.
28367 // Place the odd value at an even position (basically, shift all values 1
28368 // step to the left):
28369 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28370 9, -1, 11, -1, 13, -1, 15, -1};
28371 // <a|b|c|d> => <b|undef|d|undef>
28372 SDValue Odd0 =
28373 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28374 // <e|f|g|h> => <f|undef|h|undef>
28375 SDValue Odd1 =
28376 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28378 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28379 // ints.
28380 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28381 unsigned Opcode =
28382 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28383 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28384 // => <2 x i64> <ae|cg>
28385 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28386 DAG.getBitcast(MulVT, A),
28387 DAG.getBitcast(MulVT, B)));
28388 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28389 // => <2 x i64> <bf|dh>
28390 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28391 DAG.getBitcast(MulVT, Odd0),
28392 DAG.getBitcast(MulVT, Odd1)));
28394 // Shuffle it back into the right order.
28395 SmallVector<int, 16> ShufMask(NumElts);
28396 for (int i = 0; i != (int)NumElts; ++i)
28397 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28399 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28401 // If we have a signed multiply but no PMULDQ fix up the result of an
28402 // unsigned multiply.
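// This uses the identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0)
//                                                   - (b < 0 ? a : 0),
// with the compares producing all-ones masks that select B and A respectively.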
28403 if (IsSigned && !Subtarget.hasSSE41()) {
28404 SDValue Zero = DAG.getConstant(0, dl, VT);
28405 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28406 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28407 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28408 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28410 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28411 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28414 return Res;
28417 // Only i8 vectors should need custom lowering after this.
28418 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28419 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28420 "Unsupported vector type");
28422 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28423 // logical shift down the upper half and pack back to i8.
28425 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28426 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28428 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28429 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28430 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28431 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28432 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28433 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28434 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28435 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28436 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28439 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28442 // Custom lowering for SMULO/UMULO.
28443 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28444 SelectionDAG &DAG) {
28445 MVT VT = Op.getSimpleValueType();
28447 // Scalars defer to LowerXALUO.
28448 if (!VT.isVector())
28449 return LowerXALUO(Op, DAG);
28451 SDLoc dl(Op);
28452 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28453 SDValue A = Op.getOperand(0);
28454 SDValue B = Op.getOperand(1);
28455 EVT OvfVT = Op->getValueType(1);
28457 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28458 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28459 // Extract the LHS Lo/Hi vectors
28460 SDValue LHSLo, LHSHi;
28461 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28463 // Extract the RHS Lo/Hi vectors
28464 SDValue RHSLo, RHSHi;
28465 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28467 EVT LoOvfVT, HiOvfVT;
28468 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28469 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28470 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28472 // Issue the split operations.
28473 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28474 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28476 // Join the separate data results and the overflow results.
28477 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28478 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28479 Hi.getValue(1));
28481 return DAG.getMergeValues({Res, Ovf}, dl);
28484 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28485 EVT SetccVT =
28486 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28488 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28489 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28490 unsigned NumElts = VT.getVectorNumElements();
28491 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28492 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28493 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28494 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28495 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28497 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28499 SDValue Ovf;
28500 if (IsSigned) {
28501 SDValue High, LowSign;
28502 if (OvfVT.getVectorElementType() == MVT::i1 &&
28503 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28504 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28505 // Shift the high down filling with sign bits.
28506 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28507 // Fill all 16 bits with the sign bit from the low.
28508 LowSign =
28509 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28510 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28511 15, DAG);
28512 SetccVT = OvfVT;
28513 if (!Subtarget.hasBWI()) {
28514 // We can't do a vXi16 compare so sign extend to v16i32.
28515 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28516 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28518 } else {
28519 // Otherwise do the compare at vXi8.
28520 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28521 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28522 LowSign =
28523 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28526 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28527 } else {
28528 SDValue High =
28529 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28530 if (OvfVT.getVectorElementType() == MVT::i1 &&
28531 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28532 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28533 SetccVT = OvfVT;
28534 if (!Subtarget.hasBWI()) {
28535 // We can't do a vXi16 compare so sign extend to v16i32.
28536 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28538 } else {
28539 // Otherwise do the compare at vXi8.
28540 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28543 Ovf =
28544 DAG.getSetCC(dl, SetccVT, High,
28545 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28548 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28550 return DAG.getMergeValues({Low, Ovf}, dl);
28553 SDValue Low;
28554 SDValue High =
28555 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28557 SDValue Ovf;
28558 if (IsSigned) {
28559 // SMULO overflows if the high bits don't match the sign of the low.
28560 SDValue LowSign =
28561 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28562 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28563 } else {
28564 // UMULO overflows if the high bits are non-zero.
28565 Ovf =
28566 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
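// Worked i8 example of both overflow checks (values chosen for illustration):
// A = 100, B = 2. The full 16-bit product is 200 (0x00C8), so Low = 0xC8 and
// High = 0x00. Signed: Low is negative, so LowSign = 0xFF != High and SMULO
// reports overflow (200 does not fit in a signed i8). Unsigned: High == 0, so
// UMULO reports no overflow (200 fits in an unsigned i8).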
28569 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28571 return DAG.getMergeValues({Low, Ovf}, dl);
28574 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28575 assert(Subtarget.isTargetWin64() && "Unexpected target");
28576 EVT VT = Op.getValueType();
28577 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28578 "Unexpected return type for lowering");
28580 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28581 SmallVector<SDValue> Result;
28582 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28583 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28586 RTLIB::Libcall LC;
28587 bool isSigned;
28588 switch (Op->getOpcode()) {
28589 default: llvm_unreachable("Unexpected request for libcall!");
28590 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28591 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28592 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28593 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28596 SDLoc dl(Op);
28597 SDValue InChain = DAG.getEntryNode();
28599 TargetLowering::ArgListTy Args;
28600 TargetLowering::ArgListEntry Entry;
28601 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28602 EVT ArgVT = Op->getOperand(i).getValueType();
28603 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28604 "Unexpected argument type for lowering");
28605 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28606 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28607 MachinePointerInfo MPI =
28608 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28609 Entry.Node = StackPtr;
28610 InChain =
28611 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28612 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28613 Entry.Ty = PointerType::get(ArgTy, 0);
28614 Entry.IsSExt = false;
28615 Entry.IsZExt = false;
28616 Args.push_back(Entry);
28619 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28620 getPointerTy(DAG.getDataLayout()));
28622 TargetLowering::CallLoweringInfo CLI(DAG);
28623 CLI.setDebugLoc(dl)
28624 .setChain(InChain)
28625 .setLibCallee(
28626 getLibcallCallingConv(LC),
28627 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28628 std::move(Args))
28629 .setInRegister()
28630 .setSExtResult(isSigned)
28631 .setZExtResult(!isSigned);
28633 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28634 return DAG.getBitcast(VT, CallInfo.first);
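// Conceptually (an illustrative sketch, not the exact emitted nodes), an i128
// division such as
//   %q = sdiv i128 %a, %b
// becomes, roughly,
//   store i128 %a to a 16-byte-aligned stack slot %pa
//   store i128 %b to a 16-byte-aligned stack slot %pb
//   %v = call <2 x i64> @__divti3(ptr %pa, ptr %pb)   ; returned in XMM0
//   %q = bitcast <2 x i64> %v to i128
// i.e. both operands are passed indirectly and the 128-bit result comes back
// in a vector register.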
28637 SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28638 SelectionDAG &DAG,
28639 SDValue &Chain) const {
28640 assert(Subtarget.isTargetWin64() && "Unexpected target");
28641 EVT VT = Op.getValueType();
28642 bool IsStrict = Op->isStrictFPOpcode();
28644 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28645 EVT ArgVT = Arg.getValueType();
28647 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28648 "Unexpected return type for lowering");
28650 RTLIB::Libcall LC;
28651 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28652 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28653 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28654 else
28655 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28656 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28658 SDLoc dl(Op);
28659 MakeLibCallOptions CallOptions;
28660 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28662 SDValue Result;
28663 // The i128 result is returned as a v2i64 in xmm0; cast it back to the
28664 // expected VT (i128).
28665 std::tie(Result, Chain) =
28666 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28667 Result = DAG.getBitcast(VT, Result);
28668 return Result;
28671 SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28672 SelectionDAG &DAG) const {
28673 assert(Subtarget.isTargetWin64() && "Unexpected target");
28674 EVT VT = Op.getValueType();
28675 bool IsStrict = Op->isStrictFPOpcode();
28677 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28678 EVT ArgVT = Arg.getValueType();
28680 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28681 "Unexpected argument type for lowering");
28683 RTLIB::Libcall LC;
28684 if (Op->getOpcode() == ISD::SINT_TO_FP ||
28685 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28686 LC = RTLIB::getSINTTOFP(ArgVT, VT);
28687 else
28688 LC = RTLIB::getUINTTOFP(ArgVT, VT);
28689 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28691 SDLoc dl(Op);
28692 MakeLibCallOptions CallOptions;
28693 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28695 // Pass the i128 argument as an indirect argument on the stack.
28696 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28697 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28698 MachinePointerInfo MPI =
28699 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28700 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
28702 SDValue Result;
28703 std::tie(Result, Chain) =
28704 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
28705 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
28708 // Return true if the required (according to Opcode) shift-imm form is natively
28709 // supported by the Subtarget
28710 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
28711 unsigned Opcode) {
28712 if (!VT.isSimple())
28713 return false;
28715 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
28716 return false;
28718 if (VT.getScalarSizeInBits() < 16)
28719 return false;
28721 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
28722 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28723 return true;
28725 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28726 (VT.is256BitVector() && Subtarget.hasInt256());
28728 bool AShift = LShift && (Subtarget.hasAVX512() ||
28729 (VT != MVT::v2i64 && VT != MVT::v4i64));
28730 return (Opcode == ISD::SRA) ? AShift : LShift;
28733 // The shift amount is a variable, but it is the same for all vector lanes.
28734 // These instructions are defined together with shift-immediate.
28735 static
28736 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
28737 unsigned Opcode) {
28738 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
28741 // Return true if the required (according to Opcode) variable-shift form is
28742 // natively supported by the Subtarget
28743 static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
28744 unsigned Opcode) {
28745 if (!VT.isSimple())
28746 return false;
28748 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
28749 return false;
28751 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28752 return false;
28754 // vXi16 supported only on AVX-512, BWI
28755 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28756 return false;
28758 if (Subtarget.hasAVX512() &&
28759 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
28760 return true;
28762 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28763 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28764 return (Opcode == ISD::SRA) ? AShift : LShift;
28767 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
28768 const X86Subtarget &Subtarget) {
28769 MVT VT = Op.getSimpleValueType();
28770 SDLoc dl(Op);
28771 SDValue R = Op.getOperand(0);
28772 SDValue Amt = Op.getOperand(1);
28773 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28775 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28776 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28777 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28778 SDValue Ex = DAG.getBitcast(ExVT, R);
28780 // ashr(R, 63) === cmp_slt(R, 0)
28781 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28782 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28783 "Unsupported PCMPGT op");
28784 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28787 if (ShiftAmt >= 32) {
28788 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28789 SDValue Upper =
28790 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28791 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28792 ShiftAmt - 32, DAG);
28793 if (VT == MVT::v2i64)
28794 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28795 if (VT == MVT::v4i64)
28796 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28797 {9, 1, 11, 3, 13, 5, 15, 7});
28798 } else {
28799 // SRA upper i32, SRL whole i64 and select lower i32.
28800 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28801 ShiftAmt, DAG);
28802 SDValue Lower =
28803 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28804 Lower = DAG.getBitcast(ExVT, Lower);
28805 if (VT == MVT::v2i64)
28806 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28807 if (VT == MVT::v4i64)
28808 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28809 {8, 1, 10, 3, 12, 5, 14, 7});
28811 return DAG.getBitcast(VT, Ex);
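// Worked example for the >= 32 case (values chosen for illustration): an i64
// arithmetic shift right by 40, with the element viewed as 32-bit halves
// (hi, lo). The result's low 32 bits are sra(hi, 40 - 32) = sra(hi, 8) and its
// high 32 bits are sra(hi, 31), a splat of the sign bit - exactly the Upper
// and Lower nodes that the shuffles above interleave back together.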
28814 // Optimize shl/srl/sra with constant shift amount.
28815 APInt APIntShiftAmt;
28816 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28817 return SDValue();
28819 // If the shift amount is out of range, return undef.
28820 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28821 return DAG.getUNDEF(VT);
28823 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28825 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
28826 // Hardware support for vector shifts is sparse, which makes us scalarize the
28827 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
28828 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
28829 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28830 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28831 // must be 0). (add undef, undef) however can be any value. To make this
28832 // safe, we must freeze R to ensure that register allocation uses the same
28833 // register for an undefined value. This ensures that the result will
28834 // still be even and preserves the original semantics.
28835 R = DAG.getFreeze(R);
28836 return DAG.getNode(ISD::ADD, dl, VT, R, R);
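// Illustration: without the freeze, each use of an undef R could be
// materialized independently, e.g. as 3 and 4, so the add could produce 7,
// which is odd and therefore not a valid (shl R, 1). Freezing R pins both
// operands to a single value v, so the sum 2 * v is always even.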
28839 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28842 // i64 SRA needs to be performed as partial shifts.
28843 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28844 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28845 Op.getOpcode() == ISD::SRA)
28846 return ArithmeticShiftRight64(ShiftAmt);
28848 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28849 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28850 unsigned NumElts = VT.getVectorNumElements();
28851 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28853 // Simple i8 add case
28854 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28855 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28856 // must be 0). (add undef, undef) however can be any value. To make this
28857 // safe, we must freeze R to ensure that register allocation uses the same
28858 // register for an undefined value. This ensures that the result will
28859 // still be even and preserves the original semantics.
28860 R = DAG.getFreeze(R);
28861 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28864 // ashr(R, 7) === cmp_slt(R, 0)
28865 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28866 SDValue Zeros = DAG.getConstant(0, dl, VT);
28867 if (VT.is512BitVector()) {
28868 assert(VT == MVT::v64i8 && "Unexpected element type!");
28869 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28870 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28872 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28875 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28876 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28877 return SDValue();
28879 if (Op.getOpcode() == ISD::SHL) {
28880 // Make a large shift.
28881 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28882 ShiftAmt, DAG);
28883 SHL = DAG.getBitcast(VT, SHL);
28884 // Zero out the rightmost bits.
28885 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28886 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28888 if (Op.getOpcode() == ISD::SRL) {
28889 // Make a large shift.
28890 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28891 ShiftAmt, DAG);
28892 SRL = DAG.getBitcast(VT, SRL);
28893 // Zero out the leftmost bits.
28894 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28895 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28897 if (Op.getOpcode() == ISD::SRA) {
28898 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
28899 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28901 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28902 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28903 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28904 return Res;
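// Worked example (values chosen for illustration): ShiftAmt = 3 and an i8
// lane R = 0xF0 (-16). The logical shift gives 0x1E, Mask = 128 >> 3 = 0x10,
// xor(0x1E, 0x10) = 0x0E, and 0x0E - 0x10 = 0xFE (-2), matching the
// arithmetic shift -16 >> 3 = -2.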
28906 llvm_unreachable("Unknown shift opcode.");
28909 return SDValue();
28912 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
28913 const X86Subtarget &Subtarget) {
28914 MVT VT = Op.getSimpleValueType();
28915 SDLoc dl(Op);
28916 SDValue R = Op.getOperand(0);
28917 SDValue Amt = Op.getOperand(1);
28918 unsigned Opcode = Op.getOpcode();
28919 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28921 int BaseShAmtIdx = -1;
28922 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
28923 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
28924 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
28925 Subtarget, DAG);
28927 // vXi8 shifts - shift as v8i16 + mask result.
28928 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28929 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28930 VT == MVT::v64i8) &&
28931 !Subtarget.hasXOP()) {
28932 unsigned NumElts = VT.getVectorNumElements();
28933 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28934 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28935 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28936 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28938 // Create the mask using vXi16 shifts. For shift-rights we need to move
28939 // the upper byte down before splatting the vXi8 mask.
28940 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28941 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28942 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
28943 if (Opcode != ISD::SHL)
28944 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28945 8, DAG);
28946 BitMask = DAG.getBitcast(VT, BitMask);
28947 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28948 SmallVector<int, 64>(NumElts, 0));
28950 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28951 DAG.getBitcast(ExtVT, R), BaseShAmt,
28952 BaseShAmtIdx, Subtarget, DAG);
28953 Res = DAG.getBitcast(VT, Res);
28954 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28956 if (Opcode == ISD::SRA) {
28957 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28958 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28959 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28960 SignMask =
28961 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
28962 BaseShAmtIdx, Subtarget, DAG);
28963 SignMask = DAG.getBitcast(VT, SignMask);
28964 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28965 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28967 return Res;
28972 return SDValue();
28975 // Convert a shift/rotate left amount to a multiplication scale factor.
28976 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28977 const X86Subtarget &Subtarget,
28978 SelectionDAG &DAG) {
28979 MVT VT = Amt.getSimpleValueType();
28980 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28981 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28982 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
28983 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
28984 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28985 (Subtarget.hasBWI() && VT == MVT::v64i8)))
28986 return SDValue();
28988 MVT SVT = VT.getVectorElementType();
28989 unsigned SVTBits = SVT.getSizeInBits();
28990 unsigned NumElems = VT.getVectorNumElements();
28992 APInt UndefElts;
28993 SmallVector<APInt> EltBits;
28994 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
28995 APInt One(SVTBits, 1);
28996 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
28997 for (unsigned I = 0; I != NumElems; ++I) {
28998 if (UndefElts[I] || EltBits[I].uge(SVTBits))
28999 continue;
29000 uint64_t ShAmt = EltBits[I].getZExtValue();
29001 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29003 return DAG.getBuildVector(VT, dl, Elts);
29006 // If the target doesn't support variable shifts, use either FP conversion
29007 // or integer multiplication to avoid shifting each element individually.
29008 if (VT == MVT::v4i32) {
29009 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29010 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29011 DAG.getConstant(0x3f800000U, dl, VT));
29012 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29013 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
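// Worked example (values chosen for illustration): a lane with Amt = 5
// becomes (5 << 23) + 0x3f800000 = 0x42000000, the IEEE-754 single-precision
// encoding of 32.0, so the FP_TO_SINT produces 2^5 - the desired scale
// factor.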
29016 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29017 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29018 SDValue Z = DAG.getConstant(0, dl, VT);
29019 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29020 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29021 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29022 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29023 if (Subtarget.hasSSE41())
29024 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29025 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29028 return SDValue();
29031 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29032 SelectionDAG &DAG) {
29033 MVT VT = Op.getSimpleValueType();
29034 SDLoc dl(Op);
29035 SDValue R = Op.getOperand(0);
29036 SDValue Amt = Op.getOperand(1);
29037 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29038 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29040 unsigned Opc = Op.getOpcode();
29041 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29042 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29044 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29045 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29047 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29048 return V;
29050 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29051 return V;
29053 if (supportedVectorVarShift(VT, Subtarget, Opc))
29054 return Op;
29056 // i64 vector arithmetic shift can be emulated with the transform:
29057 // M = lshr(SIGN_MASK, Amt)
29058 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29059 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29060 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29061 Opc == ISD::SRA) {
29062 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29063 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29064 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29065 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29066 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29067 return R;
29070 // XOP has 128-bit variable logical/arithmetic shifts.
29071 // +ve/-ve Amt = shift left/right.
29072 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29073 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29074 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29075 SDValue Zero = DAG.getConstant(0, dl, VT);
29076 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29078 if (Opc == ISD::SHL || Opc == ISD::SRL)
29079 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29080 if (Opc == ISD::SRA)
29081 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29084 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29085 // shifts per-lane and then shuffle the partial results back together.
29086 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29087 // Splat the shift amounts so the scalar shifts above will catch it.
29088 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29089 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29090 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29091 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29092 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29095 // If possible, lower this shift as a sequence of two shifts by
29096 // constant plus a BLENDing shuffle instead of scalarizing it.
29097 // Example:
29098 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29100 // Could be rewritten as:
29101 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29103 // The advantage is that the two shifts from the example would be
29104 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29105 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29106 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29107 SDValue Amt1, Amt2;
29108 unsigned NumElts = VT.getVectorNumElements();
29109 SmallVector<int, 8> ShuffleMask;
29110 for (unsigned i = 0; i != NumElts; ++i) {
29111 SDValue A = Amt->getOperand(i);
29112 if (A.isUndef()) {
29113 ShuffleMask.push_back(SM_SentinelUndef);
29114 continue;
29116 if (!Amt1 || Amt1 == A) {
29117 ShuffleMask.push_back(i);
29118 Amt1 = A;
29119 continue;
29121 if (!Amt2 || Amt2 == A) {
29122 ShuffleMask.push_back(i + NumElts);
29123 Amt2 = A;
29124 continue;
29126 break;
29129 // Only perform this blend if we can perform it without loading a mask.
29130 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29131 (VT != MVT::v16i16 ||
29132 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29133 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29134 canWidenShuffleElements(ShuffleMask))) {
29135 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29136 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29137 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29138 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29139 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29140 Cst1->getZExtValue(), DAG);
29141 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29142 Cst2->getZExtValue(), DAG);
29143 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29148 // If possible, lower this packed shift into a vector multiply instead of
29149 // expanding it into a sequence of scalar shifts.
29150 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29151 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29152 Subtarget.canExtendTo512BW())))
29153 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29154 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29156 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29157 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29158 if (Opc == ISD::SRL && ConstantAmt &&
29159 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29160 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29161 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29162 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29163 SDValue Zero = DAG.getConstant(0, dl, VT);
29164 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29165 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29166 return DAG.getSelect(dl, VT, ZAmt, R, Res);
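// Worked example (values chosen for illustration): for an i16 lane shifted
// right by 3, the scale is 2^(16 - 3) = 8192 and MULHU computes
// (x * 8192) >> 16 = x >> 3, e.g. x = 100 gives 819200 >> 16 = 12 = 100 >> 3.
// The select covers Amt == 0 lanes, whose scale 2^16 is not representable in
// 16 bits.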
29170 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29171 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29172 // TODO: Special case handling for shift by 0/1, really we can afford either
29173 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29174 if (Opc == ISD::SRA && ConstantAmt &&
29175 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29176 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29177 !Subtarget.hasAVX512()) ||
29178 DAG.isKnownNeverZero(Amt))) {
29179 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29180 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29181 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29182 SDValue Amt0 =
29183 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29184 SDValue Amt1 =
29185 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29186 SDValue Sra1 =
29187 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29188 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29189 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29190 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
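// The two selects cover the lanes the MULHS trick cannot: for Amt == 0 the
// scale 2^16 does not fit in 16 bits, and for Amt == 1 the scale 2^15 is
// 0x8000, which MULHS treats as -32768 and so flips the sign of the result.
// Those lanes fall back to R and to an explicit arithmetic shift by 1.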
29194 // v4i32 Non Uniform Shifts.
29195 // If the shift amount is constant we can shift each lane using the SSE2
29196 // immediate shifts, else we need to zero-extend each lane to the lower i64
29197 // and shift using the SSE2 variable shifts.
29198 // The separate results can then be blended together.
29199 if (VT == MVT::v4i32) {
29200 SDValue Amt0, Amt1, Amt2, Amt3;
29201 if (ConstantAmt) {
29202 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29203 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29204 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29205 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29206 } else {
29207 // The SSE2 shifts use the lower i64 as the same shift amount for
29208 // all lanes and the upper i64 is ignored. On AVX we're better off
29209 // just zero-extending, but for SSE just duplicating the top 16-bits is
29210 // cheaper and has the same effect for out of range values.
29211 if (Subtarget.hasAVX()) {
29212 SDValue Z = DAG.getConstant(0, dl, VT);
29213 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29214 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29215 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29216 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29217 } else {
29218 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29219 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29220 {4, 5, 6, 7, -1, -1, -1, -1});
29221 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29222 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29223 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29224 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29225 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29226 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29230 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29231 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29232 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29233 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29234 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29236 // Merge the shifted lane results optimally with/without PBLENDW.
29237 // TODO - ideally shuffle combining would handle this.
29238 if (Subtarget.hasSSE41()) {
29239 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29240 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29241 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29243 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29244 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29245 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29248 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29249 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29250 // make the existing SSE solution better.
29251 // NOTE: We honor the preferred vector width before promoting to 512 bits.
29252 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29253 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29254 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29255 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29256 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29257 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29258 "Unexpected vector type");
29259 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29260 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29261 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29262 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29263 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29264 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29265 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29268 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29269 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29270 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29271 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29272 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29273 !Subtarget.hasXOP()) {
29274 int NumElts = VT.getVectorNumElements();
29275 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29277 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29278 // isn't legal).
29279 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29280 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29281 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29282 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29283 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29284 "Constant build vector expected");
29286 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29287 bool IsSigned = Opc == ISD::SRA;
29288 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29289 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29290 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29291 return DAG.getZExtOrTrunc(R, dl, VT);
29294 SmallVector<SDValue, 16> LoAmt, HiAmt;
29295 for (int i = 0; i != NumElts; i += 16) {
29296 for (int j = 0; j != 8; ++j) {
29297 LoAmt.push_back(Amt.getOperand(i + j));
29298 HiAmt.push_back(Amt.getOperand(i + j + 8));
29302 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29303 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29304 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29306 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29307 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29308 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29309 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29310 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29311 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29312 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29313 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29314 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
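// Worked example (values chosen for illustration): an unsigned i8 lane
// x = 200 shifted right by 3 uses the per-lane factor 2^(8 - 3) = 32; the
// widened multiply gives 200 * 32 = 6400 and the final >> 8 yields 25, which
// is 200 >> 3. The SRA variant works the same way on sign-extended lanes.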
29317 if (VT == MVT::v16i8 ||
29318 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29319 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29320 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29322 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29323 if (VT.is512BitVector()) {
29324 // On AVX512BW targets we make use of the fact that VSELECT lowers
29325 // to a masked blend which selects bytes based just on the sign bit
29326 // extracted to a mask.
29327 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29328 V0 = DAG.getBitcast(VT, V0);
29329 V1 = DAG.getBitcast(VT, V1);
29330 Sel = DAG.getBitcast(VT, Sel);
29331 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29332 ISD::SETGT);
29333 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29334 } else if (Subtarget.hasSSE41()) {
29335 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29336 // on the sign bit.
29337 V0 = DAG.getBitcast(VT, V0);
29338 V1 = DAG.getBitcast(VT, V1);
29339 Sel = DAG.getBitcast(VT, Sel);
29340 return DAG.getBitcast(SelVT,
29341 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29343 // On pre-SSE41 targets we test for the sign bit by comparing to
29344 // zero - a negative value will set all bits of the lanes to true
29345 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29346 SDValue Z = DAG.getConstant(0, dl, SelVT);
29347 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29348 return DAG.getSelect(dl, SelVT, C, V0, V1);
29351 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29352 // We can safely do this using i16 shifts as we're only interested in
29353 // the 3 lower bits of each byte.
29354 Amt = DAG.getBitcast(ExtVT, Amt);
29355 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29356 Amt = DAG.getBitcast(VT, Amt);
29358 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29359 // r = VSELECT(r, shift(r, 4), a);
29360 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29361 R = SignBitSelect(VT, Amt, M, R);
29363 // a += a
29364 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29366 // r = VSELECT(r, shift(r, 2), a);
29367 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29368 R = SignBitSelect(VT, Amt, M, R);
29370 // a += a
29371 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29373 // return VSELECT(r, shift(r, 1), a);
29374 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29375 R = SignBitSelect(VT, Amt, M, R);
29376 return R;
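// Worked example (values chosen for illustration): a lane with shift amount
// 5 (binary 101). After "a << 5" the amount's bit 2 sits in the byte's sign
// bit, so the first blend applies the shift by 4; "a += a" then exposes bit 1
// (clear, so the shift by 2 is skipped) and finally bit 0 (set, so the shift
// by 1 is applied), for a total shift of 4 + 1 = 5.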
29379 if (Opc == ISD::SRA) {
29380 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29381 // so we can correctly sign extend. We don't care what happens to the
29382 // lower byte.
29383 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29384 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29385 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29386 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29387 ALo = DAG.getBitcast(ExtVT, ALo);
29388 AHi = DAG.getBitcast(ExtVT, AHi);
29389 RLo = DAG.getBitcast(ExtVT, RLo);
29390 RHi = DAG.getBitcast(ExtVT, RHi);
29392 // r = VSELECT(r, shift(r, 4), a);
29393 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29394 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29395 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29396 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29398 // a += a
29399 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29400 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29402 // r = VSELECT(r, shift(r, 2), a);
29403 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29404 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29405 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29406 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29408 // a += a
29409 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29410 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29412 // r = VSELECT(r, shift(r, 1), a);
29413 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29414 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29415 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29416 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29418 // Logical shift the result back to the lower byte, leaving a zero upper
29419 // byte meaning that we can safely pack with PACKUSWB.
29420 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29421 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29422 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29426 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29427 MVT ExtVT = MVT::v8i32;
29428 SDValue Z = DAG.getConstant(0, dl, VT);
29429 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29430 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29431 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29432 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29433 ALo = DAG.getBitcast(ExtVT, ALo);
29434 AHi = DAG.getBitcast(ExtVT, AHi);
29435 RLo = DAG.getBitcast(ExtVT, RLo);
29436 RHi = DAG.getBitcast(ExtVT, RHi);
29437 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29438 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29439 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29440 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29441 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29444 if (VT == MVT::v8i16) {
29445 // If we have a constant shift amount, the non-SSE41 path is best as
29446 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29447 bool UseSSE41 = Subtarget.hasSSE41() &&
29448 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29450 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29451 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29452 // the sign bit.
29453 if (UseSSE41) {
29454 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29455 V0 = DAG.getBitcast(ExtVT, V0);
29456 V1 = DAG.getBitcast(ExtVT, V1);
29457 Sel = DAG.getBitcast(ExtVT, Sel);
29458 return DAG.getBitcast(
29459 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29461 // On pre-SSE41 targets we splat the sign bit - a negative value will
29462 // set all bits of the lanes to true and VSELECT uses that in
29463 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29464 SDValue C =
29465 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29466 return DAG.getSelect(dl, VT, C, V0, V1);
29469 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29470 if (UseSSE41) {
29471 // On SSE41 targets we need to replicate the shift mask in both
29472 // bytes for PBLENDVB.
29473 Amt = DAG.getNode(
29474 ISD::OR, dl, VT,
29475 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29476 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29477 } else {
29478 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29481 // r = VSELECT(r, shift(r, 8), a);
29482 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29483 R = SignBitSelect(Amt, M, R);
29485 // a += a
29486 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29488 // r = VSELECT(r, shift(r, 4), a);
29489 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29490 R = SignBitSelect(Amt, M, R);
29492 // a += a
29493 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29495 // r = VSELECT(r, shift(r, 2), a);
29496 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29497 R = SignBitSelect(Amt, M, R);
29499 // a += a
29500 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29502 // return VSELECT(r, shift(r, 1), a);
29503 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29504 R = SignBitSelect(Amt, M, R);
29505 return R;
29508 // Decompose 256-bit shifts into 128-bit shifts.
29509 if (VT.is256BitVector())
29510 return splitVectorIntBinary(Op, DAG);
29512 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29513 return splitVectorIntBinary(Op, DAG);
29515 return SDValue();
29518 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29519 SelectionDAG &DAG) {
29520 MVT VT = Op.getSimpleValueType();
29521 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29522 "Unexpected funnel shift opcode!");
29524 SDLoc DL(Op);
29525 SDValue Op0 = Op.getOperand(0);
29526 SDValue Op1 = Op.getOperand(1);
29527 SDValue Amt = Op.getOperand(2);
29528 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29529 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29531 if (VT.isVector()) {
29532 APInt APIntShiftAmt;
29533 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29535 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29536 if (IsFSHR)
29537 std::swap(Op0, Op1);
29539 if (IsCstSplat) {
29540 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29541 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29542 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29543 {Op0, Op1, Imm}, DAG, Subtarget);
29545 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29546 {Op0, Op1, Amt}, DAG, Subtarget);
29548 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29549 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29550 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29551 "Unexpected funnel shift type!");
29553 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29554 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29555 if (IsCstSplat) {
29556 // TODO: Can't use generic expansion as UNDEF amt elements can be
29557 // converted to other values when folded to shift amounts, losing the
29558 // splat.
29559 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29560 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29561 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29562 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29563 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29564 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29565 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29566 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
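// Worked example (values chosen for illustration): a vXi8 fshl by a splat of
// 3 becomes (Op0 << 3) | (Op1 >> 5): each result byte takes its upper 5 bits
// from the low 5 bits of Op0 and its lower 3 bits from the high 3 bits of
// Op1. An fshr by 3 swaps the roles, computing (Op0 << 5) | (Op1 >> 3).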
29569 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29570 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29571 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29573 // Constant vXi16 funnel shifts can be efficiently handled by default.
29574 if (IsCst && EltSizeInBits == 16)
29575 return SDValue();
29577 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29578 unsigned NumElts = VT.getVectorNumElements();
29579 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29580 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29582 // Split 256-bit integers on XOP/pre-AVX2 targets.
29583 // Split 512-bit integers on non 512-bit BWI targets.
29584 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29585 !Subtarget.hasAVX2())) ||
29586 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29587 EltSizeInBits < 32)) {
29588 // Pre-mask the amount modulo using the wider vector.
29589 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29590 return splitVectorOp(Op, DAG);
29593 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29594 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29595 int ScalarAmtIdx = -1;
29596 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29597 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29598 if (EltSizeInBits == 16)
29599 return SDValue();
29601 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29602 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29603 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29604 ScalarAmtIdx, Subtarget, DAG);
29605 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29606 ScalarAmtIdx, Subtarget, DAG);
29607 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29611 MVT WideSVT = MVT::getIntegerVT(
29612 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29613 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29615 // If per-element shifts are legal, fallback to generic expansion.
29616 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29617 return SDValue();
29619 // Attempt to fold as:
29620 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29621 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29622 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29623 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29624 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29625 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29626 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29627 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29628 EltSizeInBits, DAG);
29629 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29630 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29631 if (!IsFSHR)
29632 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29633 EltSizeInBits, DAG);
29634 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
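// Worked example (values chosen for illustration): i8 lanes with Op0 = 0xAB,
// Op1 = 0xCD and amount 3. The widened lane is 0xABCD; fshl shifts it left by
// 3 to 0x5E68 and the final >> 8 leaves 0x5E = (0xAB << 3) | (0xCD >> 5),
// while fshr shifts 0xABCD right by 3 and truncates to 0x79 =
// (0xCD >> 3) | (0xAB << 5).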
29637 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29638 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29639 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29640 SDValue Z = DAG.getConstant(0, DL, VT);
29641 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29642 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29643 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29644 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29645 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29646 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29647 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29650 // Fallback to generic expansion.
29651 return SDValue();
29653 assert(
29654 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29655 "Unexpected funnel shift type!");
29657 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
29658 bool OptForSize = DAG.shouldOptForSize();
29659 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
29661 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29662 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29663 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
29664 !isa<ConstantSDNode>(Amt)) {
29665 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
29666 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
29667 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
29668 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
29669 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
29670 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
29671 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
29672 if (IsFSHR) {
29673 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
29674 } else {
29675 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
29676 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
29678 return DAG.getZExtOrTrunc(Res, DL, VT);
29681 if (VT == MVT::i8 || ExpandFunnel)
29682 return SDValue();
29684 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
29685 if (VT == MVT::i16) {
29686 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
29687 DAG.getConstant(15, DL, Amt.getValueType()));
29688 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
29689 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
29692 return Op;
29695 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
29696 SelectionDAG &DAG) {
29697 MVT VT = Op.getSimpleValueType();
29698 assert(VT.isVector() && "Custom lowering only for vector rotates!");
29700 SDLoc DL(Op);
29701 SDValue R = Op.getOperand(0);
29702 SDValue Amt = Op.getOperand(1);
29703 unsigned Opcode = Op.getOpcode();
29704 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29705 int NumElts = VT.getVectorNumElements();
29706 bool IsROTL = Opcode == ISD::ROTL;
29708 // Check for constant splat rotation amount.
29709 APInt CstSplatValue;
29710 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
29712 // Check for splat rotate by zero.
29713 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
29714 return R;
29716 // AVX512 implicitly uses modulo rotation amounts.
29717 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
29718 // Attempt to rotate by immediate.
29719 if (IsCstSplat) {
29720 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
29721 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29722 return DAG.getNode(RotOpc, DL, VT, R,
29723 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29726 // Else, fall-back on VPROLV/VPRORV.
29727 return Op;
29730 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
29731 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
29732 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
29733 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29736 SDValue Z = DAG.getConstant(0, DL, VT);
29738 if (!IsROTL) {
29739 // If the ISD::ROTR amount is constant, we're always better converting to
29740 // ISD::ROTL.
29741 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
29742 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
29744 // XOP targets always prefer ISD::ROTL.
29745 if (Subtarget.hasXOP())
29746 return DAG.getNode(ISD::ROTL, DL, VT, R,
29747 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
29750 // Split 256-bit integers on XOP/pre-AVX2 targets.
29751 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
29752 return splitVectorIntBinary(Op, DAG);
29754 // XOP has 128-bit vector variable + immediate rotates.
29755 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
29756 // XOP implicitly uses modulo rotation amounts.
29757 if (Subtarget.hasXOP()) {
29758 assert(IsROTL && "Only ROTL expected");
29759 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
29761 // Attempt to rotate by immediate.
29762 if (IsCstSplat) {
29763 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29764 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
29765 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29768 // Use general rotate by variable (per-element).
29769 return Op;
29772 // Rotate by a uniform constant - expand back to shifts.
29773 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
29774 // to other values when folded to shift amounts, losing the splat.
29775 if (IsCstSplat) {
29776 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29777 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
29778 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
29779 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
29780 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
29781 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
29782 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
29783 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
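// Worked example (values chosen for illustration): an i8 lane 0xB1 rotated
// left by 2 expands to (0xB1 << 2) | (0xB1 >> 6) = 0xC4 | 0x02 = 0xC6; a
// rotate right by 2 simply swaps the two shift amounts.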
29786 // Split 512-bit integers on non 512-bit BWI targets.
29787 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
29788 return splitVectorIntBinary(Op, DAG);
29790 assert(
29791 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
29792 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
29793 Subtarget.hasAVX2()) ||
29794 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
29795 "Only vXi32/vXi16/vXi8 vector rotates supported");
29797 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29798 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29800 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29801 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29803 // Attempt to fold as unpack(x,x) << zext(splat(y)):
29804 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
29805 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
29806 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
29807 int BaseRotAmtIdx = -1;
29808 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
29809 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
29810 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
29811 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29813 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
29814 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
29815 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
29816 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
29817 BaseRotAmtIdx, Subtarget, DAG);
29818 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
29819 BaseRotAmtIdx, Subtarget, DAG);
29820 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
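// Worked example (values chosen for illustration): an i8 lane 0xAB rotated
// left by 3. Unpacking the lane with itself forms the 16-bit value 0xABAB;
// shifting that left by 3 gives 0x5D58, and its upper byte 0x5D is exactly
// rotl(0xAB, 3), which is what the pack step extracts.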
29824 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29825 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
29827 // Attempt to fold as unpack(x,x) << zext(y):
29828 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
29829 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
29830 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
29831 if (!(ConstantAmt && EltSizeInBits != 8) &&
29832 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
29833 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
29834 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
29835 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
29836 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29837 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29838 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29839 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29840 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
29843 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
29844 // the amount bit.
29845 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
29846 if (EltSizeInBits == 8) {
29847 MVT WideVT =
29848 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
29850 // Attempt to fold as:
29851 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
29852 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
29853 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29854 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29855 // If we're rotating by constant, just use default promotion.
29856 if (ConstantAmt)
29857 return SDValue();
29858 // See if we can perform this by widening to vXi16 or vXi32.
29859 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
29860 R = DAG.getNode(
29861 ISD::OR, DL, WideVT, R,
29862 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
29863 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29864 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
29865 if (IsROTL)
29866 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
29867 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
29870 // We don't need ModuloAmt here as we just peek at individual bits.
29871 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29872 if (Subtarget.hasSSE41()) {
29873 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29874 // on the sign bit.
29875 V0 = DAG.getBitcast(VT, V0);
29876 V1 = DAG.getBitcast(VT, V1);
29877 Sel = DAG.getBitcast(VT, Sel);
29878 return DAG.getBitcast(SelVT,
29879 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29881 // On pre-SSE41 targets we test for the sign bit by comparing to
29882 // zero - a negative value will set all bits of the lanes to true
29883 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29884 SDValue Z = DAG.getConstant(0, DL, SelVT);
29885 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29886 return DAG.getSelect(DL, SelVT, C, V0, V1);
29889 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
29890 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
29891 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
29892 IsROTL = true;
29895 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
29896 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
29898 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29899 // We can safely do this using i16 shifts as we're only interested in
29900 // the 3 lower bits of each byte.
29901 Amt = DAG.getBitcast(ExtVT, Amt);
29902 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29903 Amt = DAG.getBitcast(VT, Amt);
29905 // r = VSELECT(r, rot(r, 4), a);
29906 SDValue M;
29907 M = DAG.getNode(
29908 ISD::OR, DL, VT,
29909 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
29910 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
29911 R = SignBitSelect(VT, Amt, M, R);
29913 // a += a
29914 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29916 // r = VSELECT(r, rot(r, 2), a);
29917 M = DAG.getNode(
29918 ISD::OR, DL, VT,
29919 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
29920 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
29921 R = SignBitSelect(VT, Amt, M, R);
29923 // a += a
29924 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29926 // return VSELECT(r, rot(r, 1), a);
29927 M = DAG.getNode(
29928 ISD::OR, DL, VT,
29929 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
29930 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
29931 return SignBitSelect(VT, Amt, M, R);
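// Worked example (illustrative): for a byte rotate amount of 5 (0b101), the
// first select sees amount bit 2 in the sign position (after the << 5 above)
// and applies the rotate-by-4; doubling the mask exposes bit 1, which is clear,
// so the rotate-by-2 is skipped; doubling again exposes bit 0, which is set, so
// the rotate-by-1 is applied, for a total rotate of 4 + 1 = 5.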
29934 bool IsSplatAmt = DAG.isSplatValue(Amt);
29935 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29936 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
29938 // Fallback for splats + all supported variable shifts.
29939 // Fallback for non-constant AVX2 vXi16 as well.
29940 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29941 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29942 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29943 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29944 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
29945 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
29946 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
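// Worked example (illustrative): for a v4i32 rotl with a masked splat amount
// of 7 this emits (R << 7) | (R >> 25), the usual rotate-as-two-shifts
// expansion; for rotr the two shift directions are swapped.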
29949 // Everything below assumes ISD::ROTL.
29950 if (!IsROTL) {
29951 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
29952 IsROTL = true;
29955 // ISD::ROT* uses modulo rotate amounts.
29956 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29958 assert(IsROTL && "Only ROTL supported");
29960 // As with shifts, attempt to convert the rotation amount to a multiplication
29961 // factor; otherwise fall back to general expansion.
29962 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29963 if (!Scale)
29964 return SDValue();
29966 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
29967 if (EltSizeInBits == 16) {
29968 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29969 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29970 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
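// Worked example (illustrative): rotating a v8i16 lane holding 0x8001 left by
// 1 uses Scale = 2: MUL yields the low half 0x0002, MULHU yields the wrapped
// high bit 0x0001, and the OR gives 0x0003 == rotl16(0x8001, 1).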
29973 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29974 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29975 // that can then be OR'd with the lower 32-bits.
29976 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29977 static const int OddMask[] = {1, -1, 3, -1};
29978 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29979 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29981 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29982 DAG.getBitcast(MVT::v2i64, R),
29983 DAG.getBitcast(MVT::v2i64, Scale));
29984 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29985 DAG.getBitcast(MVT::v2i64, R13),
29986 DAG.getBitcast(MVT::v2i64, Scale13));
29987 Res02 = DAG.getBitcast(VT, Res02);
29988 Res13 = DAG.getBitcast(VT, Res13);
29990 return DAG.getNode(ISD::OR, DL, VT,
29991 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29992 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
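// Worked example for the v4i32 lowering above (illustrative): rotating a lane
// holding 0x80000001 left by 1 (Scale = 2) makes PMULUDQ produce the 64-bit
// product 0x0000000100000002; the {0,4,2,6} shuffle gathers the low halves
// (x << 1), the {1,5,3,7} shuffle gathers the high halves (x >> 31), and the
// OR gives 0x00000003 == rotl32(0x80000001, 1).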
29995 /// Returns true if the operand type is exactly twice the native width, and
29996 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29997 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29998 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29999 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30000 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30002 if (OpWidth == 64)
30003 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30004 if (OpWidth == 128)
30005 return Subtarget.canUseCMPXCHG16B();
30007 return false;
30010 TargetLoweringBase::AtomicExpansionKind
30011 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30012 Type *MemType = SI->getValueOperand()->getType();
30014 bool NoImplicitFloatOps =
30015 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30016 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30017 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30018 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30019 return AtomicExpansionKind::None;
30021 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30022 : AtomicExpansionKind::None;
30025 // Note: this turns large loads into lock cmpxchg8b/16b.
30026 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30027 TargetLowering::AtomicExpansionKind
30028 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30029 Type *MemType = LI->getType();
30031 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30032 // can use movq to do the load. If we have X87 we can load into an 80-bit
30033 // X87 register and store it to a stack temporary.
30034 bool NoImplicitFloatOps =
30035 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30036 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30037 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30038 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30039 return AtomicExpansionKind::None;
30041 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30042 : AtomicExpansionKind::None;
30045 enum BitTestKind : unsigned {
30046 UndefBit,
30047 ConstantBit,
30048 NotConstantBit,
30049 ShiftBit,
30050 NotShiftBit
30053 static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30054 using namespace llvm::PatternMatch;
30055 BitTestKind BTK = UndefBit;
30056 auto *C = dyn_cast<ConstantInt>(V);
30057 if (C) {
30058 // Check if V is a power of 2 or the NOT of a power of 2.
30059 if (isPowerOf2_64(C->getZExtValue()))
30060 BTK = ConstantBit;
30061 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30062 BTK = NotConstantBit;
30063 return {V, BTK};
30066 // Check if V is some power of 2 pattern known to be non-zero
30067 auto *I = dyn_cast<Instruction>(V);
30068 if (I) {
30069 bool Not = false;
30070 // Check if we have a NOT
30071 Value *PeekI;
30072 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30073 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30074 Not = true;
30075 I = dyn_cast<Instruction>(PeekI);
30077 // If I is constant, it will fold and we can evaluate later. If it's an
30078 // argument or something of that nature, we can't analyze.
30079 if (I == nullptr)
30080 return {nullptr, UndefBit};
30082 // We can only use 1 << X without more sophisticated analysis. C << X where
30083 // C is a power of 2 but not 1 can result in zero which cannot be translated
30084 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30085 if (I->getOpcode() == Instruction::Shl) {
30086 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30087 // -X` and some other provable power of 2 patterns that we can use CTZ on
30088 // may be profitable.
30089 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30090 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30091 // be provably a non-zero power of 2.
30092 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30093 // transformable to bittest.
30094 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30095 if (!ShiftVal)
30096 return {nullptr, UndefBit};
30097 if (ShiftVal->equalsInt(1))
30098 BTK = Not ? NotShiftBit : ShiftBit;
30100 if (BTK == UndefBit)
30101 return {nullptr, UndefBit};
30103 Value *BitV = I->getOperand(1);
30105 Value *AndOp;
30106 const APInt *AndC;
30107 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30108 // Read past a shift-mask instruction to find the count.
30109 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30110 BitV = AndOp;
30112 return {BitV, BTK};
30115 return {nullptr, UndefBit};
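// Illustrative classifications returned by FindSingleBitChange above
// (hypothetical values, for exposition only):
//   16                  -> {16, ConstantBit}
//   ~16                 -> {~16, NotConstantBit}
//   shl 1, %n           -> {%n, ShiftBit}
//   xor (shl 1, %n), -1 -> {%n, NotShiftBit}
// A shift amount of the form "and %m, bitwidth-1" is looked through to %m.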
30118 TargetLowering::AtomicExpansionKind
30119 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30120 using namespace llvm::PatternMatch;
30121 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30122 // prefix to a normal instruction for these operations.
30123 if (AI->use_empty())
30124 return AtomicExpansionKind::None;
30126 if (AI->getOperation() == AtomicRMWInst::Xor) {
30127 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30128 // preferable to both `cmpxchg` and `btc`.
30129 if (match(AI->getOperand(1), m_SignMask()))
30130 return AtomicExpansionKind::None;
30133 // If the atomicrmw's result is used by a single bit AND, we may use
30134 // bts/btr/btc instruction for these operations.
30135 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30136 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30137 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30138 // detect it.
30139 Instruction *I = AI->user_back();
30140 auto BitChange = FindSingleBitChange(AI->getValOperand());
30141 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30142 I->getOpcode() != Instruction::And ||
30143 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30144 AI->getParent() != I->getParent())
30145 return AtomicExpansionKind::CmpXChg;
30147 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30149 // This is a redundant AND; it should get cleaned up elsewhere.
30150 if (AI == I->getOperand(OtherIdx))
30151 return AtomicExpansionKind::CmpXChg;
30153 // The following instruction must be an AND with a single bit.
30154 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30155 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30156 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30157 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30158 return AtomicExpansionKind::CmpXChg;
30160 if (AI->getOperation() == AtomicRMWInst::And) {
30161 return ~C1->getValue() == C2->getValue()
30162 ? AtomicExpansionKind::BitTestIntrinsic
30163 : AtomicExpansionKind::CmpXChg;
30165 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30166 : AtomicExpansionKind::CmpXChg;
30169 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30171 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30172 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30173 return AtomicExpansionKind::CmpXChg;
30175 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30177 // If shift amounts are not the same we can't use BitTestIntrinsic.
30178 if (BitChange.first != BitTested.first)
30179 return AtomicExpansionKind::CmpXChg;
30181 // For atomic AND, the RMW mask must clear exactly one bit and the AND must
30182 // test the bit that is unset in that mask.
30183 if (AI->getOperation() == AtomicRMWInst::And)
30184 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30185 ? AtomicExpansionKind::BitTestIntrinsic
30186 : AtomicExpansionKind::CmpXChg;
30188 // For atomic XOR/OR, the RMW and the AND must set and test the same bit.
30189 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30190 ? AtomicExpansionKind::BitTestIntrinsic
30191 : AtomicExpansionKind::CmpXChg;
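// Illustrative IR shape (hypothetical, for exposition only) that reaches
// BitTestIntrinsic in the function above:
//   %mask = shl i32 1, %n
//   %old  = atomicrmw or ptr %p, i32 %mask seq_cst
//   %bit  = and i32 %old, %mask
// The changed and tested bits come from the same shift amount %n, so this can
// be emitted as a single locked bit-test-and-set below.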
30194 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30195 IRBuilder<> Builder(AI);
30196 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30197 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
30198 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
30199 switch (AI->getOperation()) {
30200 default:
30201 llvm_unreachable("Unknown atomic operation");
30202 case AtomicRMWInst::Or:
30203 IID_C = Intrinsic::x86_atomic_bts;
30204 IID_I = Intrinsic::x86_atomic_bts_rm;
30205 break;
30206 case AtomicRMWInst::Xor:
30207 IID_C = Intrinsic::x86_atomic_btc;
30208 IID_I = Intrinsic::x86_atomic_btc_rm;
30209 break;
30210 case AtomicRMWInst::And:
30211 IID_C = Intrinsic::x86_atomic_btr;
30212 IID_I = Intrinsic::x86_atomic_btr_rm;
30213 break;
30215 Instruction *I = AI->user_back();
30216 LLVMContext &Ctx = AI->getContext();
30217 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30218 PointerType::getUnqual(Ctx));
30219 Function *BitTest = nullptr;
30220 Value *Result = nullptr;
30221 auto BitTested = FindSingleBitChange(AI->getValOperand());
30222 assert(BitTested.first != nullptr);
30224 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30225 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30227 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30229 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30230 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30231 } else {
30232 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30234 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30236 Value *SI = BitTested.first;
30237 assert(SI != nullptr);
30239 // BT{S|R|C} on a memory operand doesn't take the bit position modulo the
30240 // operand width, so we need to mask it.
30241 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30242 Value *BitPos =
30243 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30244 // Todo(1): In many cases it may be provable that SI is less than
30245 // ShiftBits, in which case this mask is unnecessary.
30246 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30247 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30248 // favor of just a raw BT{S|R|C}.
30250 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30251 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30253 // If the result is only used for zero/non-zero status then we don't need to
30254 // shift the value back. Otherwise do so.
30255 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30256 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30257 if (ICmp->isEquality()) {
30258 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30259 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30260 if (C0 || C1) {
30261 assert(C0 == nullptr || C1 == nullptr);
30262 if ((C0 ? C0 : C1)->isZero())
30263 continue;
30267 Result = Builder.CreateShl(Result, BitPos);
30268 break;
30272 I->replaceAllUsesWith(Result);
30273 I->eraseFromParent();
30274 AI->eraseFromParent();
30277 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30278 using namespace llvm::PatternMatch;
30279 if (!AI->hasOneUse())
30280 return false;
30282 Value *Op = AI->getOperand(1);
30283 ICmpInst::Predicate Pred;
30284 Instruction *I = AI->user_back();
30285 AtomicRMWInst::BinOp Opc = AI->getOperation();
30286 if (Opc == AtomicRMWInst::Add) {
30287 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30288 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30289 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30290 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30291 return Pred == CmpInst::ICMP_SLT;
30292 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30293 return Pred == CmpInst::ICMP_SGT;
30295 return false;
30297 if (Opc == AtomicRMWInst::Sub) {
30298 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30299 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30300 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30301 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30302 return Pred == CmpInst::ICMP_SLT;
30303 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30304 return Pred == CmpInst::ICMP_SGT;
30306 return false;
30308 if ((Opc == AtomicRMWInst::Or &&
30309 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30310 (Opc == AtomicRMWInst::And &&
30311 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30312 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30313 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30314 Pred == CmpInst::ICMP_SLT;
30315 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30316 return Pred == CmpInst::ICMP_SGT;
30317 return false;
30319 if (Opc == AtomicRMWInst::Xor) {
30320 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30321 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30322 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30323 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30324 return Pred == CmpInst::ICMP_SLT;
30325 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30326 return Pred == CmpInst::ICMP_SGT;
30328 return false;
30331 return false;
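// Illustrative pattern (hypothetical, for exposition only) accepted above:
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %cmp = icmp eq i32 %old, %v        ; i.e. the new value is zero
// The comparison only needs the flags of the locked sub, so it is lowered via
// the cmp-arith intrinsic path below rather than a cmpxchg loop.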
30334 void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30335 AtomicRMWInst *AI) const {
30336 IRBuilder<> Builder(AI);
30337 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30338 Instruction *TempI = nullptr;
30339 LLVMContext &Ctx = AI->getContext();
30340 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30341 if (!ICI) {
30342 TempI = AI->user_back();
30343 assert(TempI->hasOneUse() && "Must have one use");
30344 ICI = cast<ICmpInst>(TempI->user_back());
30346 X86::CondCode CC = X86::COND_INVALID;
30347 ICmpInst::Predicate Pred = ICI->getPredicate();
30348 switch (Pred) {
30349 default:
30350 llvm_unreachable("Not supported Pred");
30351 case CmpInst::ICMP_EQ:
30352 CC = X86::COND_E;
30353 break;
30354 case CmpInst::ICMP_NE:
30355 CC = X86::COND_NE;
30356 break;
30357 case CmpInst::ICMP_SLT:
30358 CC = X86::COND_S;
30359 break;
30360 case CmpInst::ICMP_SGT:
30361 CC = X86::COND_NS;
30362 break;
30364 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30365 switch (AI->getOperation()) {
30366 default:
30367 llvm_unreachable("Unknown atomic operation");
30368 case AtomicRMWInst::Add:
30369 IID = Intrinsic::x86_atomic_add_cc;
30370 break;
30371 case AtomicRMWInst::Sub:
30372 IID = Intrinsic::x86_atomic_sub_cc;
30373 break;
30374 case AtomicRMWInst::Or:
30375 IID = Intrinsic::x86_atomic_or_cc;
30376 break;
30377 case AtomicRMWInst::And:
30378 IID = Intrinsic::x86_atomic_and_cc;
30379 break;
30380 case AtomicRMWInst::Xor:
30381 IID = Intrinsic::x86_atomic_xor_cc;
30382 break;
30384 Function *CmpArith =
30385 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30386 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30387 PointerType::getUnqual(Ctx));
30388 Value *Call = Builder.CreateCall(
30389 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30390 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30391 ICI->replaceAllUsesWith(Result);
30392 ICI->eraseFromParent();
30393 if (TempI)
30394 TempI->eraseFromParent();
30395 AI->eraseFromParent();
30398 TargetLowering::AtomicExpansionKind
30399 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30400 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30401 Type *MemType = AI->getType();
30403 // If the operand is too big, we must see if cmpxchg8/16b is available
30404 // and default to library calls otherwise.
30405 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30406 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30407 : AtomicExpansionKind::None;
30410 AtomicRMWInst::BinOp Op = AI->getOperation();
30411 switch (Op) {
30412 case AtomicRMWInst::Xchg:
30413 return AtomicExpansionKind::None;
30414 case AtomicRMWInst::Add:
30415 case AtomicRMWInst::Sub:
30416 if (shouldExpandCmpArithRMWInIR(AI))
30417 return AtomicExpansionKind::CmpArithIntrinsic;
30418 // It's better to use xadd, xsub or xchg for these in other cases.
30419 return AtomicExpansionKind::None;
30420 case AtomicRMWInst::Or:
30421 case AtomicRMWInst::And:
30422 case AtomicRMWInst::Xor:
30423 if (shouldExpandCmpArithRMWInIR(AI))
30424 return AtomicExpansionKind::CmpArithIntrinsic;
30425 return shouldExpandLogicAtomicRMWInIR(AI);
30426 case AtomicRMWInst::Nand:
30427 case AtomicRMWInst::Max:
30428 case AtomicRMWInst::Min:
30429 case AtomicRMWInst::UMax:
30430 case AtomicRMWInst::UMin:
30431 case AtomicRMWInst::FAdd:
30432 case AtomicRMWInst::FSub:
30433 case AtomicRMWInst::FMax:
30434 case AtomicRMWInst::FMin:
30435 case AtomicRMWInst::UIncWrap:
30436 case AtomicRMWInst::UDecWrap:
30437 default:
30438 // These always require a non-trivial set of data operations on x86. We must
30439 // use a cmpxchg loop.
30440 return AtomicExpansionKind::CmpXChg;
30444 LoadInst *
30445 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30446 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30447 Type *MemType = AI->getType();
30448 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30449 // there is no benefit in turning such RMWs into loads, and it is actually
30450 // harmful as it introduces an mfence.
30451 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30452 return nullptr;
30454 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30455 // lowering available in lowerAtomicArith.
30456 // TODO: push more cases through this path.
30457 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30458 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30459 AI->use_empty())
30460 return nullptr;
30462 IRBuilder<> Builder(AI);
30463 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30464 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30465 auto SSID = AI->getSyncScopeID();
30466 // We must restrict the ordering to avoid generating loads with Release or
30467 // ReleaseAcquire orderings.
30468 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30470 // Before the load we need a fence. Here is an example lifted from
30471 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30472 // is required:
30473 // Thread 0:
30474 // x.store(1, relaxed);
30475 // r1 = y.fetch_add(0, release);
30476 // Thread 1:
30477 // y.fetch_add(42, acquire);
30478 // r2 = x.load(relaxed);
30479 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30480 // lowered to just a load without a fence. A mfence flushes the store buffer,
30481 // making the optimization clearly correct.
30482 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
30483 // clear whether it is needed otherwise; we might be able to be more
30484 // aggressive on relaxed idempotent rmw. In practice, such cases do not look
30485 // useful, so we don't try to be especially clever.
30486 if (SSID == SyncScope::SingleThread)
30487 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30488 // the IR level, so we must wrap it in an intrinsic.
30489 return nullptr;
30491 if (!Subtarget.hasMFence())
30492 // FIXME: it might make sense to use a locked operation here but on a
30493 // different cache-line to prevent cache-line bouncing. In practice it
30494 // is probably a small win, and x86 processors without mfence are rare
30495 // enough that we do not bother.
30496 return nullptr;
30498 Function *MFence =
30499 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30500 Builder.CreateCall(MFence, {});
30502 // Finally we can emit the atomic load.
30503 LoadInst *Loaded = Builder.CreateAlignedLoad(
30504 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30505 Loaded->setAtomic(Order, SSID);
30506 AI->replaceAllUsesWith(Loaded);
30507 AI->eraseFromParent();
30508 return Loaded;
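// Illustrative result of the lowering above (hypothetical, for exposition
// only): an idempotent RMW whose value is used, e.g.
//   %old = atomicrmw add ptr %p, i32 0 acquire
// becomes an mfence followed by a plain atomic load of %p, avoiding a locked
// read-modify-write while keeping the required ordering.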
30511 bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
30512 if (!SI.isUnordered())
30513 return false;
30514 return ExperimentalUnorderedISEL;
30516 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30517 if (!LI.isUnordered())
30518 return false;
30519 return ExperimentalUnorderedISEL;
30523 /// Emit a locked operation on a stack location which does not change any
30524 /// memory location, but does involve a lock prefix. Location is chosen to be
30525 /// a) very likely accessed only by a single thread to minimize cache traffic,
30526 /// and b) definitely dereferenceable. Returns the new Chain result.
30527 static SDValue emitLockedStackOp(SelectionDAG &DAG,
30528 const X86Subtarget &Subtarget, SDValue Chain,
30529 const SDLoc &DL) {
30530 // Implementation notes:
30531 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30532 // operations issued by the current processor. As such, the location
30533 // referenced is not relevant for the ordering properties of the instruction.
30534 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30535 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30536 // 2) Using an immediate operand appears to be the best encoding choice
30537 // here since it doesn't require an extra register.
30538 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30539 // is small enough it might just be measurement noise.)
30540 // 4) When choosing offsets, there are several contributing factors:
30541 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30542 // line aligned stack object to improve this case.)
30543 // b) To minimize our chances of introducing a false dependence, we prefer
30544 // to offset the stack usage from TOS slightly.
30545 // c) To minimize concerns about cross thread stack usage - in particular,
30546 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30547 // captures state in the TOS frame and accesses it from many threads -
30548 // we want to use an offset such that the offset is in a distinct cache
30549 // line from the TOS frame.
30551 // For a general discussion of the tradeoffs and benchmark results, see:
30552 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
30554 auto &MF = DAG.getMachineFunction();
30555 auto &TFL = *Subtarget.getFrameLowering();
30556 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30558 if (Subtarget.is64Bit()) {
30559 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30560 SDValue Ops[] = {
30561 DAG.getRegister(X86::RSP, MVT::i64), // Base
30562 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30563 DAG.getRegister(0, MVT::i64), // Index
30564 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30565 DAG.getRegister(0, MVT::i16), // Segment.
30566 Zero,
30567 Chain};
30568 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30569 MVT::Other, Ops);
30570 return SDValue(Res, 1);
30573 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30574 SDValue Ops[] = {
30575 DAG.getRegister(X86::ESP, MVT::i32), // Base
30576 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30577 DAG.getRegister(0, MVT::i32), // Index
30578 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30579 DAG.getRegister(0, MVT::i16), // Segment.
30580 Zero,
30581 Chain
30583 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30584 MVT::Other, Ops);
30585 return SDValue(Res, 1);
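// Illustratively (exact encoding depends on the target), the node built above
// corresponds to something like
//   lock orl $0, -64(%rsp)   ; 64-bit with a 128-byte red zone
//   lock orl $0, (%esp)      ; 32-bit, no red zone
// i.e. an idempotent locked OR used purely as a full memory fence.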
30588 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30589 SelectionDAG &DAG) {
30590 SDLoc dl(Op);
30591 AtomicOrdering FenceOrdering =
30592 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30593 SyncScope::ID FenceSSID =
30594 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30596 // The only fence that needs an instruction is a sequentially-consistent
30597 // cross-thread fence.
30598 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30599 FenceSSID == SyncScope::System) {
30600 if (Subtarget.hasMFence())
30601 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30603 SDValue Chain = Op.getOperand(0);
30604 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30607 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30608 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30611 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30612 SelectionDAG &DAG) {
30613 MVT T = Op.getSimpleValueType();
30614 SDLoc DL(Op);
30615 unsigned Reg = 0;
30616 unsigned size = 0;
30617 switch(T.SimpleTy) {
30618 default: llvm_unreachable("Invalid value type!");
30619 case MVT::i8: Reg = X86::AL; size = 1; break;
30620 case MVT::i16: Reg = X86::AX; size = 2; break;
30621 case MVT::i32: Reg = X86::EAX; size = 4; break;
30622 case MVT::i64:
30623 assert(Subtarget.is64Bit() && "Node not type legal!");
30624 Reg = X86::RAX; size = 8;
30625 break;
30627 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30628 Op.getOperand(2), SDValue());
30629 SDValue Ops[] = { cpIn.getValue(0),
30630 Op.getOperand(1),
30631 Op.getOperand(3),
30632 DAG.getTargetConstant(size, DL, MVT::i8),
30633 cpIn.getValue(1) };
30634 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30635 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30636 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30637 Ops, T, MMO);
30639 SDValue cpOut =
30640 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30641 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30642 MVT::i32, cpOut.getValue(2));
30643 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30645 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30646 cpOut, Success, EFLAGS.getValue(1));
30649 // Create MOVMSKB, taking into account whether we need to split for AVX1.
30650 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30651 const X86Subtarget &Subtarget) {
30652 MVT InVT = V.getSimpleValueType();
30654 if (InVT == MVT::v64i8) {
30655 SDValue Lo, Hi;
30656 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30657 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30658 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30659 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30660 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30661 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30662 DAG.getConstant(32, DL, MVT::i8));
30663 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30665 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30666 SDValue Lo, Hi;
30667 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30668 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30669 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30670 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30671 DAG.getConstant(16, DL, MVT::i8));
30672 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30675 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30678 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30679 SelectionDAG &DAG) {
30680 SDValue Src = Op.getOperand(0);
30681 MVT SrcVT = Src.getSimpleValueType();
30682 MVT DstVT = Op.getSimpleValueType();
30684 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30685 // half to v32i1 and concatenating the result.
30686 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
30687 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
30688 assert(Subtarget.hasBWI() && "Expected BWI target");
30689 SDLoc dl(Op);
30690 SDValue Lo, Hi;
30691 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
30692 Lo = DAG.getBitcast(MVT::v32i1, Lo);
30693 Hi = DAG.getBitcast(MVT::v32i1, Hi);
30694 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30697 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30698 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
30699 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30700 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30701 SDLoc DL(Op);
30702 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30703 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30704 return DAG.getZExtOrTrunc(V, DL, DstVT);
30707 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
30708 SrcVT == MVT::i64) && "Unexpected VT!");
30710 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30711 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30712 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30713 // This conversion needs to be expanded.
30714 return SDValue();
30716 SDLoc dl(Op);
30717 if (SrcVT.isVector()) {
30718 // Widen the input vector in the case of MVT::v2i32.
30719 // Example: from MVT::v2i32 to MVT::v4i32.
30720 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30721 SrcVT.getVectorNumElements() * 2);
30722 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30723 DAG.getUNDEF(SrcVT));
30724 } else {
30725 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30726 "Unexpected source type in LowerBITCAST");
30727 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30730 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30731 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30733 if (DstVT == MVT::x86mmx)
30734 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30736 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30737 DAG.getIntPtrConstant(0, dl));
30740 /// Compute the horizontal sum of bytes in V for the elements of VT.
30742 /// Requires V to be a byte vector and VT to be an integer vector type with
30743 /// wider elements than V's type. The width of the elements of VT determines
30744 /// how many bytes of V are summed horizontally to produce each element of the
30745 /// result.
30746 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30747 const X86Subtarget &Subtarget,
30748 SelectionDAG &DAG) {
30749 SDLoc DL(V);
30750 MVT ByteVecVT = V.getSimpleValueType();
30751 MVT EltVT = VT.getVectorElementType();
30752 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30753 "Expected value to have byte element type.");
30754 assert(EltVT != MVT::i8 &&
30755 "Horizontal byte sum only makes sense for wider elements!");
30756 unsigned VecSize = VT.getSizeInBits();
30757 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30759 // The PSADBW instruction horizontally adds all bytes and leaves the result in
30760 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30761 if (EltVT == MVT::i64) {
30762 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30763 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30764 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30765 return DAG.getBitcast(VT, V);
30768 if (EltVT == MVT::i32) {
30769 // We unpack the low half and high half into i32s interleaved with zeros so
30770 // that we can use PSADBW to horizontally sum them. The most useful part of
30771 // this is that it lines up the results of two PSADBW instructions to be
30772 // two v2i64 vectors which concatenated are the 4 population counts. We can
30773 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30774 SDValue Zeros = DAG.getConstant(0, DL, VT);
30775 SDValue V32 = DAG.getBitcast(VT, V);
30776 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30777 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30779 // Do the horizontal sums into two v2i64s.
30780 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30781 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30782 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30783 DAG.getBitcast(ByteVecVT, Low), Zeros);
30784 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30785 DAG.getBitcast(ByteVecVT, High), Zeros);
30787 // Merge them together.
30788 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30789 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30790 DAG.getBitcast(ShortVecVT, Low),
30791 DAG.getBitcast(ShortVecVT, High));
30793 return DAG.getBitcast(VT, V);
30796 // The only element type left is i16.
30797 assert(EltVT == MVT::i16 && "Unknown how to handle type");
30799 // To obtain pop count for each i16 element starting from the pop count for
30800 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30801 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30802 // directly supported.
30803 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30804 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30805 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30806 DAG.getBitcast(ByteVecVT, V));
30807 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
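// Worked example for the i16 path above (illustrative): if a 16-bit lane holds
// the per-byte counts 0x0302 (3 bits set in the high byte, 2 in the low byte),
// the SHL by 8 gives 0x0200, the byte-wise ADD gives 0x0502, and the final SRL
// by 8 leaves 0x0005, the 5-bit total for that lane.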
30810 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30811 const X86Subtarget &Subtarget,
30812 SelectionDAG &DAG) {
30813 MVT VT = Op.getSimpleValueType();
30814 MVT EltVT = VT.getVectorElementType();
30815 int NumElts = VT.getVectorNumElements();
30816 (void)EltVT;
30817 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30819 // Implement a lookup table in register by using an algorithm based on:
30820 // http://wm.ite.pl/articles/sse-popcount.html
30822 // The general idea is that each nibble of every byte in the input vector is an
30823 // index into an in-register pre-computed pop count table. We then split up the
30824 // input vector into two new ones: (1) a vector with only the shifted-right
30825 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
30826 // masked out higher ones) for each byte. PSHUFB is used separately with both
30827 // to index the in-register table. Next, both are added and the result is an
30828 // i8 vector where each element contains the pop count for its input byte.
30829 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30830 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30831 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30832 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30834 SmallVector<SDValue, 64> LUTVec;
30835 for (int i = 0; i < NumElts; ++i)
30836 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30837 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30838 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30840 // High nibbles
30841 SDValue FourV = DAG.getConstant(4, DL, VT);
30842 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30844 // Low nibbles
30845 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30847 // The input vector is used as the shuffle mask that index elements into the
30848 // LUT. After counting low and high nibbles, add the vector to obtain the
30849 // final pop count per i8 element.
30850 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30851 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30852 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
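// Worked example (illustrative): for the input byte 0xB4, the high nibble 0xB
// indexes LUT[11] = 3 and the low nibble 0x4 indexes LUT[4] = 1 via the two
// PSHUFBs, and the ADD yields 4 == popcount(0xB4).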
30855 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30856 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30857 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30858 SelectionDAG &DAG) {
30859 MVT VT = Op.getSimpleValueType();
30860 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30861 "Unknown CTPOP type to handle");
30862 SDLoc DL(Op.getNode());
30863 SDValue Op0 = Op.getOperand(0);
30865 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
30866 if (Subtarget.hasVPOPCNTDQ()) {
30867 unsigned NumElems = VT.getVectorNumElements();
30868 assert((VT.getVectorElementType() == MVT::i8 ||
30869 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
30870 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
30871 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30872 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
30873 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
30874 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
30878 // Decompose 256-bit ops into smaller 128-bit ops.
30879 if (VT.is256BitVector() && !Subtarget.hasInt256())
30880 return splitVectorIntUnary(Op, DAG);
30882 // Decompose 512-bit ops into smaller 256-bit ops.
30883 if (VT.is512BitVector() && !Subtarget.hasBWI())
30884 return splitVectorIntUnary(Op, DAG);
30886 // For element types greater than i8, do vXi8 pop counts and a bytesum.
30887 if (VT.getScalarType() != MVT::i8) {
30888 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
30889 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
30890 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
30891 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
30894 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
30895 if (!Subtarget.hasSSSE3())
30896 return SDValue();
30898 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
30901 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30902 SelectionDAG &DAG) {
30903 assert(Op.getSimpleValueType().isVector() &&
30904 "We only do custom lowering for vector population count.");
30905 return LowerVectorCTPOP(Op, Subtarget, DAG);
30908 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
30909 MVT VT = Op.getSimpleValueType();
30910 SDValue In = Op.getOperand(0);
30911 SDLoc DL(Op);
30913 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
30914 // perform the BITREVERSE.
30915 if (!VT.isVector()) {
30916 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
30917 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
30918 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
30919 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
30920 DAG.getIntPtrConstant(0, DL));
30923 int NumElts = VT.getVectorNumElements();
30924 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
30926 // Decompose 256-bit ops into smaller 128-bit ops.
30927 if (VT.is256BitVector())
30928 return splitVectorIntUnary(Op, DAG);
30930 assert(VT.is128BitVector() &&
30931 "Only 128-bit vector bitreverse lowering supported.");
30933 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
30934 // perform the BSWAP in the shuffle.
30935 // It's best to shuffle using the second operand as this will implicitly allow
30936 // memory folding for multiple vectors.
30937 SmallVector<SDValue, 16> MaskElts;
30938 for (int i = 0; i != NumElts; ++i) {
30939 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
30940 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
30941 int PermuteByte = SourceByte | (2 << 5);
30942 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
30946 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
30947 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
30948 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
30949 Res, Mask);
30950 return DAG.getBitcast(VT, Res);
30953 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
30954 SelectionDAG &DAG) {
30955 MVT VT = Op.getSimpleValueType();
30957 if (Subtarget.hasXOP() && !VT.is512BitVector())
30958 return LowerBITREVERSE_XOP(Op, DAG);
30960 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
30962 SDValue In = Op.getOperand(0);
30963 SDLoc DL(Op);
30965 assert(VT.getScalarType() == MVT::i8 &&
30966 "Only byte vector BITREVERSE supported");
30968 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
30969 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
30970 return splitVectorIntUnary(Op, DAG);
30972 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
30973 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
30974 return splitVectorIntUnary(Op, DAG);
30976 unsigned NumElts = VT.getVectorNumElements();
30978 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
30979 if (Subtarget.hasGFNI()) {
30980 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
30981 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
30982 Matrix = DAG.getBitcast(VT, Matrix);
30983 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
30984 DAG.getTargetConstant(0, DL, MVT::i8));
30987 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
30988 // two nibbles and a PSHUFB lookup to find the bitreverse of each
30989 // 0-15 value (moved to the other nibble).
30990 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
30991 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
30992 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
30994 const int LoLUT[16] = {
30995 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
30996 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
30997 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
30998 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
30999 const int HiLUT[16] = {
31000 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31001 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31002 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31003 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31005 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31006 for (unsigned i = 0; i < NumElts; ++i) {
31007 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31008 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31011 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31012 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31013 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31014 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31015 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
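// Worked example (illustrative): for the input byte 0x1E, Lo = 0x0E and
// Hi = 0x01; the lookups give LoLUT[0xE] = 0x70 (low nibble reversed into the
// high nibble) and HiLUT[0x1] = 0x08 (high nibble reversed into the low
// nibble), and the OR yields 0x78 == bitreverse(0x1E).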
31018 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31019 SelectionDAG &DAG) {
31020 SDLoc DL(Op);
31021 SDValue X = Op.getOperand(0);
31022 MVT VT = Op.getSimpleValueType();
31024 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31025 if (VT == MVT::i8 ||
31026 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31027 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31028 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31029 DAG.getConstant(0, DL, MVT::i8));
31030 // Copy the inverse of the parity flag into a register with setcc.
31031 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31032 // Extend to the original type.
31033 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31036 // If we have POPCNT, use the default expansion.
31037 if (Subtarget.hasPOPCNT())
31038 return SDValue();
31040 if (VT == MVT::i64) {
31041 // Xor the high and low 32-bits together using a 32-bit operation.
31042 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31043 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31044 DAG.getConstant(32, DL, MVT::i8)));
31045 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31046 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31049 if (VT != MVT::i16) {
31050 // Xor the high and low 16-bits together using a 32-bit operation.
31051 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31052 DAG.getConstant(16, DL, MVT::i8));
31053 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31054 } else {
31055 // If the input is 16-bits, we need to extend to use an i32 shift below.
31056 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31059 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31060 // This should allow an h-reg to be used to save a shift.
31061 SDValue Hi = DAG.getNode(
31062 ISD::TRUNCATE, DL, MVT::i8,
31063 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31064 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31065 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31066 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31068 // Copy the inverse of the parity flag into a register with setcc.
31069 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31070 // Extend to the original type.
31071 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
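// Worked example for the non-POPCNT path above (illustrative): an i16 input
// 0x0F01 is any-extended, the 8-bit flag-setting XOR computes
// 0x0F ^ 0x01 = 0x0E (three bits set), so PF is clear and SETNP returns 1,
// matching the odd parity of 0x0F01 (five bits set).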
31074 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31075 const X86Subtarget &Subtarget) {
31076 unsigned NewOpc = 0;
31077 switch (N->getOpcode()) {
31078 case ISD::ATOMIC_LOAD_ADD:
31079 NewOpc = X86ISD::LADD;
31080 break;
31081 case ISD::ATOMIC_LOAD_SUB:
31082 NewOpc = X86ISD::LSUB;
31083 break;
31084 case ISD::ATOMIC_LOAD_OR:
31085 NewOpc = X86ISD::LOR;
31086 break;
31087 case ISD::ATOMIC_LOAD_XOR:
31088 NewOpc = X86ISD::LXOR;
31089 break;
31090 case ISD::ATOMIC_LOAD_AND:
31091 NewOpc = X86ISD::LAND;
31092 break;
31093 default:
31094 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31097 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31099 return DAG.getMemIntrinsicNode(
31100 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31101 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31102 /*MemVT=*/N->getSimpleValueType(0), MMO);
31105 /// Lower atomic_load_ops into LOCK-prefixed operations.
31106 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31107 const X86Subtarget &Subtarget) {
31108 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31109 SDValue Chain = N->getOperand(0);
31110 SDValue LHS = N->getOperand(1);
31111 SDValue RHS = N->getOperand(2);
31112 unsigned Opc = N->getOpcode();
31113 MVT VT = N->getSimpleValueType(0);
31114 SDLoc DL(N);
31116 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31117 // can only be lowered when the result is unused. They should have already
31118 // been transformed into a cmpxchg loop in AtomicExpand.
31119 if (N->hasAnyUseOfValue(0)) {
31120 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31121 // select LXADD if LOCK_SUB can't be selected.
31122 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31123 // can use LXADD as opposed to cmpxchg.
31124 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31125 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
31126 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31127 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
31128 AN->getMemOperand());
31130 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31131 "Used AtomicRMW ops other than Add should have been expanded!");
31132 return N;
31135 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31136 // The core idea here is that since the memory location isn't actually
31137 // changing, all we need is a lowering for the *ordering* impacts of the
31138 // atomicrmw. As such, we can choose a different operation and memory
31139 // location to minimize impact on other code.
31140 // The above holds unless the node is marked volatile in which
31141 // case it needs to be preserved according to the langref.
31142 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31143 // On X86, the only ordering which actually requires an instruction is
31144 // seq_cst that isn't SingleThread; everything else just needs to be preserved
31145 // during codegen and then dropped. Note that we expect (but don't assume)
31146 // that orderings other than seq_cst and acq_rel have been canonicalized to
31147 // a store or load.
31148 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31149 AN->getSyncScopeID() == SyncScope::System) {
31150 // Prefer a locked operation against a stack location to minimize cache
31151 // traffic. This assumes that stack locations are very likely to be
31152 // accessed only by the owning thread.
31153 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31154 assert(!N->hasAnyUseOfValue(0));
31155 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31156 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31157 DAG.getUNDEF(VT), NewChain);
31159 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31160 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31161 assert(!N->hasAnyUseOfValue(0));
31162 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31163 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31164 DAG.getUNDEF(VT), NewChain);
31167 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31168 // RAUW the chain, but don't worry about the result, as it's unused.
31169 assert(!N->hasAnyUseOfValue(0));
31170 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31171 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31172 DAG.getUNDEF(VT), LockOp.getValue(1));
31175 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31176 const X86Subtarget &Subtarget) {
31177 auto *Node = cast<AtomicSDNode>(Op.getNode());
31178 SDLoc dl(Node);
31179 EVT VT = Node->getMemoryVT();
31181 bool IsSeqCst =
31182 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31183 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31185 // If this store is not sequentially consistent and the type is legal
31186 // we can just keep it.
31187 if (!IsSeqCst && IsTypeLegal)
31188 return Op;
31190 if (VT == MVT::i64 && !IsTypeLegal) {
31191 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31192 // is enabled.
31193 bool NoImplicitFloatOps =
31194 DAG.getMachineFunction().getFunction().hasFnAttribute(
31195 Attribute::NoImplicitFloat);
31196 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31197 SDValue Chain;
31198 if (Subtarget.hasSSE1()) {
31199 SDValue SclToVec =
31200 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31201 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31202 SclToVec = DAG.getBitcast(StVT, SclToVec);
31203 SDVTList Tys = DAG.getVTList(MVT::Other);
31204 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31205 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31206 MVT::i64, Node->getMemOperand());
31207 } else if (Subtarget.hasX87()) {
31208 // First load this into an 80-bit X87 register using a stack temporary.
31209 // This will put the whole integer into the significand.
31210 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31211 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31212 MachinePointerInfo MPI =
31213 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31214 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31215 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31216 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31217 SDValue LdOps[] = {Chain, StackPtr};
31218 SDValue Value = DAG.getMemIntrinsicNode(
31219 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31220 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31221 Chain = Value.getValue(1);
31223 // Now use an FIST to do the atomic store.
31224 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31225 Chain =
31226 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31227 StoreOps, MVT::i64, Node->getMemOperand());
31230 if (Chain) {
31231 // If this is a sequentially consistent store, also emit an appropriate
31232 // barrier.
31233 if (IsSeqCst)
31234 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31236 return Chain;
31241 // Convert seq_cst store -> xchg
31242 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31243 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31244 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31245 Node->getOperand(0), Node->getOperand(2),
31246 Node->getOperand(1), Node->getMemOperand());
31247 return Swap.getValue(1);
31250 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31251 SDNode *N = Op.getNode();
31252 MVT VT = N->getSimpleValueType(0);
31253 unsigned Opc = Op.getOpcode();
31255 // Let legalize expand this if it isn't a legal type yet.
31256 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31257 return SDValue();
31259 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31260 SDLoc DL(N);
31262 // Set the carry flag.
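// Adding all-ones (-1) to the incoming carry value sets EFLAGS.CF exactly
// when that value is nonzero, so the ADC/SBB below sees the right carry-in.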
31263 SDValue Carry = Op.getOperand(2);
31264 EVT CarryVT = Carry.getValueType();
31265 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31266 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31268 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31269 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31270 Op.getOperand(0), Op.getOperand(1),
31271 Carry.getValue(1));
31273 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31274 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31275 Sum.getValue(1), DL, DAG);
31276 if (N->getValueType(1) == MVT::i1)
31277 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31279 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31282 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31283 SelectionDAG &DAG) {
31284 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31286 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31287 // which returns the values as { float, float } (in XMM0) or
31288 // { double, double } (which is returned in XMM0, XMM1).
31289 SDLoc dl(Op);
31290 SDValue Arg = Op.getOperand(0);
31291 EVT ArgVT = Arg.getValueType();
31292 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31294 TargetLowering::ArgListTy Args;
31295 TargetLowering::ArgListEntry Entry;
31297 Entry.Node = Arg;
31298 Entry.Ty = ArgTy;
31299 Entry.IsSExt = false;
31300 Entry.IsZExt = false;
31301 Args.push_back(Entry);
31303 bool isF64 = ArgVT == MVT::f64;
31304 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31305 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31306 // the results are returned via SRet in memory.
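// Modeling the f32 return as a <4 x float> vector (see RetTy below) lets both
// results come back in xmm0, where they are extracted from lanes 0 and 1
// after the call.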
31307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31308 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31309 const char *LibcallName = TLI.getLibcallName(LC);
31310 SDValue Callee =
31311 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31313 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31314 : (Type *)FixedVectorType::get(ArgTy, 4);
31316 TargetLowering::CallLoweringInfo CLI(DAG);
31317 CLI.setDebugLoc(dl)
31318 .setChain(DAG.getEntryNode())
31319 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31321 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31323 if (isF64)
31324 // Returned in xmm0 and xmm1.
31325 return CallResult.first;
31327 // Returned in bits 0:31 and 32:63 of xmm0.
31328 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31329 CallResult.first, DAG.getIntPtrConstant(0, dl));
31330 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31331 CallResult.first, DAG.getIntPtrConstant(1, dl));
31332 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31333 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31336 /// Widen a vector input to a vector of NVT. The
31337 /// input vector must have the same element type as NVT.
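/// For example, widening a v4i32 value to v16i32 yields
/// (insert_subvector (undef|zero v16i32), v4i32 X, 0), with the fill value
/// chosen by \p FillWithZeroes.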
31338 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31339 bool FillWithZeroes = false) {
31340 // Check if InOp already has the right width.
31341 MVT InVT = InOp.getSimpleValueType();
31342 if (InVT == NVT)
31343 return InOp;
31345 if (InOp.isUndef())
31346 return DAG.getUNDEF(NVT);
31348 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31349 "input and widen element type must match");
31351 unsigned InNumElts = InVT.getVectorNumElements();
31352 unsigned WidenNumElts = NVT.getVectorNumElements();
31353 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31354 "Unexpected request for vector widening");
31356 SDLoc dl(InOp);
31357 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31358 InOp.getNumOperands() == 2) {
31359 SDValue N1 = InOp.getOperand(1);
31360 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31361 N1.isUndef()) {
31362 InOp = InOp.getOperand(0);
31363 InVT = InOp.getSimpleValueType();
31364 InNumElts = InVT.getVectorNumElements();
31367 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31368 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31369 SmallVector<SDValue, 16> Ops;
31370 for (unsigned i = 0; i < InNumElts; ++i)
31371 Ops.push_back(InOp.getOperand(i));
31373 EVT EltVT = InOp.getOperand(0).getValueType();
31375 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31376 DAG.getUNDEF(EltVT);
31377 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31378 Ops.push_back(FillVal);
31379 return DAG.getBuildVector(NVT, dl, Ops);
31381 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31382 DAG.getUNDEF(NVT);
31383 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31384 InOp, DAG.getIntPtrConstant(0, dl));
31387 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31388 SelectionDAG &DAG) {
31389 assert(Subtarget.hasAVX512() &&
31390 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31392 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31393 SDValue Src = N->getValue();
31394 MVT VT = Src.getSimpleValueType();
31395 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31396 SDLoc dl(Op);
31398 SDValue Scale = N->getScale();
31399 SDValue Index = N->getIndex();
31400 SDValue Mask = N->getMask();
31401 SDValue Chain = N->getChain();
31402 SDValue BasePtr = N->getBasePtr();
31404 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31405 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31406 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31407 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31408 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31409 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31410 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31411 SDVTList VTs = DAG.getVTList(MVT::Other);
31412 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31413 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31414 N->getMemoryVT(), N->getMemOperand());
31416 return SDValue();
31419 MVT IndexVT = Index.getSimpleValueType();
31421 // If the index is v2i32, we're being called by type legalization and we
31422 // should just let the default handling take care of it.
31423 if (IndexVT == MVT::v2i32)
31424 return SDValue();
31426 // If we don't have VLX and neither the data nor the index is 512 bits, we
31427 // need to widen until one is.
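// For example, a v4f32 scatter with a v4i64 index is widened by a factor of 2
// to v8f32 data with a 512-bit v8i64 index; the extra mask lanes are filled
// with zero so the new elements are never stored.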
31428 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31429 !Index.getSimpleValueType().is512BitVector()) {
31430 // Determine how much we need to widen by to get a 512-bit type.
31431 unsigned Factor = std::min(512/VT.getSizeInBits(),
31432 512/IndexVT.getSizeInBits());
31433 unsigned NumElts = VT.getVectorNumElements() * Factor;
31435 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31436 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31437 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31439 Src = ExtendToType(Src, VT, DAG);
31440 Index = ExtendToType(Index, IndexVT, DAG);
31441 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31444 SDVTList VTs = DAG.getVTList(MVT::Other);
31445 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31446 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31447 N->getMemoryVT(), N->getMemOperand());
31450 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31451 SelectionDAG &DAG) {
31453 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31454 MVT VT = Op.getSimpleValueType();
31455 MVT ScalarVT = VT.getScalarType();
31456 SDValue Mask = N->getMask();
31457 MVT MaskVT = Mask.getSimpleValueType();
31458 SDValue PassThru = N->getPassThru();
31459 SDLoc dl(Op);
31461 // Handle AVX masked loads which don't support passthru other than 0.
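// The AVX VMASKMOV/VPMASKMOV forms zero the masked-off lanes, so a nonzero
// passthru value has to be blended back in with a VSELECT after the load.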
31462 if (MaskVT.getVectorElementType() != MVT::i1) {
31463 // We also allow undef in the isel pattern.
31464 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31465 return Op;
31467 SDValue NewLoad = DAG.getMaskedLoad(
31468 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31469 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31470 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31471 N->isExpandingLoad());
31472 // Emit a blend.
31473 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31474 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31477 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31478 "Expanding masked load is supported on AVX-512 target only!");
31480 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31481 "Expanding masked load is supported for 32 and 64-bit types only!");
31483 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31484 "Cannot lower masked load op.");
31486 assert((ScalarVT.getSizeInBits() >= 32 ||
31487 (Subtarget.hasBWI() &&
31488 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31489 "Unsupported masked load op.");
31491 // This operation is legal for targets with VLX, but without
31492 // VLX the vector should be widened to 512 bits.
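// For example, a v8i32 masked load without VLX becomes a v16i32 load with a
// v16i1 mask whose upper eight bits are zero; the original v8i32 value is
// re-extracted from the wide result below.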
31493 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31494 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31495 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31497 // Mask element has to be i1.
31498 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31499 "Unexpected mask type");
31501 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31503 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31504 SDValue NewLoad = DAG.getMaskedLoad(
31505 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31506 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31507 N->getExtensionType(), N->isExpandingLoad());
31509 SDValue Extract =
31510 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31511 DAG.getIntPtrConstant(0, dl));
31512 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31513 return DAG.getMergeValues(RetOps, dl);
31516 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31517 SelectionDAG &DAG) {
31518 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31519 SDValue DataToStore = N->getValue();
31520 MVT VT = DataToStore.getSimpleValueType();
31521 MVT ScalarVT = VT.getScalarType();
31522 SDValue Mask = N->getMask();
31523 SDLoc dl(Op);
31525 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31526 "Expanding masked load is supported on AVX-512 target only!");
31528 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31529 "Expanding masked load is supported for 32 and 64-bit types only!");
31531 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31532 "Cannot lower masked store op.");
31534 assert((ScalarVT.getSizeInBits() >= 32 ||
31535 (Subtarget.hasBWI() &&
31536 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31537 "Unsupported masked store op.");
31539 // This operation is legal for targets with VLX, but without
31540 // VLX the vector should be widened to 512 bits.
31541 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31542 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31544 // Mask element has to be i1.
31545 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31546 "Unexpected mask type");
31548 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31550 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31551 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31552 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31553 N->getOffset(), Mask, N->getMemoryVT(),
31554 N->getMemOperand(), N->getAddressingMode(),
31555 N->isTruncatingStore(), N->isCompressingStore());
31558 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31559 SelectionDAG &DAG) {
31560 assert(Subtarget.hasAVX2() &&
31561 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31563 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31564 SDLoc dl(Op);
31565 MVT VT = Op.getSimpleValueType();
31566 SDValue Index = N->getIndex();
31567 SDValue Mask = N->getMask();
31568 SDValue PassThru = N->getPassThru();
31569 MVT IndexVT = Index.getSimpleValueType();
31571 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31573 // If the index is v2i32, we're being called by type legalization.
31574 if (IndexVT == MVT::v2i32)
31575 return SDValue();
31577 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
31578 // need to widen until one is.
31579 MVT OrigVT = VT;
31580 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31581 !IndexVT.is512BitVector()) {
31582 // Determine how much we need to widen by to get a 512-bit type.
31583 unsigned Factor = std::min(512/VT.getSizeInBits(),
31584 512/IndexVT.getSizeInBits());
31586 unsigned NumElts = VT.getVectorNumElements() * Factor;
31588 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31589 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31590 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31592 PassThru = ExtendToType(PassThru, VT, DAG);
31593 Index = ExtendToType(Index, IndexVT, DAG);
31594 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31597 // Break dependency on the data register.
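// Gathers merge into the destination register, so an undef passthru would
// leave a false dependency on whatever previously occupied that register;
// an all-zeros passthru breaks that dependency.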
31598 if (PassThru.isUndef())
31599 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31601 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31602 N->getScale() };
31603 SDValue NewGather = DAG.getMemIntrinsicNode(
31604 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31605 N->getMemOperand());
31606 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31607 NewGather, DAG.getIntPtrConstant(0, dl));
31608 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31611 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31612 SDLoc dl(Op);
31613 SDValue Src = Op.getOperand(0);
31614 MVT DstVT = Op.getSimpleValueType();
31616 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31617 unsigned SrcAS = N->getSrcAddressSpace();
31619 assert(SrcAS != N->getDestAddressSpace() &&
31620 "addrspacecast must be between different address spaces");
31622 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31623 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31624 } else if (DstVT == MVT::i64) {
31625 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31626 } else if (DstVT == MVT::i32) {
31627 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31628 } else {
31629 report_fatal_error("Bad address space in addrspacecast");
31631 return Op;
31634 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31635 SelectionDAG &DAG) const {
31636 // TODO: Eventually, the lowering of these nodes should be informed by or
31637 // deferred to the GC strategy for the function in which they appear. For
31638 // now, however, they must be lowered to something. Since they are logically
31639 // no-ops in the case of a null GC strategy (or a GC strategy which does not
31640 // require special handling for these nodes), lower them as literal NOOPs for
31641 // the time being.
31642 SmallVector<SDValue, 2> Ops;
31643 Ops.push_back(Op.getOperand(0));
31644 if (Op->getGluedNode())
31645 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31647 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31648 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31651 // Custom split CVTPS2PH with wide types.
31652 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31653 SDLoc dl(Op);
31654 EVT VT = Op.getValueType();
31655 SDValue Lo, Hi;
31656 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31657 EVT LoVT, HiVT;
31658 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31659 SDValue RC = Op.getOperand(1);
31660 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31661 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31662 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31665 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
31666 SelectionDAG &DAG) {
31667 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
31669 // We don't support non-data prefetch without PREFETCHI.
31670 // Just preserve the chain.
31671 if (!IsData && !Subtarget.hasPREFETCHI())
31672 return Op.getOperand(0);
31674 return Op;
31677 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
31678 unsigned OpNo) {
31679 const APInt Operand(32, OpNo);
31680 std::string OpNoStr = llvm::toString(Operand, 10, false);
31681 std::string Str(" $");
31683 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
31684 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
31686 auto I = StringRef::npos;
31687 for (auto &AsmStr : AsmStrs) {
31688 // Match the OpNo string. We must match exactly so that we do not match a
31689 // sub-string, e.g. "$12" contains "$1".
31690 if (AsmStr.ends_with(OpNoStr1))
31691 I = AsmStr.size() - OpNoStr1.size();
31693 // Get the index of operand in AsmStr.
31694 if (I == StringRef::npos)
31695 I = AsmStr.find(OpNoStr1 + ",");
31696 if (I == StringRef::npos)
31697 I = AsmStr.find(OpNoStr2);
31699 if (I == StringRef::npos)
31700 continue;
31702 assert(I > 0 && "Unexpected inline asm string!");
31703 // Remove the operand string and label (if it exists).
31704 // For example:
31705 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
31706 // ==>
31707 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
31708 // ==>
31709 // "call dword ptr "
31710 auto TmpStr = AsmStr.substr(0, I);
31711 I = TmpStr.rfind(':');
31712 if (I != StringRef::npos)
31713 TmpStr = TmpStr.substr(I + 1);
31714 return TmpStr.take_while(llvm::isAlpha);
31717 return StringRef();
31720 bool X86TargetLowering::isInlineAsmTargetBranch(
31721 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
31722 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
31723 // changed from indirect TargetLowering::C_Memory to direct
31724 // TargetLowering::C_Address.
31725 // We don't need to special case LOOP* and Jcc, which cannot target a memory
31726 // location.
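// For example, with MS-style inline asm such as '__asm call dword ptr [fn]',
// the asm string for the operand ends in something like "call dword ptr ${0:P}",
// so getInstrStrFromOpNo returns "call" and the operand is treated as a direct
// branch target.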
31727 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
31728 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
31731 /// Provide custom lowering hooks for some operations.
31732 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31733 switch (Op.getOpcode()) {
31734 default: llvm_unreachable("Should not custom lower this!");
31735 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31736 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31737 return LowerCMP_SWAP(Op, Subtarget, DAG);
31738 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
31739 case ISD::ATOMIC_LOAD_ADD:
31740 case ISD::ATOMIC_LOAD_SUB:
31741 case ISD::ATOMIC_LOAD_OR:
31742 case ISD::ATOMIC_LOAD_XOR:
31743 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
31744 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
31745 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
31746 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
31747 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
31748 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31749 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31750 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
31751 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31752 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
31753 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31754 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31755 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31756 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
31757 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
31758 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
31759 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
31760 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
31761 case ISD::SHL_PARTS:
31762 case ISD::SRA_PARTS:
31763 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
31764 case ISD::FSHL:
31765 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
31766 case ISD::STRICT_SINT_TO_FP:
31767 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31768 case ISD::STRICT_UINT_TO_FP:
31769 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31770 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31771 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31772 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31773 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31774 case ISD::ZERO_EXTEND_VECTOR_INREG:
31775 case ISD::SIGN_EXTEND_VECTOR_INREG:
31776 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31777 case ISD::FP_TO_SINT:
31778 case ISD::STRICT_FP_TO_SINT:
31779 case ISD::FP_TO_UINT:
31780 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31781 case ISD::FP_TO_SINT_SAT:
31782 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31783 case ISD::FP_EXTEND:
31784 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31785 case ISD::FP_ROUND:
31786 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31787 case ISD::FP16_TO_FP:
31788 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31789 case ISD::FP_TO_FP16:
31790 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31791 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
31792 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31793 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31794 case ISD::FADD:
31795 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31796 case ISD::FROUND: return LowerFROUND(Op, DAG);
31797 case ISD::FABS:
31798 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31799 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31800 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31801 case ISD::LRINT:
31802 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31803 case ISD::SETCC:
31804 case ISD::STRICT_FSETCC:
31805 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31806 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31807 case ISD::SELECT: return LowerSELECT(Op, DAG);
31808 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31809 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31810 case ISD::VASTART: return LowerVASTART(Op, DAG);
31811 case ISD::VAARG: return LowerVAARG(Op, DAG);
31812 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31813 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31814 case ISD::INTRINSIC_VOID:
31815 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31816 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31817 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31818 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31819 case ISD::FRAME_TO_ARGS_OFFSET:
31820 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31821 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31822 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31823 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31824 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31825 case ISD::EH_SJLJ_SETUP_DISPATCH:
31826 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31827 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31828 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31829 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
31830 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31831 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
31832 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
31833 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
31834 case ISD::CTLZ:
31835 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31836 case ISD::CTTZ:
31837 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31838 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31839 case ISD::MULHS:
31840 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31841 case ISD::ROTL:
31842 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31843 case ISD::SRA:
31844 case ISD::SRL:
31845 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31846 case ISD::SADDO:
31847 case ISD::UADDO:
31848 case ISD::SSUBO:
31849 case ISD::USUBO: return LowerXALUO(Op, DAG);
31850 case ISD::SMULO:
31851 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31852 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31853 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31854 case ISD::SADDO_CARRY:
31855 case ISD::SSUBO_CARRY:
31856 case ISD::UADDO_CARRY:
31857 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
31858 case ISD::ADD:
31859 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31860 case ISD::UADDSAT:
31861 case ISD::SADDSAT:
31862 case ISD::USUBSAT:
31863 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31864 case ISD::SMAX:
31865 case ISD::SMIN:
31866 case ISD::UMAX:
31867 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
31868 case ISD::FMINIMUM:
31869 case ISD::FMAXIMUM:
31870 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
31871 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31872 case ISD::ABDS:
31873 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
31874 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
31875 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31876 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31877 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31878 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31879 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31880 case ISD::GC_TRANSITION_START:
31881 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31882 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31883 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31884 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
31888 /// Replace a node with an illegal result type with a new node built out of
31889 /// custom code.
31890 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31891 SmallVectorImpl<SDValue>&Results,
31892 SelectionDAG &DAG) const {
31893 SDLoc dl(N);
31894 switch (N->getOpcode()) {
31895 default:
31896 #ifndef NDEBUG
31897 dbgs() << "ReplaceNodeResults: ";
31898 N->dump(&DAG);
31899 #endif
31900 llvm_unreachable("Do not know how to custom type legalize this operation!");
31901 case X86ISD::CVTPH2PS: {
31902 EVT VT = N->getValueType(0);
31903 SDValue Lo, Hi;
31904 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31905 EVT LoVT, HiVT;
31906 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31907 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31908 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31909 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31910 Results.push_back(Res);
31911 return;
31913 case X86ISD::STRICT_CVTPH2PS: {
31914 EVT VT = N->getValueType(0);
31915 SDValue Lo, Hi;
31916 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31917 EVT LoVT, HiVT;
31918 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31919 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31920 {N->getOperand(0), Lo});
31921 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31922 {N->getOperand(0), Hi});
31923 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31924 Lo.getValue(1), Hi.getValue(1));
31925 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31926 Results.push_back(Res);
31927 Results.push_back(Chain);
31928 return;
31930 case X86ISD::CVTPS2PH:
31931 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
31932 return;
31933 case ISD::CTPOP: {
31934 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31935 // Use a v2i64 if possible.
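// Moving the i64 into an XMM register and reusing the vector CTPOP lowering
// avoids splitting the count into two 32-bit scalar popcount expansions on
// 32-bit targets, where i64 is illegal.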
31936 bool NoImplicitFloatOps =
31937 DAG.getMachineFunction().getFunction().hasFnAttribute(
31938 Attribute::NoImplicitFloat);
31939 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
31940 SDValue Wide =
31941 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
31942 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
31943 // Bit count should fit in 32 bits; extract it as an i32 and then zero
31944 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
31945 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
31946 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
31947 DAG.getIntPtrConstant(0, dl));
31948 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
31949 Results.push_back(Wide);
31951 return;
31953 case ISD::MUL: {
31954 EVT VT = N->getValueType(0);
31955 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31956 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
31957 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
31958 // elements are needed.
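// For example, a v2i8 multiply is performed as v2i16, truncated back to v2i8,
// and then padded out to v16i8 with undef so the result type is legal.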
31959 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
31960 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
31961 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
31962 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
31963 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31964 unsigned NumConcats = 16 / VT.getVectorNumElements();
31965 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31966 ConcatOps[0] = Res;
31967 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
31968 Results.push_back(Res);
31969 return;
31971 case ISD::SMULO:
31972 case ISD::UMULO: {
31973 EVT VT = N->getValueType(0);
31974 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31975 VT == MVT::v2i32 && "Unexpected VT!");
31976 bool IsSigned = N->getOpcode() == ISD::SMULO;
31977 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31978 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
31979 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
31980 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
31981 // Extract the high 32 bits from each result using PSHUFD.
31982 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
31983 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
31984 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
31985 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
31986 DAG.getIntPtrConstant(0, dl));
31988 // Truncate the low bits of the result. This will become PSHUFD.
31989 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31991 SDValue HiCmp;
31992 if (IsSigned) {
31993 // SMULO overflows if the high bits don't match the sign of the low.
31994 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
31995 } else {
31996 // UMULO overflows if the high bits are non-zero.
31997 HiCmp = DAG.getConstant(0, dl, VT);
31999 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32001 // Widen the result by padding with undef.
32002 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32003 DAG.getUNDEF(VT));
32004 Results.push_back(Res);
32005 Results.push_back(Ovf);
32006 return;
32008 case X86ISD::VPMADDWD: {
32009 // Legalize types for X86ISD::VPMADDWD by widening.
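// For example, a v2i32 VPMADDWD with v4i16 inputs is widened to a v4i32
// result with v8i16 inputs by padding the unused input lanes with undef.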
32010 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32012 EVT VT = N->getValueType(0);
32013 EVT InVT = N->getOperand(0).getValueType();
32014 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32015 "Expected a VT that divides into 128 bits.");
32016 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32017 "Unexpected type action!");
32018 unsigned NumConcat = 128 / InVT.getSizeInBits();
32020 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32021 InVT.getVectorElementType(),
32022 NumConcat * InVT.getVectorNumElements());
32023 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32024 VT.getVectorElementType(),
32025 NumConcat * VT.getVectorNumElements());
32027 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32028 Ops[0] = N->getOperand(0);
32029 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32030 Ops[0] = N->getOperand(1);
32031 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32033 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32034 Results.push_back(Res);
32035 return;
32037 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32038 case X86ISD::FMINC:
32039 case X86ISD::FMIN:
32040 case X86ISD::FMAXC:
32041 case X86ISD::FMAX: {
32042 EVT VT = N->getValueType(0);
32043 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32044 SDValue UNDEF = DAG.getUNDEF(VT);
32045 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32046 N->getOperand(0), UNDEF);
32047 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32048 N->getOperand(1), UNDEF);
32049 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32050 return;
32052 case ISD::SDIV:
32053 case ISD::UDIV:
32054 case ISD::SREM:
32055 case ISD::UREM: {
32056 EVT VT = N->getValueType(0);
32057 if (VT.isVector()) {
32058 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32059 "Unexpected type action!");
32060 // If the RHS is a constant splat vector we can widen this and let
32061 // division/remainder by constant optimize it.
32062 // TODO: Can we do something for non-splat?
32063 APInt SplatVal;
32064 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32065 unsigned NumConcats = 128 / VT.getSizeInBits();
32066 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32067 Ops0[0] = N->getOperand(0);
32068 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32069 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32070 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32071 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32072 Results.push_back(Res);
32074 return;
32077 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32078 Results.push_back(V);
32079 return;
32081 case ISD::TRUNCATE: {
32082 MVT VT = N->getSimpleValueType(0);
32083 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32084 return;
32086 // The generic legalizer will try to widen the input type to the same
32087 // number of elements as the widened result type. But this isn't always
32088 // the best thing so do some custom legalization to avoid some cases.
32089 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32090 SDValue In = N->getOperand(0);
32091 EVT InVT = In.getValueType();
32092 EVT InEltVT = InVT.getVectorElementType();
32093 EVT EltVT = VT.getVectorElementType();
32094 unsigned MinElts = VT.getVectorNumElements();
32095 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32096 unsigned InBits = InVT.getSizeInBits();
32098 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
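// For example, a v8i32->v8i16 truncate whose source is already sign- or
// zero-extended from 16 bits can be emitted as a single PACKSSDW/PACKUSDW.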
32099 unsigned PackOpcode;
32100 if (SDValue Src =
32101 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32102 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32103 dl, DAG, Subtarget)) {
32104 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32105 Results.push_back(Res);
32106 return;
32110 if (128 % InBits == 0) {
32111 // 128-bit and smaller inputs should avoid the truncate altogether and
32112 // just use a build_vector that will become a shuffle.
32113 // TODO: Widen and use a shuffle directly?
32114 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32115 // Use the original element count so we don't do more scalar opts than
32116 // necessary.
32117 for (unsigned i=0; i < MinElts; ++i) {
32118 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32119 DAG.getIntPtrConstant(i, dl));
32120 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32122 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32123 return;
32126 // With AVX512 there are some cases that can use a target specific
32127 // truncate node to go from 256/512 to less than 128 with zeros in the
32128 // upper elements of the 128 bit result.
32129 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32130 // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
32131 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32132 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32133 return;
32135 // There's one case we can widen to 512 bits and use VTRUNC.
32136 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32137 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32138 DAG.getUNDEF(MVT::v4i64));
32139 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32140 return;
32143 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32144 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32145 isTypeLegal(MVT::v4i64)) {
32146 // Input needs to be split and output needs to be widened. Let's use two
32147 // VTRUNCs, and shuffle their results together into the wider type.
32148 SDValue Lo, Hi;
32149 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32151 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32152 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32153 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32154 { 0, 1, 2, 3, 16, 17, 18, 19,
32155 -1, -1, -1, -1, -1, -1, -1, -1 });
32156 Results.push_back(Res);
32157 return;
32160 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32161 // this via type legalization.
32162 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32163 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32164 (!Subtarget.hasSSSE3() ||
32165 (!isTypeLegal(InVT) &&
32166 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32167 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32168 InEltVT.getSizeInBits() * WidenNumElts);
32169 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32170 return;
32173 return;
32175 case ISD::ANY_EXTEND:
32176 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32177 // It's intended to custom handle the input type.
32178 assert(N->getValueType(0) == MVT::v8i8 &&
32179 "Do not know how to legalize this Node");
32180 return;
32181 case ISD::SIGN_EXTEND:
32182 case ISD::ZERO_EXTEND: {
32183 EVT VT = N->getValueType(0);
32184 SDValue In = N->getOperand(0);
32185 EVT InVT = In.getValueType();
32186 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32187 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32188 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32189 "Unexpected type action!");
32190 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32191 // Custom split this so we can extend i8/i16->i32 invec. This is better
32192 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32193 // sra, and then an extend from i32 to i64 using pcmpgt. By custom splitting
32194 // we allow the sra from the extend to i32 to be shared by the split.
32195 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32197 // Fill a vector with sign bits for each element.
32198 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32199 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32201 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32202 // to v2i64.
32203 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32204 {0, 4, 1, 5});
32205 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32206 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32207 {2, 6, 3, 7});
32208 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32210 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32211 Results.push_back(Res);
32212 return;
32215 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32216 if (!InVT.is128BitVector()) {
32217 // Not a 128 bit vector, but maybe type legalization will promote
32218 // it to 128 bits.
32219 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32220 return;
32221 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32222 if (!InVT.is128BitVector())
32223 return;
32225 // Promote the input to 128 bits. Type legalization will turn this into
32226 // zext_inreg/sext_inreg.
32227 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32230 // Perform custom splitting instead of the two stage extend we would get
32231 // by default.
32232 EVT LoVT, HiVT;
32233 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32234 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32236 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32238 // We need to shift the input over by half the number of elements.
32239 unsigned NumElts = InVT.getVectorNumElements();
32240 unsigned HalfNumElts = NumElts / 2;
32241 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32242 for (unsigned i = 0; i != HalfNumElts; ++i)
32243 ShufMask[i] = i + HalfNumElts;
32245 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32246 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32248 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32249 Results.push_back(Res);
32251 return;
32253 case ISD::FP_TO_SINT:
32254 case ISD::STRICT_FP_TO_SINT:
32255 case ISD::FP_TO_UINT:
32256 case ISD::STRICT_FP_TO_UINT: {
32257 bool IsStrict = N->isStrictFPOpcode();
32258 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32259 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32260 EVT VT = N->getValueType(0);
32261 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32262 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32263 EVT SrcVT = Src.getValueType();
32265 SDValue Res;
32266 if (isSoftF16(SrcVT, Subtarget)) {
32267 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32268 if (IsStrict) {
32269 Res =
32270 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32271 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32272 {NVT, MVT::Other}, {Chain, Src})});
32273 Chain = Res.getValue(1);
32274 } else {
32275 Res = DAG.getNode(N->getOpcode(), dl, VT,
32276 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32278 Results.push_back(Res);
32279 if (IsStrict)
32280 Results.push_back(Chain);
32282 return;
32285 if (VT.isVector() && Subtarget.hasFP16() &&
32286 SrcVT.getVectorElementType() == MVT::f16) {
32287 EVT EleVT = VT.getVectorElementType();
32288 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32290 if (SrcVT != MVT::v8f16) {
32291 SDValue Tmp =
32292 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32293 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32294 Ops[0] = Src;
32295 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32298 if (IsStrict) {
32299 unsigned Opc =
32300 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32301 Res =
32302 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32303 Chain = Res.getValue(1);
32304 } else {
32305 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32306 Res = DAG.getNode(Opc, dl, ResVT, Src);
32309 // TODO: Need to add exception check code for strict FP.
32310 if (EleVT.getSizeInBits() < 16) {
32311 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32312 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32314 // Now widen to 128 bits.
32315 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32316 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32317 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32318 ConcatOps[0] = Res;
32319 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32322 Results.push_back(Res);
32323 if (IsStrict)
32324 Results.push_back(Chain);
32326 return;
32329 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32330 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32331 "Unexpected type action!");
32333 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32334 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32335 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32336 VT.getVectorNumElements());
32337 SDValue Res;
32338 SDValue Chain;
32339 if (IsStrict) {
32340 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32341 {N->getOperand(0), Src});
32342 Chain = Res.getValue(1);
32343 } else
32344 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32346 // Preserve what we know about the size of the original result. If the
32347 // result is v2i32, we have to manually widen the assert.
32348 if (PromoteVT == MVT::v2i32)
32349 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32350 DAG.getUNDEF(MVT::v2i32));
32352 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32353 Res.getValueType(), Res,
32354 DAG.getValueType(VT.getVectorElementType()));
32356 if (PromoteVT == MVT::v2i32)
32357 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32358 DAG.getIntPtrConstant(0, dl));
32360 // Truncate back to the original width.
32361 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32363 // Now widen to 128 bits.
32364 unsigned NumConcats = 128 / VT.getSizeInBits();
32365 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32366 VT.getVectorNumElements() * NumConcats);
32367 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32368 ConcatOps[0] = Res;
32369 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32370 Results.push_back(Res);
32371 if (IsStrict)
32372 Results.push_back(Chain);
32373 return;
32377 if (VT == MVT::v2i32) {
32378 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32379 "Strict unsigned conversion requires AVX512");
32380 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32381 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32382 "Unexpected type action!");
32383 if (Src.getValueType() == MVT::v2f64) {
32384 if (!IsSigned && !Subtarget.hasAVX512()) {
32385 SDValue Res =
32386 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32387 Results.push_back(Res);
32388 return;
32391 unsigned Opc;
32392 if (IsStrict)
32393 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32394 else
32395 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32397 // If we have VLX we can emit a target specific FP_TO_UINT node.
32398 if (!IsSigned && !Subtarget.hasVLX()) {
32399 // Otherwise we can defer to the generic legalizer which will widen
32400 // the input as well. This will be further widened during op
32401 // legalization to v8i32<-v8f64.
32402 // For strict nodes we'll need to widen ourselves.
32403 // FIXME: Fix the type legalizer to safely widen strict nodes?
32404 if (!IsStrict)
32405 return;
32406 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32407 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32408 Opc = N->getOpcode();
32410 SDValue Res;
32411 SDValue Chain;
32412 if (IsStrict) {
32413 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32414 {N->getOperand(0), Src});
32415 Chain = Res.getValue(1);
32416 } else {
32417 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32419 Results.push_back(Res);
32420 if (IsStrict)
32421 Results.push_back(Chain);
32422 return;
32425 // Custom widen strict v2f32->v2i32 by padding with zeros.
32426 // FIXME: Should generic type legalizer do this?
32427 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32428 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32429 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32430 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32431 {N->getOperand(0), Src});
32432 Results.push_back(Res);
32433 Results.push_back(Res.getValue(1));
32434 return;
32437 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32438 // so early out here.
32439 return;
32442 assert(!VT.isVector() && "Vectors should have been handled above!");
32444 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32445 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32446 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32447 assert(!Subtarget.is64Bit() && "i64 should be legal");
32448 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32449 // If we use a 128-bit result we might need to use a target specific node.
32450 unsigned SrcElts =
32451 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32452 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32453 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32454 unsigned Opc = N->getOpcode();
32455 if (NumElts != SrcElts) {
32456 if (IsStrict)
32457 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32458 else
32459 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32462 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32463 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32464 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32465 ZeroIdx);
32466 SDValue Chain;
32467 if (IsStrict) {
32468 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32469 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32470 Chain = Res.getValue(1);
32471 } else
32472 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32473 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32474 Results.push_back(Res);
32475 if (IsStrict)
32476 Results.push_back(Chain);
32477 return;
32480 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32481 SDValue Chain;
32482 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32483 Results.push_back(V);
32484 if (IsStrict)
32485 Results.push_back(Chain);
32486 return;
32489 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32490 Results.push_back(V);
32491 if (IsStrict)
32492 Results.push_back(Chain);
32494 return;
32496 case ISD::LRINT:
32497 case ISD::LLRINT: {
32498 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32499 Results.push_back(V);
32500 return;
32503 case ISD::SINT_TO_FP:
32504 case ISD::STRICT_SINT_TO_FP:
32505 case ISD::UINT_TO_FP:
32506 case ISD::STRICT_UINT_TO_FP: {
32507 bool IsStrict = N->isStrictFPOpcode();
32508 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32509 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32510 EVT VT = N->getValueType(0);
32511 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32512 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32513 Subtarget.hasVLX()) {
32514 if (Src.getValueType().getVectorElementType() == MVT::i16)
32515 return;
32517 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32518 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32519 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32520 : DAG.getUNDEF(MVT::v2i32));
32521 if (IsStrict) {
32522 unsigned Opc =
32523 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32524 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32525 {N->getOperand(0), Src});
32526 Results.push_back(Res);
32527 Results.push_back(Res.getValue(1));
32528 } else {
32529 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32530 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32532 return;
32534 if (VT != MVT::v2f32)
32535 return;
32536 EVT SrcVT = Src.getValueType();
32537 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32538 if (IsStrict) {
32539 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32540 : X86ISD::STRICT_CVTUI2P;
32541 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32542 {N->getOperand(0), Src});
32543 Results.push_back(Res);
32544 Results.push_back(Res.getValue(1));
32545 } else {
32546 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32547 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32549 return;
32551 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32552 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32553 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32554 SDValue One = DAG.getConstant(1, dl, SrcVT);
32555 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32556 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32557 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32558 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32559 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32560 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32561 for (int i = 0; i != 2; ++i) {
32562 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32563 SignSrc, DAG.getIntPtrConstant(i, dl));
32564 if (IsStrict)
32565 SignCvts[i] =
32566 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32567 {N->getOperand(0), Elt});
32568 else
32569 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32571 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32572 SDValue Slow, Chain;
32573 if (IsStrict) {
32574 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32575 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32576 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32577 {Chain, SignCvt, SignCvt});
32578 Chain = Slow.getValue(1);
32579 } else {
32580 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32582 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32583 IsNeg =
32584 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32585 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32586 Results.push_back(Cvt);
32587 if (IsStrict)
32588 Results.push_back(Chain);
32589 return;
32592 if (SrcVT != MVT::v2i32)
32593 return;
32595 if (IsSigned || Subtarget.hasAVX512()) {
32596 if (!IsStrict)
32597 return;
32599 // Custom widen strict v2i32->v2f32 to avoid scalarization.
32600 // FIXME: Should generic type legalizer do this?
32601 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32602 DAG.getConstant(0, dl, MVT::v2i32));
32603 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32604 {N->getOperand(0), Src});
32605 Results.push_back(Res);
32606 Results.push_back(Res.getValue(1));
32607 return;
32610 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32611 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32612 SDValue VBias = DAG.getConstantFP(
32613 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
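// 0x4330000000000000 is the double 2^52. OR-ing the zero-extended 32-bit
// value into its mantissa produces exactly 2^52 + x, so the FSUB below
// recovers x as a double without any rounding.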
32614 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32615 DAG.getBitcast(MVT::v2i64, VBias));
32616 Or = DAG.getBitcast(MVT::v2f64, Or);
32617 if (IsStrict) {
32618 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32619 {N->getOperand(0), Or, VBias});
32620 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32621 {MVT::v4f32, MVT::Other},
32622 {Sub.getValue(1), Sub});
32623 Results.push_back(Res);
32624 Results.push_back(Res.getValue(1));
32625 } else {
32626 // TODO: Are there any fast-math-flags to propagate here?
32627 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32628 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32630 return;
32632 case ISD::STRICT_FP_ROUND:
32633 case ISD::FP_ROUND: {
32634 bool IsStrict = N->isStrictFPOpcode();
32635 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32636 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32637 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
32638 EVT SrcVT = Src.getValueType();
32639 EVT VT = N->getValueType(0);
32640 SDValue V;
32641 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32642 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32643 : DAG.getUNDEF(MVT::v2f32);
32644 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32646 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
32647 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
32648 if (SrcVT.getVectorElementType() != MVT::f32)
32649 return;
32651 if (IsStrict)
32652 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
32653 {Chain, Src, Rnd});
32654 else
32655 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
32657 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
32658 if (IsStrict)
32659 Results.push_back(V.getValue(1));
32660 return;
32662 if (!isTypeLegal(Src.getValueType()))
32663 return;
32664 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32665 if (IsStrict)
32666 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32667 {Chain, Src});
32668 else
32669 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32670 Results.push_back(V);
32671 if (IsStrict)
32672 Results.push_back(V.getValue(1));
32673 return;
32675 case ISD::FP_EXTEND:
32676 case ISD::STRICT_FP_EXTEND: {
32677 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32678 // No other ValueType for FP_EXTEND should reach this point.
32679 assert(N->getValueType(0) == MVT::v2f32 &&
32680 "Do not know how to legalize this Node");
32681 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32682 return;
32683 bool IsStrict = N->isStrictFPOpcode();
32684 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32685 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32686 : DAG.getUNDEF(MVT::v2f16);
32687 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32688 if (IsStrict)
32689 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32690 {N->getOperand(0), V});
32691 else
32692 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32693 Results.push_back(V);
32694 if (IsStrict)
32695 Results.push_back(V.getValue(1));
32696 return;
32698 case ISD::INTRINSIC_W_CHAIN: {
32699 unsigned IntNo = N->getConstantOperandVal(1);
32700 switch (IntNo) {
32701 default : llvm_unreachable("Do not know how to custom type "
32702 "legalize this intrinsic operation!");
32703 case Intrinsic::x86_rdtsc:
32704 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32705 Results);
32706 case Intrinsic::x86_rdtscp:
32707 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32708 Results);
32709 case Intrinsic::x86_rdpmc:
32710 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32711 Results);
32712 return;
32713 case Intrinsic::x86_rdpru:
32714 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
32715 Results);
32716 return;
32717 case Intrinsic::x86_xgetbv:
32718 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32719 Results);
32720 return;
32723 case ISD::READCYCLECOUNTER: {
32724 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32726 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32727 EVT T = N->getValueType(0);
32728 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32729 bool Regs64bit = T == MVT::i128;
32730 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
32731 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32732 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
32733 SDValue cpInL, cpInH;
32734 std::tie(cpInL, cpInH) =
32735 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
32736 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32737 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
32738 cpInH =
32739 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
32740 cpInH, cpInL.getValue(1));
32741 SDValue swapInL, swapInH;
32742 std::tie(swapInL, swapInH) =
32743 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
32744 swapInH =
32745 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32746 swapInH, cpInH.getValue(1));
32748 // In 64-bit mode we might need the base pointer in RBX, but we can't know
32749 // until later. So we keep the RBX input in a vreg and use a custom
32750 // inserter.
32751 // Since RBX will be a reserved register, the register allocator will not
32752 // ensure that its value is properly saved and restored around this
32753 // live range.
32754 SDValue Result;
32755 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32756 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32757 if (Regs64bit) {
32758 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32759 swapInH.getValue(1)};
32760 Result =
32761 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32762 } else {
32763 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32764 swapInH.getValue(1));
32765 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32766 swapInL.getValue(1)};
32767 Result =
32768 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32771 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32772 Regs64bit ? X86::RAX : X86::EAX,
32773 HalfT, Result.getValue(1));
32774 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32775 Regs64bit ? X86::RDX : X86::EDX,
32776 HalfT, cpOutL.getValue(2));
32777 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32779 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32780 MVT::i32, cpOutH.getValue(2));
32781 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32782 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32784 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32785 Results.push_back(Success);
32786 Results.push_back(EFLAGS.getValue(1));
32787 return;
32789 case ISD::ATOMIC_LOAD: {
32790 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32791 bool NoImplicitFloatOps =
32792 DAG.getMachineFunction().getFunction().hasFnAttribute(
32793 Attribute::NoImplicitFloat);
32794 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32795 auto *Node = cast<AtomicSDNode>(N);
32796 if (Subtarget.hasSSE1()) {
32797 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
32798 // Then extract the lower 64-bits.
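        // (This lowering relies on a naturally aligned 8-byte SSE load being
        //  performed as a single atomic access on x86, which is what makes a
        //  64-bit atomic load legal on 32-bit targets here.)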
32799 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32800 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32801 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32802 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32803 MVT::i64, Node->getMemOperand());
32804 if (Subtarget.hasSSE2()) {
32805 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32806 DAG.getIntPtrConstant(0, dl));
32807 Results.push_back(Res);
32808 Results.push_back(Ld.getValue(1));
32809 return;
32811 // We use an alternative sequence for SSE1 that extracts as v2f32 and
32812 // then casts to i64. This avoids a 128-bit stack temporary being
32813 // created by type legalization if we were to cast v4f32->v2i64.
32814 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32815 DAG.getIntPtrConstant(0, dl));
32816 Res = DAG.getBitcast(MVT::i64, Res);
32817 Results.push_back(Res);
32818 Results.push_back(Ld.getValue(1));
32819 return;
32821 if (Subtarget.hasX87()) {
32822 // First load this into an 80-bit X87 register. This will put the whole
32823 // integer into the significand.
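        // (An f80 value has a 64-bit significand, so an i64 round-trips
        //  through FILD/FIST without losing precision.)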
32824 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32825 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32826 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32827 dl, Tys, Ops, MVT::i64,
32828 Node->getMemOperand());
32829 SDValue Chain = Result.getValue(1);
32831 // Now store the X87 register to a stack temporary and convert to i64.
32832 // This store is not atomic and doesn't need to be.
32833 // FIXME: We don't need a stack temporary if the result of the load
32834 // is already being stored. We could just directly store there.
32835 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32836 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32837 MachinePointerInfo MPI =
32838 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32839 SDValue StoreOps[] = { Chain, Result, StackPtr };
32840 Chain = DAG.getMemIntrinsicNode(
32841 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32842 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
32844 // Finally load the value back from the stack temporary and return it.
32845 // This load is not atomic and doesn't need to be.
32846 // This load will be further type legalized.
32847 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32848 Results.push_back(Result);
32849 Results.push_back(Result.getValue(1));
32850 return;
32853 // TODO: Use MOVLPS when SSE1 is available?
32854 // Delegate to generic TypeLegalization. Situations we can really handle
32855 // should have already been dealt with by AtomicExpandPass.cpp.
32856 break;
32858 case ISD::ATOMIC_SWAP:
32859 case ISD::ATOMIC_LOAD_ADD:
32860 case ISD::ATOMIC_LOAD_SUB:
32861 case ISD::ATOMIC_LOAD_AND:
32862 case ISD::ATOMIC_LOAD_OR:
32863 case ISD::ATOMIC_LOAD_XOR:
32864 case ISD::ATOMIC_LOAD_NAND:
32865 case ISD::ATOMIC_LOAD_MIN:
32866 case ISD::ATOMIC_LOAD_MAX:
32867 case ISD::ATOMIC_LOAD_UMIN:
32868 case ISD::ATOMIC_LOAD_UMAX:
32869 // Delegate to generic TypeLegalization. Situations we can really handle
32870 // should have already been dealt with by AtomicExpandPass.cpp.
32871 break;
32873 case ISD::BITCAST: {
32874 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32875 EVT DstVT = N->getValueType(0);
32876 EVT SrcVT = N->getOperand(0).getValueType();
32878 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
32879 // we can split using the k-register rather than going through memory.
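    // (Each v32i1 half stays in a k-register and is bitcast to an i32; the
    //  i64 result is then rebuilt with a BUILD_PAIR of the two halves.)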
32880 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32881 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32882 SDValue Lo, Hi;
32883 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32884 Lo = DAG.getBitcast(MVT::i32, Lo);
32885 Hi = DAG.getBitcast(MVT::i32, Hi);
32886 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32887 Results.push_back(Res);
32888 return;
32891 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32892 // FIXME: Use v4f32 for SSE1?
32893 assert(Subtarget.hasSSE2() && "Requires SSE2");
32894 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32895 "Unexpected type action!");
32896 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32897 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32898 N->getOperand(0));
32899 Res = DAG.getBitcast(WideVT, Res);
32900 Results.push_back(Res);
32901 return;
32904 return;
32906 case ISD::MGATHER: {
32907 EVT VT = N->getValueType(0);
32908 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32909 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32910 auto *Gather = cast<MaskedGatherSDNode>(N);
32911 SDValue Index = Gather->getIndex();
32912 if (Index.getValueType() != MVT::v2i64)
32913 return;
32914 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32915 "Unexpected type action!");
32916 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32917 SDValue Mask = Gather->getMask();
32918 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32919 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32920 Gather->getPassThru(),
32921 DAG.getUNDEF(VT));
32922 if (!Subtarget.hasVLX()) {
32923 // We need to widen the mask, but the instruction will only use 2
32924 // of its elements. So we can use undef.
32925 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32926 DAG.getUNDEF(MVT::v2i1));
32927 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32929 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32930 Gather->getBasePtr(), Index, Gather->getScale() };
32931 SDValue Res = DAG.getMemIntrinsicNode(
32932 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32933 Gather->getMemoryVT(), Gather->getMemOperand());
32934 Results.push_back(Res);
32935 Results.push_back(Res.getValue(1));
32936 return;
32938 return;
32940 case ISD::LOAD: {
32941 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32942 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32943 // cast since type legalization will try to use an i64 load.
32944 MVT VT = N->getSimpleValueType(0);
32945 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32946 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32947 "Unexpected type action!");
32948 if (!ISD::isNON_EXTLoad(N))
32949 return;
32950 auto *Ld = cast<LoadSDNode>(N);
32951 if (Subtarget.hasSSE2()) {
32952 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32953 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32954 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32955 Ld->getMemOperand()->getFlags());
32956 SDValue Chain = Res.getValue(1);
32957 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32958 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32959 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32960 Res = DAG.getBitcast(WideVT, Res);
32961 Results.push_back(Res);
32962 Results.push_back(Chain);
32963 return;
32965 assert(Subtarget.hasSSE1() && "Expected SSE");
32966 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32967 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32968 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32969 MVT::i64, Ld->getMemOperand());
32970 Results.push_back(Res);
32971 Results.push_back(Res.getValue(1));
32972 return;
32974 case ISD::ADDRSPACECAST: {
32975 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32976 Results.push_back(V);
32977 return;
32979 case ISD::BITREVERSE: {
32980 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32981 assert(Subtarget.hasXOP() && "Expected XOP");
32982 // We can use VPPERM by copying to a vector register and back. We'll need
32983 // to move the scalar in two i32 pieces.
32984 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32985 return;
32987 case ISD::EXTRACT_VECTOR_ELT: {
32988 // f16 = extract vXf16 %vec, i64 %idx
32989 assert(N->getSimpleValueType(0) == MVT::f16 &&
32990 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
32991 assert(Subtarget.hasFP16() && "Expected FP16");
32992 SDValue VecOp = N->getOperand(0);
32993 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
32994 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
32995 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
32996 N->getOperand(1));
32997 Split = DAG.getBitcast(MVT::f16, Split);
32998 Results.push_back(Split);
32999 return;
33004 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33005 switch ((X86ISD::NodeType)Opcode) {
33006 case X86ISD::FIRST_NUMBER: break;
33007 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33008 NODE_NAME_CASE(BSF)
33009 NODE_NAME_CASE(BSR)
33010 NODE_NAME_CASE(FSHL)
33011 NODE_NAME_CASE(FSHR)
33012 NODE_NAME_CASE(FAND)
33013 NODE_NAME_CASE(FANDN)
33014 NODE_NAME_CASE(FOR)
33015 NODE_NAME_CASE(FXOR)
33016 NODE_NAME_CASE(FILD)
33017 NODE_NAME_CASE(FIST)
33018 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33019 NODE_NAME_CASE(FLD)
33020 NODE_NAME_CASE(FST)
33021 NODE_NAME_CASE(CALL)
33022 NODE_NAME_CASE(CALL_RVMARKER)
33023 NODE_NAME_CASE(BT)
33024 NODE_NAME_CASE(CMP)
33025 NODE_NAME_CASE(FCMP)
33026 NODE_NAME_CASE(STRICT_FCMP)
33027 NODE_NAME_CASE(STRICT_FCMPS)
33028 NODE_NAME_CASE(COMI)
33029 NODE_NAME_CASE(UCOMI)
33030 NODE_NAME_CASE(CMPM)
33031 NODE_NAME_CASE(CMPMM)
33032 NODE_NAME_CASE(STRICT_CMPM)
33033 NODE_NAME_CASE(CMPMM_SAE)
33034 NODE_NAME_CASE(SETCC)
33035 NODE_NAME_CASE(SETCC_CARRY)
33036 NODE_NAME_CASE(FSETCC)
33037 NODE_NAME_CASE(FSETCCM)
33038 NODE_NAME_CASE(FSETCCM_SAE)
33039 NODE_NAME_CASE(CMOV)
33040 NODE_NAME_CASE(BRCOND)
33041 NODE_NAME_CASE(RET_GLUE)
33042 NODE_NAME_CASE(IRET)
33043 NODE_NAME_CASE(REP_STOS)
33044 NODE_NAME_CASE(REP_MOVS)
33045 NODE_NAME_CASE(GlobalBaseReg)
33046 NODE_NAME_CASE(Wrapper)
33047 NODE_NAME_CASE(WrapperRIP)
33048 NODE_NAME_CASE(MOVQ2DQ)
33049 NODE_NAME_CASE(MOVDQ2Q)
33050 NODE_NAME_CASE(MMX_MOVD2W)
33051 NODE_NAME_CASE(MMX_MOVW2D)
33052 NODE_NAME_CASE(PEXTRB)
33053 NODE_NAME_CASE(PEXTRW)
33054 NODE_NAME_CASE(INSERTPS)
33055 NODE_NAME_CASE(PINSRB)
33056 NODE_NAME_CASE(PINSRW)
33057 NODE_NAME_CASE(PSHUFB)
33058 NODE_NAME_CASE(ANDNP)
33059 NODE_NAME_CASE(BLENDI)
33060 NODE_NAME_CASE(BLENDV)
33061 NODE_NAME_CASE(HADD)
33062 NODE_NAME_CASE(HSUB)
33063 NODE_NAME_CASE(FHADD)
33064 NODE_NAME_CASE(FHSUB)
33065 NODE_NAME_CASE(CONFLICT)
33066 NODE_NAME_CASE(FMAX)
33067 NODE_NAME_CASE(FMAXS)
33068 NODE_NAME_CASE(FMAX_SAE)
33069 NODE_NAME_CASE(FMAXS_SAE)
33070 NODE_NAME_CASE(FMIN)
33071 NODE_NAME_CASE(FMINS)
33072 NODE_NAME_CASE(FMIN_SAE)
33073 NODE_NAME_CASE(FMINS_SAE)
33074 NODE_NAME_CASE(FMAXC)
33075 NODE_NAME_CASE(FMINC)
33076 NODE_NAME_CASE(FRSQRT)
33077 NODE_NAME_CASE(FRCP)
33078 NODE_NAME_CASE(EXTRQI)
33079 NODE_NAME_CASE(INSERTQI)
33080 NODE_NAME_CASE(TLSADDR)
33081 NODE_NAME_CASE(TLSBASEADDR)
33082 NODE_NAME_CASE(TLSCALL)
33083 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33084 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33085 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33086 NODE_NAME_CASE(EH_RETURN)
33087 NODE_NAME_CASE(TC_RETURN)
33088 NODE_NAME_CASE(FNSTCW16m)
33089 NODE_NAME_CASE(FLDCW16m)
33090 NODE_NAME_CASE(FNSTENVm)
33091 NODE_NAME_CASE(FLDENVm)
33092 NODE_NAME_CASE(LCMPXCHG_DAG)
33093 NODE_NAME_CASE(LCMPXCHG8_DAG)
33094 NODE_NAME_CASE(LCMPXCHG16_DAG)
33095 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33096 NODE_NAME_CASE(LADD)
33097 NODE_NAME_CASE(LSUB)
33098 NODE_NAME_CASE(LOR)
33099 NODE_NAME_CASE(LXOR)
33100 NODE_NAME_CASE(LAND)
33101 NODE_NAME_CASE(LBTS)
33102 NODE_NAME_CASE(LBTC)
33103 NODE_NAME_CASE(LBTR)
33104 NODE_NAME_CASE(LBTS_RM)
33105 NODE_NAME_CASE(LBTC_RM)
33106 NODE_NAME_CASE(LBTR_RM)
33107 NODE_NAME_CASE(AADD)
33108 NODE_NAME_CASE(AOR)
33109 NODE_NAME_CASE(AXOR)
33110 NODE_NAME_CASE(AAND)
33111 NODE_NAME_CASE(VZEXT_MOVL)
33112 NODE_NAME_CASE(VZEXT_LOAD)
33113 NODE_NAME_CASE(VEXTRACT_STORE)
33114 NODE_NAME_CASE(VTRUNC)
33115 NODE_NAME_CASE(VTRUNCS)
33116 NODE_NAME_CASE(VTRUNCUS)
33117 NODE_NAME_CASE(VMTRUNC)
33118 NODE_NAME_CASE(VMTRUNCS)
33119 NODE_NAME_CASE(VMTRUNCUS)
33120 NODE_NAME_CASE(VTRUNCSTORES)
33121 NODE_NAME_CASE(VTRUNCSTOREUS)
33122 NODE_NAME_CASE(VMTRUNCSTORES)
33123 NODE_NAME_CASE(VMTRUNCSTOREUS)
33124 NODE_NAME_CASE(VFPEXT)
33125 NODE_NAME_CASE(STRICT_VFPEXT)
33126 NODE_NAME_CASE(VFPEXT_SAE)
33127 NODE_NAME_CASE(VFPEXTS)
33128 NODE_NAME_CASE(VFPEXTS_SAE)
33129 NODE_NAME_CASE(VFPROUND)
33130 NODE_NAME_CASE(STRICT_VFPROUND)
33131 NODE_NAME_CASE(VMFPROUND)
33132 NODE_NAME_CASE(VFPROUND_RND)
33133 NODE_NAME_CASE(VFPROUNDS)
33134 NODE_NAME_CASE(VFPROUNDS_RND)
33135 NODE_NAME_CASE(VSHLDQ)
33136 NODE_NAME_CASE(VSRLDQ)
33137 NODE_NAME_CASE(VSHL)
33138 NODE_NAME_CASE(VSRL)
33139 NODE_NAME_CASE(VSRA)
33140 NODE_NAME_CASE(VSHLI)
33141 NODE_NAME_CASE(VSRLI)
33142 NODE_NAME_CASE(VSRAI)
33143 NODE_NAME_CASE(VSHLV)
33144 NODE_NAME_CASE(VSRLV)
33145 NODE_NAME_CASE(VSRAV)
33146 NODE_NAME_CASE(VROTLI)
33147 NODE_NAME_CASE(VROTRI)
33148 NODE_NAME_CASE(VPPERM)
33149 NODE_NAME_CASE(CMPP)
33150 NODE_NAME_CASE(STRICT_CMPP)
33151 NODE_NAME_CASE(PCMPEQ)
33152 NODE_NAME_CASE(PCMPGT)
33153 NODE_NAME_CASE(PHMINPOS)
33154 NODE_NAME_CASE(ADD)
33155 NODE_NAME_CASE(SUB)
33156 NODE_NAME_CASE(ADC)
33157 NODE_NAME_CASE(SBB)
33158 NODE_NAME_CASE(SMUL)
33159 NODE_NAME_CASE(UMUL)
33160 NODE_NAME_CASE(OR)
33161 NODE_NAME_CASE(XOR)
33162 NODE_NAME_CASE(AND)
33163 NODE_NAME_CASE(BEXTR)
33164 NODE_NAME_CASE(BEXTRI)
33165 NODE_NAME_CASE(BZHI)
33166 NODE_NAME_CASE(PDEP)
33167 NODE_NAME_CASE(PEXT)
33168 NODE_NAME_CASE(MUL_IMM)
33169 NODE_NAME_CASE(MOVMSK)
33170 NODE_NAME_CASE(PTEST)
33171 NODE_NAME_CASE(TESTP)
33172 NODE_NAME_CASE(KORTEST)
33173 NODE_NAME_CASE(KTEST)
33174 NODE_NAME_CASE(KADD)
33175 NODE_NAME_CASE(KSHIFTL)
33176 NODE_NAME_CASE(KSHIFTR)
33177 NODE_NAME_CASE(PACKSS)
33178 NODE_NAME_CASE(PACKUS)
33179 NODE_NAME_CASE(PALIGNR)
33180 NODE_NAME_CASE(VALIGN)
33181 NODE_NAME_CASE(VSHLD)
33182 NODE_NAME_CASE(VSHRD)
33183 NODE_NAME_CASE(VSHLDV)
33184 NODE_NAME_CASE(VSHRDV)
33185 NODE_NAME_CASE(PSHUFD)
33186 NODE_NAME_CASE(PSHUFHW)
33187 NODE_NAME_CASE(PSHUFLW)
33188 NODE_NAME_CASE(SHUFP)
33189 NODE_NAME_CASE(SHUF128)
33190 NODE_NAME_CASE(MOVLHPS)
33191 NODE_NAME_CASE(MOVHLPS)
33192 NODE_NAME_CASE(MOVDDUP)
33193 NODE_NAME_CASE(MOVSHDUP)
33194 NODE_NAME_CASE(MOVSLDUP)
33195 NODE_NAME_CASE(MOVSD)
33196 NODE_NAME_CASE(MOVSS)
33197 NODE_NAME_CASE(MOVSH)
33198 NODE_NAME_CASE(UNPCKL)
33199 NODE_NAME_CASE(UNPCKH)
33200 NODE_NAME_CASE(VBROADCAST)
33201 NODE_NAME_CASE(VBROADCAST_LOAD)
33202 NODE_NAME_CASE(VBROADCASTM)
33203 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33204 NODE_NAME_CASE(VPERMILPV)
33205 NODE_NAME_CASE(VPERMILPI)
33206 NODE_NAME_CASE(VPERM2X128)
33207 NODE_NAME_CASE(VPERMV)
33208 NODE_NAME_CASE(VPERMV3)
33209 NODE_NAME_CASE(VPERMI)
33210 NODE_NAME_CASE(VPTERNLOG)
33211 NODE_NAME_CASE(VFIXUPIMM)
33212 NODE_NAME_CASE(VFIXUPIMM_SAE)
33213 NODE_NAME_CASE(VFIXUPIMMS)
33214 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33215 NODE_NAME_CASE(VRANGE)
33216 NODE_NAME_CASE(VRANGE_SAE)
33217 NODE_NAME_CASE(VRANGES)
33218 NODE_NAME_CASE(VRANGES_SAE)
33219 NODE_NAME_CASE(PMULUDQ)
33220 NODE_NAME_CASE(PMULDQ)
33221 NODE_NAME_CASE(PSADBW)
33222 NODE_NAME_CASE(DBPSADBW)
33223 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33224 NODE_NAME_CASE(VAARG_64)
33225 NODE_NAME_CASE(VAARG_X32)
33226 NODE_NAME_CASE(DYN_ALLOCA)
33227 NODE_NAME_CASE(MFENCE)
33228 NODE_NAME_CASE(SEG_ALLOCA)
33229 NODE_NAME_CASE(PROBED_ALLOCA)
33230 NODE_NAME_CASE(RDRAND)
33231 NODE_NAME_CASE(RDSEED)
33232 NODE_NAME_CASE(RDPKRU)
33233 NODE_NAME_CASE(WRPKRU)
33234 NODE_NAME_CASE(VPMADDUBSW)
33235 NODE_NAME_CASE(VPMADDWD)
33236 NODE_NAME_CASE(VPSHA)
33237 NODE_NAME_CASE(VPSHL)
33238 NODE_NAME_CASE(VPCOM)
33239 NODE_NAME_CASE(VPCOMU)
33240 NODE_NAME_CASE(VPERMIL2)
33241 NODE_NAME_CASE(FMSUB)
33242 NODE_NAME_CASE(STRICT_FMSUB)
33243 NODE_NAME_CASE(FNMADD)
33244 NODE_NAME_CASE(STRICT_FNMADD)
33245 NODE_NAME_CASE(FNMSUB)
33246 NODE_NAME_CASE(STRICT_FNMSUB)
33247 NODE_NAME_CASE(FMADDSUB)
33248 NODE_NAME_CASE(FMSUBADD)
33249 NODE_NAME_CASE(FMADD_RND)
33250 NODE_NAME_CASE(FNMADD_RND)
33251 NODE_NAME_CASE(FMSUB_RND)
33252 NODE_NAME_CASE(FNMSUB_RND)
33253 NODE_NAME_CASE(FMADDSUB_RND)
33254 NODE_NAME_CASE(FMSUBADD_RND)
33255 NODE_NAME_CASE(VFMADDC)
33256 NODE_NAME_CASE(VFMADDC_RND)
33257 NODE_NAME_CASE(VFCMADDC)
33258 NODE_NAME_CASE(VFCMADDC_RND)
33259 NODE_NAME_CASE(VFMULC)
33260 NODE_NAME_CASE(VFMULC_RND)
33261 NODE_NAME_CASE(VFCMULC)
33262 NODE_NAME_CASE(VFCMULC_RND)
33263 NODE_NAME_CASE(VFMULCSH)
33264 NODE_NAME_CASE(VFMULCSH_RND)
33265 NODE_NAME_CASE(VFCMULCSH)
33266 NODE_NAME_CASE(VFCMULCSH_RND)
33267 NODE_NAME_CASE(VFMADDCSH)
33268 NODE_NAME_CASE(VFMADDCSH_RND)
33269 NODE_NAME_CASE(VFCMADDCSH)
33270 NODE_NAME_CASE(VFCMADDCSH_RND)
33271 NODE_NAME_CASE(VPMADD52H)
33272 NODE_NAME_CASE(VPMADD52L)
33273 NODE_NAME_CASE(VRNDSCALE)
33274 NODE_NAME_CASE(STRICT_VRNDSCALE)
33275 NODE_NAME_CASE(VRNDSCALE_SAE)
33276 NODE_NAME_CASE(VRNDSCALES)
33277 NODE_NAME_CASE(VRNDSCALES_SAE)
33278 NODE_NAME_CASE(VREDUCE)
33279 NODE_NAME_CASE(VREDUCE_SAE)
33280 NODE_NAME_CASE(VREDUCES)
33281 NODE_NAME_CASE(VREDUCES_SAE)
33282 NODE_NAME_CASE(VGETMANT)
33283 NODE_NAME_CASE(VGETMANT_SAE)
33284 NODE_NAME_CASE(VGETMANTS)
33285 NODE_NAME_CASE(VGETMANTS_SAE)
33286 NODE_NAME_CASE(PCMPESTR)
33287 NODE_NAME_CASE(PCMPISTR)
33288 NODE_NAME_CASE(XTEST)
33289 NODE_NAME_CASE(COMPRESS)
33290 NODE_NAME_CASE(EXPAND)
33291 NODE_NAME_CASE(SELECTS)
33292 NODE_NAME_CASE(ADDSUB)
33293 NODE_NAME_CASE(RCP14)
33294 NODE_NAME_CASE(RCP14S)
33295 NODE_NAME_CASE(RCP28)
33296 NODE_NAME_CASE(RCP28_SAE)
33297 NODE_NAME_CASE(RCP28S)
33298 NODE_NAME_CASE(RCP28S_SAE)
33299 NODE_NAME_CASE(EXP2)
33300 NODE_NAME_CASE(EXP2_SAE)
33301 NODE_NAME_CASE(RSQRT14)
33302 NODE_NAME_CASE(RSQRT14S)
33303 NODE_NAME_CASE(RSQRT28)
33304 NODE_NAME_CASE(RSQRT28_SAE)
33305 NODE_NAME_CASE(RSQRT28S)
33306 NODE_NAME_CASE(RSQRT28S_SAE)
33307 NODE_NAME_CASE(FADD_RND)
33308 NODE_NAME_CASE(FADDS)
33309 NODE_NAME_CASE(FADDS_RND)
33310 NODE_NAME_CASE(FSUB_RND)
33311 NODE_NAME_CASE(FSUBS)
33312 NODE_NAME_CASE(FSUBS_RND)
33313 NODE_NAME_CASE(FMUL_RND)
33314 NODE_NAME_CASE(FMULS)
33315 NODE_NAME_CASE(FMULS_RND)
33316 NODE_NAME_CASE(FDIV_RND)
33317 NODE_NAME_CASE(FDIVS)
33318 NODE_NAME_CASE(FDIVS_RND)
33319 NODE_NAME_CASE(FSQRT_RND)
33320 NODE_NAME_CASE(FSQRTS)
33321 NODE_NAME_CASE(FSQRTS_RND)
33322 NODE_NAME_CASE(FGETEXP)
33323 NODE_NAME_CASE(FGETEXP_SAE)
33324 NODE_NAME_CASE(FGETEXPS)
33325 NODE_NAME_CASE(FGETEXPS_SAE)
33326 NODE_NAME_CASE(SCALEF)
33327 NODE_NAME_CASE(SCALEF_RND)
33328 NODE_NAME_CASE(SCALEFS)
33329 NODE_NAME_CASE(SCALEFS_RND)
33330 NODE_NAME_CASE(MULHRS)
33331 NODE_NAME_CASE(SINT_TO_FP_RND)
33332 NODE_NAME_CASE(UINT_TO_FP_RND)
33333 NODE_NAME_CASE(CVTTP2SI)
33334 NODE_NAME_CASE(CVTTP2UI)
33335 NODE_NAME_CASE(STRICT_CVTTP2SI)
33336 NODE_NAME_CASE(STRICT_CVTTP2UI)
33337 NODE_NAME_CASE(MCVTTP2SI)
33338 NODE_NAME_CASE(MCVTTP2UI)
33339 NODE_NAME_CASE(CVTTP2SI_SAE)
33340 NODE_NAME_CASE(CVTTP2UI_SAE)
33341 NODE_NAME_CASE(CVTTS2SI)
33342 NODE_NAME_CASE(CVTTS2UI)
33343 NODE_NAME_CASE(CVTTS2SI_SAE)
33344 NODE_NAME_CASE(CVTTS2UI_SAE)
33345 NODE_NAME_CASE(CVTSI2P)
33346 NODE_NAME_CASE(CVTUI2P)
33347 NODE_NAME_CASE(STRICT_CVTSI2P)
33348 NODE_NAME_CASE(STRICT_CVTUI2P)
33349 NODE_NAME_CASE(MCVTSI2P)
33350 NODE_NAME_CASE(MCVTUI2P)
33351 NODE_NAME_CASE(VFPCLASS)
33352 NODE_NAME_CASE(VFPCLASSS)
33353 NODE_NAME_CASE(MULTISHIFT)
33354 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33355 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33356 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33357 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33358 NODE_NAME_CASE(CVTPS2PH)
33359 NODE_NAME_CASE(STRICT_CVTPS2PH)
33360 NODE_NAME_CASE(CVTPS2PH_SAE)
33361 NODE_NAME_CASE(MCVTPS2PH)
33362 NODE_NAME_CASE(MCVTPS2PH_SAE)
33363 NODE_NAME_CASE(CVTPH2PS)
33364 NODE_NAME_CASE(STRICT_CVTPH2PS)
33365 NODE_NAME_CASE(CVTPH2PS_SAE)
33366 NODE_NAME_CASE(CVTP2SI)
33367 NODE_NAME_CASE(CVTP2UI)
33368 NODE_NAME_CASE(MCVTP2SI)
33369 NODE_NAME_CASE(MCVTP2UI)
33370 NODE_NAME_CASE(CVTP2SI_RND)
33371 NODE_NAME_CASE(CVTP2UI_RND)
33372 NODE_NAME_CASE(CVTS2SI)
33373 NODE_NAME_CASE(CVTS2UI)
33374 NODE_NAME_CASE(CVTS2SI_RND)
33375 NODE_NAME_CASE(CVTS2UI_RND)
33376 NODE_NAME_CASE(CVTNE2PS2BF16)
33377 NODE_NAME_CASE(CVTNEPS2BF16)
33378 NODE_NAME_CASE(MCVTNEPS2BF16)
33379 NODE_NAME_CASE(DPBF16PS)
33380 NODE_NAME_CASE(LWPINS)
33381 NODE_NAME_CASE(MGATHER)
33382 NODE_NAME_CASE(MSCATTER)
33383 NODE_NAME_CASE(VPDPBUSD)
33384 NODE_NAME_CASE(VPDPBUSDS)
33385 NODE_NAME_CASE(VPDPWSSD)
33386 NODE_NAME_CASE(VPDPWSSDS)
33387 NODE_NAME_CASE(VPSHUFBITQMB)
33388 NODE_NAME_CASE(GF2P8MULB)
33389 NODE_NAME_CASE(GF2P8AFFINEQB)
33390 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33391 NODE_NAME_CASE(NT_CALL)
33392 NODE_NAME_CASE(NT_BRIND)
33393 NODE_NAME_CASE(UMWAIT)
33394 NODE_NAME_CASE(TPAUSE)
33395 NODE_NAME_CASE(ENQCMD)
33396 NODE_NAME_CASE(ENQCMDS)
33397 NODE_NAME_CASE(VP2INTERSECT)
33398 NODE_NAME_CASE(VPDPBSUD)
33399 NODE_NAME_CASE(VPDPBSUDS)
33400 NODE_NAME_CASE(VPDPBUUD)
33401 NODE_NAME_CASE(VPDPBUUDS)
33402 NODE_NAME_CASE(VPDPBSSD)
33403 NODE_NAME_CASE(VPDPBSSDS)
33404 NODE_NAME_CASE(AESENC128KL)
33405 NODE_NAME_CASE(AESDEC128KL)
33406 NODE_NAME_CASE(AESENC256KL)
33407 NODE_NAME_CASE(AESDEC256KL)
33408 NODE_NAME_CASE(AESENCWIDE128KL)
33409 NODE_NAME_CASE(AESDECWIDE128KL)
33410 NODE_NAME_CASE(AESENCWIDE256KL)
33411 NODE_NAME_CASE(AESDECWIDE256KL)
33412 NODE_NAME_CASE(CMPCCXADD)
33413 NODE_NAME_CASE(TESTUI)
33414 NODE_NAME_CASE(FP80_ADD)
33415 NODE_NAME_CASE(STRICT_FP80_ADD)
33417 return nullptr;
33418 #undef NODE_NAME_CASE
33421 /// Return true if the addressing mode represented by AM is legal for this
33422 /// target, for a load/store of the specified type.
33423 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33424 const AddrMode &AM, Type *Ty,
33425 unsigned AS,
33426 Instruction *I) const {
33427 // X86 supports extremely general addressing modes.
33428 CodeModel::Model M = getTargetMachine().getCodeModel();
33430 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33431 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33432 return false;
33434 if (AM.BaseGV) {
33435 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33437 // If a reference to this global requires an extra load, we can't fold it.
33438 if (isGlobalStubReference(GVFlags))
33439 return false;
33441 // If BaseGV requires a register for the PIC base, we cannot also have a
33442 // BaseReg specified.
33443 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33444 return false;
33446 // If lower 4G is not available, then we must use rip-relative addressing.
33447 if ((M != CodeModel::Small || isPositionIndependent()) &&
33448 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33449 return false;
33452 switch (AM.Scale) {
33453 case 0:
33454 case 1:
33455 case 2:
33456 case 4:
33457 case 8:
33458 // These scales always work.
33459 break;
33460 case 3:
33461 case 5:
33462 case 9:
33463 // These scales are formed with basereg+scalereg. Only accept if there is
33464 // no basereg yet.
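  // (For example, a scale of 3 is materialized as index + index*2, so the
  //  index register also occupies the base-register slot of the address.)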
33465 if (AM.HasBaseReg)
33466 return false;
33467 break;
33468 default: // Other stuff never works.
33469 return false;
33472 return true;
33475 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33476 unsigned Bits = Ty->getScalarSizeInBits();
33478 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33479 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33480 if (Subtarget.hasXOP() &&
33481 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33482 return false;
33484 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33485 // shifts just as cheap as scalar ones.
33486 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33487 return false;
33489 // AVX512BW has shifts such as vpsllvw.
33490 if (Subtarget.hasBWI() && Bits == 16)
33491 return false;
33493 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33494 // fully general vector.
33495 return true;
33498 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33499 switch (Opcode) {
33500 // These are non-commutative binops.
33501 // TODO: Add more X86ISD opcodes once we have test coverage.
33502 case X86ISD::ANDNP:
33503 case X86ISD::PCMPGT:
33504 case X86ISD::FMAX:
33505 case X86ISD::FMIN:
33506 case X86ISD::FANDN:
33507 case X86ISD::VPSHA:
33508 case X86ISD::VPSHL:
33509 case X86ISD::VSHLV:
33510 case X86ISD::VSRLV:
33511 case X86ISD::VSRAV:
33512 return true;
33515 return TargetLoweringBase::isBinOp(Opcode);
33518 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33519 switch (Opcode) {
33520 // TODO: Add more X86ISD opcodes once we have test coverage.
33521 case X86ISD::PCMPEQ:
33522 case X86ISD::PMULDQ:
33523 case X86ISD::PMULUDQ:
33524 case X86ISD::FMAXC:
33525 case X86ISD::FMINC:
33526 case X86ISD::FAND:
33527 case X86ISD::FOR:
33528 case X86ISD::FXOR:
33529 return true;
33532 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33535 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33536 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33537 return false;
33538 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33539 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33540 return NumBits1 > NumBits2;
33543 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33544 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33545 return false;
33547 if (!isTypeLegal(EVT::getEVT(Ty1)))
33548 return false;
33550 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33552 // Assuming the caller doesn't have a zeroext or signext return parameter,
33553 // truncation all the way down to i1 is valid.
33554 return true;
33557 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33558 return isInt<32>(Imm);
33561 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33562 // Can also use sub to handle negated immediates.
33563 return isInt<32>(Imm);
33566 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33567 return isInt<32>(Imm);
33570 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33571 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33572 return false;
33573 unsigned NumBits1 = VT1.getSizeInBits();
33574 unsigned NumBits2 = VT2.getSizeInBits();
33575 return NumBits1 > NumBits2;
33578 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33579 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33580 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33583 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33584 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33585 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33588 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33589 EVT VT1 = Val.getValueType();
33590 if (isZExtFree(VT1, VT2))
33591 return true;
33593 if (Val.getOpcode() != ISD::LOAD)
33594 return false;
33596 if (!VT1.isSimple() || !VT1.isInteger() ||
33597 !VT2.isSimple() || !VT2.isInteger())
33598 return false;
33600 switch (VT1.getSimpleVT().SimpleTy) {
33601 default: break;
33602 case MVT::i8:
33603 case MVT::i16:
33604 case MVT::i32:
33605 // X86 has 8, 16, and 32-bit zero-extending loads.
33606 return true;
33609 return false;
33612 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33613 SmallVectorImpl<Use *> &Ops) const {
33614 using namespace llvm::PatternMatch;
33616 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33617 if (!VTy)
33618 return false;
33620 if (I->getOpcode() == Instruction::Mul &&
33621 VTy->getElementType()->isIntegerTy(64)) {
33622 for (auto &Op : I->operands()) {
33623 // Make sure we are not already sinking this operand
33624 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33625 continue;
33627 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33628 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
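      // Illustrative shape of the operands being matched (vector splat
      // constants written as scalars):
      //   sext_inreg: ashr (shl X, 32), 32
      //   zext_inreg: and X, 0xffffffff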
33629 if (Subtarget.hasSSE41() &&
33630 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33631 m_SpecificInt(32)))) {
33632 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33633 Ops.push_back(&Op);
33634 } else if (Subtarget.hasSSE2() &&
33635 match(Op.get(),
33636 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33637 Ops.push_back(&Op);
33641 return !Ops.empty();
33644 // A uniform shift amount in a vector shift or funnel shift may be much
33645 // cheaper than a generic variable vector shift, so make that pattern visible
33646 // to SDAG by sinking the shuffle instruction next to the shift.
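  // For example (illustrative IR), given
  //   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
  //   %r   = shl <4 x i32> %x, %amt
  // sinking the shuffle next to the shift lets ISel select a single
  // shift-by-scalar (e.g. PSLLD) instead of a general variable vector shift.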
33647 int ShiftAmountOpNum = -1;
33648 if (I->isShift())
33649 ShiftAmountOpNum = 1;
33650 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33651 if (II->getIntrinsicID() == Intrinsic::fshl ||
33652 II->getIntrinsicID() == Intrinsic::fshr)
33653 ShiftAmountOpNum = 2;
33656 if (ShiftAmountOpNum == -1)
33657 return false;
33659 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33660 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33661 isVectorShiftByScalarCheap(I->getType())) {
33662 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33663 return true;
33666 return false;
33669 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33670 if (!Subtarget.is64Bit())
33671 return false;
33672 return TargetLowering::shouldConvertPhiType(From, To);
33675 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33676 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33677 return false;
33679 EVT SrcVT = ExtVal.getOperand(0).getValueType();
33681 // There is no extending load for vXi1.
33682 if (SrcVT.getScalarType() == MVT::i1)
33683 return false;
33685 return true;
33688 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33689 EVT VT) const {
33690 if (!Subtarget.hasAnyFMA())
33691 return false;
33693 VT = VT.getScalarType();
33695 if (!VT.isSimple())
33696 return false;
33698 switch (VT.getSimpleVT().SimpleTy) {
33699 case MVT::f16:
33700 return Subtarget.hasFP16();
33701 case MVT::f32:
33702 case MVT::f64:
33703 return true;
33704 default:
33705 break;
33708 return false;
33711 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
33712 // i16 instructions are longer (0x66 prefix) and potentially slower.
33713 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
33716 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33717 EVT VT) const {
33718 // TODO: This is too general. There are cases where pre-AVX512 codegen would
33719 // benefit. The transform may also be profitable for scalar code.
33720 if (!Subtarget.hasAVX512())
33721 return false;
33722 if (!Subtarget.hasVLX() && !VT.is512BitVector())
33723 return false;
33724 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
33725 return false;
33727 return true;
33730 /// Targets can use this to indicate that they only support *some*
33731 /// VECTOR_SHUFFLE operations, those with specific masks.
33732 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33733 /// are assumed to be legal.
33734 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33735 if (!VT.isSimple())
33736 return false;
33738 // Not for i1 vectors
33739 if (VT.getSimpleVT().getScalarType() == MVT::i1)
33740 return false;
33742 // Very little shuffling can be done for 64-bit vectors right now.
33743 if (VT.getSimpleVT().getSizeInBits() == 64)
33744 return false;
33746 // We only care that the types being shuffled are legal. The lowering can
33747 // handle any possible shuffle mask that results.
33748 return isTypeLegal(VT.getSimpleVT());
33751 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33752 EVT VT) const {
33753 // Don't convert an 'and' into a shuffle that we don't directly support.
33754 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33755 if (!Subtarget.hasAVX2())
33756 if (VT == MVT::v32i8 || VT == MVT::v16i16)
33757 return false;
33759 // Just delegate to the generic legality, clear masks aren't special.
33760 return isShuffleMaskLegal(Mask, VT);
33763 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33764 // If the subtarget is using thunks, we must not generate jump tables.
33765 if (Subtarget.useIndirectThunkBranches())
33766 return false;
33768 // Otherwise, fallback on the generic logic.
33769 return TargetLowering::areJTsAllowed(Fn);
33772 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
33773 EVT ConditionVT) const {
33774 // Avoid 8- and 16-bit types because they increase the chance of unnecessary
33775 // zero-extensions.
33776 if (ConditionVT.getSizeInBits() < 32)
33777 return MVT::i32;
33778 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
33779 ConditionVT);
33782 //===----------------------------------------------------------------------===//
33783 // X86 Scheduler Hooks
33784 //===----------------------------------------------------------------------===//
33786 // Returns true if EFLAGS is consumed after this iterator in the rest of the
33787 // basic block or any successors of the basic block.
33788 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33789 MachineBasicBlock *BB) {
33790 // Scan forward through BB for a use/def of EFLAGS.
33791 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33792 if (mi.readsRegister(X86::EFLAGS))
33793 return true;
33794 // If we found a def, we can stop searching.
33795 if (mi.definesRegister(X86::EFLAGS))
33796 return false;
33799 // If we hit the end of the block, check whether EFLAGS is live into a
33800 // successor.
33801 for (MachineBasicBlock *Succ : BB->successors())
33802 if (Succ->isLiveIn(X86::EFLAGS))
33803 return true;
33805 return false;
33808 /// Utility function to emit xbegin specifying the start of an RTM region.
33809 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33810 const TargetInstrInfo *TII) {
33811 const MIMetadata MIMD(MI);
33813 const BasicBlock *BB = MBB->getBasicBlock();
33814 MachineFunction::iterator I = ++MBB->getIterator();
33816 // For the v = xbegin(), we generate
33818 // thisMBB:
33819 // xbegin sinkMBB
33821 // mainMBB:
33822 // s0 = -1
33824 // fallBB:
33825 // eax = # XABORT_DEF
33826 // s1 = eax
33828 // sinkMBB:
33829 // v = phi(s0/mainBB, s1/fallBB)
33831 MachineBasicBlock *thisMBB = MBB;
33832 MachineFunction *MF = MBB->getParent();
33833 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33834 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33835 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33836 MF->insert(I, mainMBB);
33837 MF->insert(I, fallMBB);
33838 MF->insert(I, sinkMBB);
33840 if (isEFLAGSLiveAfter(MI, MBB)) {
33841 mainMBB->addLiveIn(X86::EFLAGS);
33842 fallMBB->addLiveIn(X86::EFLAGS);
33843 sinkMBB->addLiveIn(X86::EFLAGS);
33846 // Transfer the remainder of BB and its successor edges to sinkMBB.
33847 sinkMBB->splice(sinkMBB->begin(), MBB,
33848 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33849 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33851 MachineRegisterInfo &MRI = MF->getRegInfo();
33852 Register DstReg = MI.getOperand(0).getReg();
33853 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33854 Register mainDstReg = MRI.createVirtualRegister(RC);
33855 Register fallDstReg = MRI.createVirtualRegister(RC);
33857 // thisMBB:
33858 // xbegin fallMBB
33859 // # fallthrough to mainMBB
33860 // # abort goes to fallMBB
33861 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33862 thisMBB->addSuccessor(mainMBB);
33863 thisMBB->addSuccessor(fallMBB);
33865 // mainMBB:
33866 // mainDstReg := -1
33867 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33868 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33869 mainMBB->addSuccessor(sinkMBB);
33871 // fallMBB:
33872 // ; pseudo instruction to model hardware's definition from XABORT
33873 // EAX := XABORT_DEF
33874 // fallDstReg := EAX
33875 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
33876 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
33877 .addReg(X86::EAX);
33878 fallMBB->addSuccessor(sinkMBB);
33880 // sinkMBB:
33881 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
33882 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
33883 .addReg(mainDstReg).addMBB(mainMBB)
33884 .addReg(fallDstReg).addMBB(fallMBB);
33886 MI.eraseFromParent();
33887 return sinkMBB;
33890 MachineBasicBlock *
33891 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33892 MachineBasicBlock *MBB) const {
33893 // Emit va_arg instruction on X86-64.
33895 // Operands to this pseudo-instruction:
33896 // 0 ) Output : destination address (reg)
33897 // 1-5) Input : va_list address (addr, i64mem)
33898 // 6 ) ArgSize : Size (in bytes) of vararg type
33899 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33900 // 8 ) Align : Alignment of type
33901 // 9 ) EFLAGS (implicit-def)
33903 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33904 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33906 Register DestReg = MI.getOperand(0).getReg();
33907 MachineOperand &Base = MI.getOperand(1);
33908 MachineOperand &Scale = MI.getOperand(2);
33909 MachineOperand &Index = MI.getOperand(3);
33910 MachineOperand &Disp = MI.getOperand(4);
33911 MachineOperand &Segment = MI.getOperand(5);
33912 unsigned ArgSize = MI.getOperand(6).getImm();
33913 unsigned ArgMode = MI.getOperand(7).getImm();
33914 Align Alignment = Align(MI.getOperand(8).getImm());
33916 MachineFunction *MF = MBB->getParent();
33918 // Memory Reference
33919 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33921 MachineMemOperand *OldMMO = MI.memoperands().front();
33923 // Clone the MMO into two separate MMOs for loading and storing
33924 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33925 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33926 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33927 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33929 // Machine Information
33930 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33931 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33932 const TargetRegisterClass *AddrRegClass =
33933 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33934 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33935 const MIMetadata MIMD(MI);
33937 // struct va_list {
33938 // i32 gp_offset
33939 // i32 fp_offset
33940 // i64 overflow_area (address)
33941 // i64 reg_save_area (address)
33942 // }
33943 // sizeof(va_list) = 24
33944 // alignment(va_list) = 8
33946 unsigned TotalNumIntRegs = 6;
33947 unsigned TotalNumXMMRegs = 8;
33948 bool UseGPOffset = (ArgMode == 1);
33949 bool UseFPOffset = (ArgMode == 2);
33950 unsigned MaxOffset = TotalNumIntRegs * 8 +
33951 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
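  // (Under the SysV x86-64 calling convention this is 6*8 = 48 for gp_offset
  //  and 48 + 8*16 = 176 for fp_offset.)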
33953 /* Align ArgSize to a multiple of 8 */
33954 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
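  // (e.g. an ArgSize of 12 is rounded up to ArgSizeA8 = 16)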
33955 bool NeedsAlign = (Alignment > 8);
33957 MachineBasicBlock *thisMBB = MBB;
33958 MachineBasicBlock *overflowMBB;
33959 MachineBasicBlock *offsetMBB;
33960 MachineBasicBlock *endMBB;
33962 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33963 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33964 unsigned OffsetReg = 0;
33966 if (!UseGPOffset && !UseFPOffset) {
33967 // If we only pull from the overflow region, we don't create a branch.
33968 // We don't need to alter control flow.
33969 OffsetDestReg = 0; // unused
33970 OverflowDestReg = DestReg;
33972 offsetMBB = nullptr;
33973 overflowMBB = thisMBB;
33974 endMBB = thisMBB;
33975 } else {
33976 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33977 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33978 // If not, pull from overflow_area. (branch to overflowMBB)
33980 // thisMBB
33981 // | .
33982 // | .
33983 // offsetMBB overflowMBB
33984 // | .
33985 // | .
33986 // endMBB
33988 // Registers for the PHI in endMBB
33989 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33990 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33992 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33993 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33994 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33995 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33997 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33999 // Insert the new basic blocks
34000 MF->insert(MBBIter, offsetMBB);
34001 MF->insert(MBBIter, overflowMBB);
34002 MF->insert(MBBIter, endMBB);
34004 // Transfer the remainder of MBB and its successor edges to endMBB.
34005 endMBB->splice(endMBB->begin(), thisMBB,
34006 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34007 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34009 // Make offsetMBB and overflowMBB successors of thisMBB
34010 thisMBB->addSuccessor(offsetMBB);
34011 thisMBB->addSuccessor(overflowMBB);
34013 // endMBB is a successor of both offsetMBB and overflowMBB
34014 offsetMBB->addSuccessor(endMBB);
34015 overflowMBB->addSuccessor(endMBB);
34017 // Load the offset value into a register
34018 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34019 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34020 .add(Base)
34021 .add(Scale)
34022 .add(Index)
34023 .addDisp(Disp, UseFPOffset ? 4 : 0)
34024 .add(Segment)
34025 .setMemRefs(LoadOnlyMMO);
34027 // Check if there is enough room left to pull this argument.
34028 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34029 .addReg(OffsetReg)
34030 .addImm(MaxOffset + 8 - ArgSizeA8);
34032 // Branch to "overflowMBB" if offset >= max
34033 // Fall through to "offsetMBB" otherwise
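    // (Since OffsetReg, ArgSizeA8 and MaxOffset are all multiples of 8, this
    //  takes the overflow path exactly when OffsetReg + ArgSizeA8 would
    //  exceed MaxOffset, i.e. when the argument no longer fits in the
    //  register save area.)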
34034 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34035 .addMBB(overflowMBB).addImm(X86::COND_AE);
34038 // In offsetMBB, emit code to use the reg_save_area.
34039 if (offsetMBB) {
34040 assert(OffsetReg != 0);
34042 // Read the reg_save_area address.
34043 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34044 BuildMI(
34045 offsetMBB, MIMD,
34046 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34047 RegSaveReg)
34048 .add(Base)
34049 .add(Scale)
34050 .add(Index)
34051 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34052 .add(Segment)
34053 .setMemRefs(LoadOnlyMMO);
34055 if (Subtarget.isTarget64BitLP64()) {
34056 // Zero-extend the offset
34057 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34058 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34059 .addImm(0)
34060 .addReg(OffsetReg)
34061 .addImm(X86::sub_32bit);
34063 // Add the offset to the reg_save_area to get the final address.
34064 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34065 .addReg(OffsetReg64)
34066 .addReg(RegSaveReg);
34067 } else {
34068 // Add the offset to the reg_save_area to get the final address.
34069 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34070 .addReg(OffsetReg)
34071 .addReg(RegSaveReg);
34074 // Compute the offset for the next argument
34075 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34076 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34077 .addReg(OffsetReg)
34078 .addImm(UseFPOffset ? 16 : 8);
34080 // Store it back into the va_list.
34081 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34082 .add(Base)
34083 .add(Scale)
34084 .add(Index)
34085 .addDisp(Disp, UseFPOffset ? 4 : 0)
34086 .add(Segment)
34087 .addReg(NextOffsetReg)
34088 .setMemRefs(StoreOnlyMMO);
34090 // Jump to endMBB
34091 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34092 .addMBB(endMBB);
34096 // Emit code to use overflow area
34099 // Load the overflow_area address into a register.
34100 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34101 BuildMI(overflowMBB, MIMD,
34102 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34103 OverflowAddrReg)
34104 .add(Base)
34105 .add(Scale)
34106 .add(Index)
34107 .addDisp(Disp, 8)
34108 .add(Segment)
34109 .setMemRefs(LoadOnlyMMO);
34111 // If we need to align it, do so. Otherwise, just copy the address
34112 // to OverflowDestReg.
34113 if (NeedsAlign) {
34114 // Align the overflow address
34115 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34117 // aligned_addr = (addr + (align-1)) & ~(align-1)
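    // (e.g. addr = 0x1007, align = 16: 0x1007 + 15 = 0x1016, masked to 0x1010)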
34118 BuildMI(
34119 overflowMBB, MIMD,
34120 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34121 TmpReg)
34122 .addReg(OverflowAddrReg)
34123 .addImm(Alignment.value() - 1);
34125 BuildMI(
34126 overflowMBB, MIMD,
34127 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34128 OverflowDestReg)
34129 .addReg(TmpReg)
34130 .addImm(~(uint64_t)(Alignment.value() - 1));
34131 } else {
34132 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34133 .addReg(OverflowAddrReg);
34136 // Compute the next overflow address after this argument.
34137 // (the overflow address should be kept 8-byte aligned)
34138 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34139 BuildMI(
34140 overflowMBB, MIMD,
34141 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34142 NextAddrReg)
34143 .addReg(OverflowDestReg)
34144 .addImm(ArgSizeA8);
34146 // Store the new overflow address.
34147 BuildMI(overflowMBB, MIMD,
34148 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34149 .add(Base)
34150 .add(Scale)
34151 .add(Index)
34152 .addDisp(Disp, 8)
34153 .add(Segment)
34154 .addReg(NextAddrReg)
34155 .setMemRefs(StoreOnlyMMO);
34157 // If we branched, emit the PHI to the front of endMBB.
34158 if (offsetMBB) {
34159 BuildMI(*endMBB, endMBB->begin(), MIMD,
34160 TII->get(X86::PHI), DestReg)
34161 .addReg(OffsetDestReg).addMBB(offsetMBB)
34162 .addReg(OverflowDestReg).addMBB(overflowMBB);
34165 // Erase the pseudo instruction
34166 MI.eraseFromParent();
34168 return endMBB;
34171 // The EFLAGS operand of SelectItr might be missing a kill marker
34172 // because there were multiple uses of EFLAGS, and ISel didn't know
34173 // which to mark. Figure out whether SelectItr should have had a
34174 // kill marker, and set it if it should. Returns the correct kill
34175 // marker value.
34176 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34177 MachineBasicBlock* BB,
34178 const TargetRegisterInfo* TRI) {
34179 if (isEFLAGSLiveAfter(SelectItr, BB))
34180 return false;
34182 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34183 // out. SelectMI should have a kill flag on EFLAGS.
34184 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34185 return true;
34188 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34189 // together with other CMOV pseudo-opcodes into a single basic-block with
34190 // a conditional jump around it.
34191 static bool isCMOVPseudo(MachineInstr &MI) {
34192 switch (MI.getOpcode()) {
34193 case X86::CMOV_FR16:
34194 case X86::CMOV_FR16X:
34195 case X86::CMOV_FR32:
34196 case X86::CMOV_FR32X:
34197 case X86::CMOV_FR64:
34198 case X86::CMOV_FR64X:
34199 case X86::CMOV_GR8:
34200 case X86::CMOV_GR16:
34201 case X86::CMOV_GR32:
34202 case X86::CMOV_RFP32:
34203 case X86::CMOV_RFP64:
34204 case X86::CMOV_RFP80:
34205 case X86::CMOV_VR64:
34206 case X86::CMOV_VR128:
34207 case X86::CMOV_VR128X:
34208 case X86::CMOV_VR256:
34209 case X86::CMOV_VR256X:
34210 case X86::CMOV_VR512:
34211 case X86::CMOV_VK1:
34212 case X86::CMOV_VK2:
34213 case X86::CMOV_VK4:
34214 case X86::CMOV_VK8:
34215 case X86::CMOV_VK16:
34216 case X86::CMOV_VK32:
34217 case X86::CMOV_VK64:
34218 return true;
34220 default:
34221 return false;
34225 // Helper function, which inserts PHI functions into SinkMBB:
34226 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34227 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
34228 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34229 // the last PHI function inserted.
34230 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34231 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34232 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34233 MachineBasicBlock *SinkMBB) {
34234 MachineFunction *MF = TrueMBB->getParent();
34235 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34236 const MIMetadata MIMD(*MIItBegin);
34238 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34239 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34241 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34243 // As we are creating the PHIs, we have to be careful if there is more than
34244 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34245 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34246 // That also means that PHI construction must work forward from earlier to
34247 // later, and that the code must maintain a mapping from earlier PHI's
34248 // destination registers, and the registers that went into the PHI.
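  // For example, if a later CMOV consumes the result of an earlier CMOV, its
  // PHI cannot use that result directly; on the FalseMBB edge it must use the
  // value the earlier CMOV receives from FalseMBB, and on the TrueMBB edge
  // the value it receives from TrueMBB, which is exactly the pair recorded in
  // RegRewriteTable below.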
34249 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34250 MachineInstrBuilder MIB;
34252 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34253 Register DestReg = MIIt->getOperand(0).getReg();
34254 Register Op1Reg = MIIt->getOperand(1).getReg();
34255 Register Op2Reg = MIIt->getOperand(2).getReg();
34257 // If this CMOV we are generating is the opposite condition from
34258 // the jump we generated, then we have to swap the operands for the
34259 // PHI that is going to be generated.
34260 if (MIIt->getOperand(3).getImm() == OppCC)
34261 std::swap(Op1Reg, Op2Reg);
34263 if (RegRewriteTable.contains(Op1Reg))
34264 Op1Reg = RegRewriteTable[Op1Reg].first;
34266 if (RegRewriteTable.contains(Op2Reg))
34267 Op2Reg = RegRewriteTable[Op2Reg].second;
34269 MIB =
34270 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34271 .addReg(Op1Reg)
34272 .addMBB(FalseMBB)
34273 .addReg(Op2Reg)
34274 .addMBB(TrueMBB);
34276 // Add this PHI to the rewrite table.
34277 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34280 return MIB;
34283 // Lower cascaded selects of the form (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34284 MachineBasicBlock *
34285 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34286 MachineInstr &SecondCascadedCMOV,
34287 MachineBasicBlock *ThisMBB) const {
34288 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34289 const MIMetadata MIMD(FirstCMOV);
34291 // We lower cascaded CMOVs such as
34293 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34295 // to two successive branches.
34297 // Without this, we would add a PHI between the two jumps, which ends up
34298 // creating a few copies all around. For instance, for
34300 // (sitofp (zext (fcmp une)))
34302 // we would generate:
34304 // ucomiss %xmm1, %xmm0
34305 // movss <1.0f>, %xmm0
34306 // movaps %xmm0, %xmm1
34307 // jne .LBB5_2
34308 // xorps %xmm1, %xmm1
34309 // .LBB5_2:
34310 // jp .LBB5_4
34311 // movaps %xmm1, %xmm0
34312 // .LBB5_4:
34313 // retq
34315 // because this custom-inserter would have generated:
34317 // A
34318 // | \
34319 // | B
34320 // | /
34321 // C
34322 // | \
34323 // | D
34324 // | /
34325 // E
34327 // A: X = ...; Y = ...
34328 // B: empty
34329 // C: Z = PHI [X, A], [Y, B]
34330 // D: empty
34331 // E: PHI [X, C], [Z, D]
34333 // If we lower both CMOVs in a single step, we can instead generate:
34335 // A
34336 // | \
34337 // | C
34338 // | /|
34339 // |/ |
34340 // | |
34341 // | D
34342 // | /
34343 // E
34345 // A: X = ...; Y = ...
34346 // D: empty
34347 // E: PHI [X, A], [X, C], [Y, D]
34349 // Which, in our sitofp/fcmp example, gives us something like:
34351 // ucomiss %xmm1, %xmm0
34352 // movss <1.0f>, %xmm0
34353 // jne .LBB5_4
34354 // jp .LBB5_4
34355 // xorps %xmm0, %xmm0
34356 // .LBB5_4:
34357 // retq
34360 // We lower cascaded CMOV into two successive branches to the same block.
34361 // EFLAGS is used by both, so mark it as live in the second.
34362 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34363 MachineFunction *F = ThisMBB->getParent();
34364 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34365 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34366 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34368 MachineFunction::iterator It = ++ThisMBB->getIterator();
34369 F->insert(It, FirstInsertedMBB);
34370 F->insert(It, SecondInsertedMBB);
34371 F->insert(It, SinkMBB);
34373 // For a cascaded CMOV, we lower it to two successive branches to
34374 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34375 // the FirstInsertedMBB.
34376 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34378 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34379 // live into the sink and copy blocks.
34380 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34381 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34382 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34383 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34384 SinkMBB->addLiveIn(X86::EFLAGS);
34387 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34388 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34389 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34390 ThisMBB->end());
34391 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34393 // Fallthrough block for ThisMBB.
34394 ThisMBB->addSuccessor(FirstInsertedMBB);
34395 // The true block target of the first branch is always SinkMBB.
34396 ThisMBB->addSuccessor(SinkMBB);
34397 // Fallthrough block for FirstInsertedMBB.
34398 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34399 // The true block for the branch of FirstInsertedMBB.
34400 FirstInsertedMBB->addSuccessor(SinkMBB);
34402 // SecondInsertedMBB falls through to SinkMBB.
34402 SecondInsertedMBB->addSuccessor(SinkMBB);
34404 // Create the conditional branch instructions.
34405 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34406 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34408 X86::CondCode SecondCC =
34409 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34410 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34411 .addMBB(SinkMBB)
34412 .addImm(SecondCC);
34414 // SinkMBB:
34415 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34416 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34417 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34418 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34419 MachineInstrBuilder MIB =
34420 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34421 .addReg(Op1Reg)
34422 .addMBB(SecondInsertedMBB)
34423 .addReg(Op2Reg)
34424 .addMBB(ThisMBB);
34426 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
34427 // (the True operand of the SELECT_CC/CMOV nodes).
34428 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34430 // Now remove the CMOVs.
34431 FirstCMOV.eraseFromParent();
34432 SecondCascadedCMOV.eraseFromParent();
34434 return SinkMBB;
34437 MachineBasicBlock *
34438 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34439 MachineBasicBlock *ThisMBB) const {
34440 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34441 const MIMetadata MIMD(MI);
34443 // To "insert" a SELECT_CC instruction, we actually have to insert the
34444 // diamond control-flow pattern. The incoming instruction knows the
34445 // destination vreg to set, the condition code register to branch on, the
34446 // true/false values to select between and a branch opcode to use.
34448 // ThisMBB:
34449 // ...
34450 // TrueVal = ...
34451 // cmpTY ccX, r1, r2
34452 // bCC copy1MBB
34453 // fallthrough --> FalseMBB
34455 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34456 // as described above, by inserting a BB, and then making a PHI at the join
34457 // point to select the true and false operands of the CMOV in the PHI.
34459 // The code also handles two different cases of multiple CMOV opcodes
34460 // in a row.
34462 // Case 1:
34463 // In this case, there are multiple CMOVs in a row, all which are based on
34464 // the same condition setting (or the exact opposite condition setting).
34465 // In this case we can lower all the CMOVs using a single inserted BB, and
34466 // then make a number of PHIs at the join point to model the CMOVs. The only
34467 // trickiness here is that in a case like:
34469 // t2 = CMOV cond1 t1, f1
34470 // t3 = CMOV cond1 t2, f2
34472 // when rewriting this into PHIs, we have to perform some renaming on the
34473 // temps since you cannot have a PHI operand refer to a PHI result earlier
34474 // in the same block. The "simple" but wrong lowering would be:
34476 // t2 = PHI t1(BB1), f1(BB2)
34477 // t3 = PHI t2(BB1), f2(BB2)
34479 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34480 // renaming is to note that on the path through BB1, t2 is really just a
34481 // copy of t1, and do that renaming, properly generating:
34483 // t2 = PHI t1(BB1), f1(BB2)
34484 // t3 = PHI t1(BB1), f2(BB2)
34486 // Case 2:
34487 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34488 // function - EmitLoweredCascadedSelect.
34490 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34491 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34492 MachineInstr *LastCMOV = &MI;
34493 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34495 // Check for case 1, where there are multiple CMOVs with the same condition
34496 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34497 // number of jumps the most.
34499 if (isCMOVPseudo(MI)) {
34500 // See if we have a string of CMOVS with the same condition. Skip over
34501 // intervening debug insts.
34502 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34503 (NextMIIt->getOperand(3).getImm() == CC ||
34504 NextMIIt->getOperand(3).getImm() == OppCC)) {
34505 LastCMOV = &*NextMIIt;
34506 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34510 // Check for case 2, but only if we didn't already find case 1, as indicated
34511 // by LastCMOV still pointing at MI.
34512 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34513 NextMIIt->getOpcode() == MI.getOpcode() &&
34514 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34515 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34516 NextMIIt->getOperand(1).isKill()) {
34517 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34520 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34521 MachineFunction *F = ThisMBB->getParent();
34522 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34523 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34525 MachineFunction::iterator It = ++ThisMBB->getIterator();
34526 F->insert(It, FalseMBB);
34527 F->insert(It, SinkMBB);
34529 // Set the call frame size on entry to the new basic blocks.
34530 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
34531 FalseMBB->setCallFrameSize(CallFrameSize);
34532 SinkMBB->setCallFrameSize(CallFrameSize);
34534 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34535 // live into the sink and copy blocks.
34536 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34537 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34538 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34539 FalseMBB->addLiveIn(X86::EFLAGS);
34540 SinkMBB->addLiveIn(X86::EFLAGS);
34543 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34544 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34545 MachineBasicBlock::iterator(LastCMOV));
34546 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34547 if (MI.isDebugInstr())
34548 SinkMBB->push_back(MI.removeFromParent());
34550 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34551 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34552 std::next(MachineBasicBlock::iterator(LastCMOV)),
34553 ThisMBB->end());
34554 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34556 // Fallthrough block for ThisMBB.
34557 ThisMBB->addSuccessor(FalseMBB);
34558 // The true block target of the first (or only) branch is always SinkMBB.
34559 ThisMBB->addSuccessor(SinkMBB);
34560 // Fallthrough block for FalseMBB.
34561 FalseMBB->addSuccessor(SinkMBB);
34563 // Create the conditional branch instruction.
34564 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34566 // SinkMBB:
34567 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34568 // ...
34569 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34570 MachineBasicBlock::iterator MIItEnd =
34571 std::next(MachineBasicBlock::iterator(LastCMOV));
34572 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34574 // Now remove the CMOV(s).
34575 ThisMBB->erase(MIItBegin, MIItEnd);
34577 return SinkMBB;
34580 static unsigned getSUBriOpcode(bool IsLP64) {
34581 if (IsLP64)
34582 return X86::SUB64ri32;
34583 else
34584 return X86::SUB32ri;
34587 MachineBasicBlock *
34588 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34589 MachineBasicBlock *MBB) const {
34590 MachineFunction *MF = MBB->getParent();
34591 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34592 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34593 const MIMetadata MIMD(MI);
34594 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34596 const unsigned ProbeSize = getStackProbeSize(*MF);
34598 MachineRegisterInfo &MRI = MF->getRegInfo();
34599 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34600 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34601 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34603 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34604 MF->insert(MBBIter, testMBB);
34605 MF->insert(MBBIter, blockMBB);
34606 MF->insert(MBBIter, tailMBB);
34608 Register sizeVReg = MI.getOperand(1).getReg();
34610 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34612 Register TmpStackPtr = MRI.createVirtualRegister(
34613 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34614 Register FinalStackPtr = MRI.createVirtualRegister(
34615 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34617 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
34618 .addReg(physSPReg);
34620 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34621 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
34622 .addReg(TmpStackPtr)
34623 .addReg(sizeVReg);
34626 // test rsp size
34628 BuildMI(testMBB, MIMD,
34629 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34630 .addReg(FinalStackPtr)
34631 .addReg(physSPReg);
34633 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
34634 .addMBB(tailMBB)
34635 .addImm(X86::COND_GE);
34636 testMBB->addSuccessor(blockMBB);
34637 testMBB->addSuccessor(tailMBB);
34639 // Touch the block then extend it. This is the opposite order from a static
34640 // probe, where we allocate then touch; doing it this way avoids having to
34641 // probe the tail of the static alloca. Possible scenarios are:
34643 // + ---- <- ------------ <- ------------- <- ------------ +
34644 // | |
34645 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34646 // | |
34647 // + <- ----------- <- ------------ <- ----------- <- ------------ +
34649 // The property we want to enforce is to never have more than [page alloc] between two probes.
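//
// Schematically, the code emitted below behaves roughly like:
//
//   testMBB:
//     if (FinalStackPtr >= SP) goto tailMBB   // allocation finished
//   blockMBB:
//     xor  $0, (SP)                           // touch the current page
//     sub  $ProbeSize, SP                     // extend by one probe interval
//     jmp  testMBB
//   tailMBB:
//     <result of the pseudo> = FinalStackPtr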
34651 const unsigned XORMIOpc =
34652 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
34653 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
34654 .addImm(0);
34656 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
34657 physSPReg)
34658 .addReg(physSPReg)
34659 .addImm(ProbeSize);
34661 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
34662 blockMBB->addSuccessor(testMBB);
34665 // Define the result of the original pseudo to be the expected final stack pointer.
34665 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
34666 MI.getOperand(0).getReg())
34667 .addReg(FinalStackPtr);
34669 tailMBB->splice(tailMBB->end(), MBB,
34670 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34671 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34672 MBB->addSuccessor(testMBB);
34674 // Delete the original pseudo instruction.
34675 MI.eraseFromParent();
34677 // And we're done.
34678 return tailMBB;
34681 MachineBasicBlock *
34682 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34683 MachineBasicBlock *BB) const {
34684 MachineFunction *MF = BB->getParent();
34685 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34686 const MIMetadata MIMD(MI);
34687 const BasicBlock *LLVM_BB = BB->getBasicBlock();
34689 assert(MF->shouldSplitStack());
34691 const bool Is64Bit = Subtarget.is64Bit();
34692 const bool IsLP64 = Subtarget.isTarget64BitLP64();
34694 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34695 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
34697 // BB:
34698 // ... [Till the alloca]
34699 // If stacklet is not large enough, jump to mallocMBB
34701 // bumpMBB:
34702 // Allocate by subtracting from RSP
34703 // Jump to continueMBB
34705 // mallocMBB:
34706 // Allocate by call to runtime
34708 // continueMBB:
34709 // ...
34710 // [rest of original BB]
34713 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34714 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34715 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34717 MachineRegisterInfo &MRI = MF->getRegInfo();
34718 const TargetRegisterClass *AddrRegClass =
34719 getRegClassFor(getPointerTy(MF->getDataLayout()));
34721 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34722 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34723 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34724 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34725 sizeVReg = MI.getOperand(1).getReg(),
34726 physSPReg =
34727 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34729 MachineFunction::iterator MBBIter = ++BB->getIterator();
34731 MF->insert(MBBIter, bumpMBB);
34732 MF->insert(MBBIter, mallocMBB);
34733 MF->insert(MBBIter, continueMBB);
34735 continueMBB->splice(continueMBB->begin(), BB,
34736 std::next(MachineBasicBlock::iterator(MI)), BB->end());
34737 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34739 // Add code to the main basic block to check if the stack limit has been hit,
34740 // and if so, jump to mallocMBB otherwise to bumpMBB.
34741 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34742 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34743 .addReg(tmpSPVReg).addReg(sizeVReg);
34744 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34745 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34746 .addReg(SPLimitVReg);
34747 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
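// That is, SPLimitVReg = SP - size is the stack pointer we would end up with;
// it is compared against the stacklet limit kept in TLS (%fs:0x70 for LP64,
// %fs:0x40 for x32, %gs:0x30 for 32-bit), and if the limit is greater the
// stacklet is too small, so we take the mallocMBB path instead.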
34749 // bumpMBB simply decreases the stack pointer, since we know the current
34750 // stacklet has enough space.
34751 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
34752 .addReg(SPLimitVReg);
34753 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34754 .addReg(SPLimitVReg);
34755 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
34757 // Calls into a routine in libgcc to allocate more space from the heap.
34758 const uint32_t *RegMask =
34759 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34760 if (IsLP64) {
34761 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
34762 .addReg(sizeVReg);
34763 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
34764 .addExternalSymbol("__morestack_allocate_stack_space")
34765 .addRegMask(RegMask)
34766 .addReg(X86::RDI, RegState::Implicit)
34767 .addReg(X86::RAX, RegState::ImplicitDefine);
34768 } else if (Is64Bit) {
34769 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
34770 .addReg(sizeVReg);
34771 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
34772 .addExternalSymbol("__morestack_allocate_stack_space")
34773 .addRegMask(RegMask)
34774 .addReg(X86::EDI, RegState::Implicit)
34775 .addReg(X86::EAX, RegState::ImplicitDefine);
34776 } else {
34777 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34778 .addImm(12);
34779 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34780 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
34781 .addExternalSymbol("__morestack_allocate_stack_space")
34782 .addRegMask(RegMask)
34783 .addReg(X86::EAX, RegState::ImplicitDefine);
34786 if (!Is64Bit)
34787 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34788 .addImm(16);
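// The SUB 12 / PUSH / ADD 16 pattern above keeps the overall adjustment a
// multiple of 16 bytes around the 32-bit call (12 bytes of padding plus the
// 4-byte size argument), and both are popped together by the ADD above.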
34790 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34791 .addReg(IsLP64 ? X86::RAX : X86::EAX);
34792 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
34794 // Set up the CFG correctly.
34795 BB->addSuccessor(bumpMBB);
34796 BB->addSuccessor(mallocMBB);
34797 mallocMBB->addSuccessor(continueMBB);
34798 bumpMBB->addSuccessor(continueMBB);
34800 // Take care of the PHI nodes.
34801 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
34802 MI.getOperand(0).getReg())
34803 .addReg(mallocPtrVReg)
34804 .addMBB(mallocMBB)
34805 .addReg(bumpSPPtrVReg)
34806 .addMBB(bumpMBB);
34808 // Delete the original pseudo instruction.
34809 MI.eraseFromParent();
34811 // And we're done.
34812 return continueMBB;
34815 MachineBasicBlock *
34816 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34817 MachineBasicBlock *BB) const {
34818 MachineFunction *MF = BB->getParent();
34819 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34820 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34821 const MIMetadata MIMD(MI);
34823 assert(!isAsynchronousEHPersonality(
34824 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34825 "SEH does not use catchret!");
34827 // Only 32-bit EH needs to worry about manually restoring stack pointers.
34828 if (!Subtarget.is32Bit())
34829 return BB;
34831 // C++ EH creates a new target block to hold the restore code, and wires up
34832 // the new block to the return destination with a normal JMP_4.
34833 MachineBasicBlock *RestoreMBB =
34834 MF->CreateMachineBasicBlock(BB->getBasicBlock());
34835 assert(BB->succ_size() == 1);
34836 MF->insert(std::next(BB->getIterator()), RestoreMBB);
34837 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34838 BB->addSuccessor(RestoreMBB);
34839 MI.getOperand(0).setMBB(RestoreMBB);
34841 // Marking this as an EH pad but not a funclet entry block causes PEI to
34842 // restore stack pointers in the block.
34843 RestoreMBB->setIsEHPad(true);
34845 auto RestoreMBBI = RestoreMBB->begin();
34846 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34847 return BB;
34850 MachineBasicBlock *
34851 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34852 MachineBasicBlock *BB) const {
34853 // So, here we replace TLSADDR with the sequence:
34854 // adjust_stackdown -> TLSADDR -> adjust_stackup.
34855 // We need this because TLSADDR is lowered into calls
34856 // inside MC; therefore, without the two markers, shrink-wrapping
34857 // may push the prologue/epilogue past them.
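// That is, the block ends up containing roughly:
//   ADJCALLSTACKDOWN 0, 0, 0   (CALLSEQ_START)
//   TLS_addr / TLS_base_addr pseudo, expanded into the actual call in MC
//   ADJCALLSTACKUP 0, 0        (CALLSEQ_END)
// The zero operands mean no real SP adjustment is requested; the markers only
// bracket the call for the benefit of shrink-wrapping and frame lowering.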
34858 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34859 const MIMetadata MIMD(MI);
34860 MachineFunction &MF = *BB->getParent();
34862 // Emit CALLSEQ_START right before the instruction.
34863 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34864 MachineInstrBuilder CallseqStart =
34865 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34866 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34868 // Emit CALLSEQ_END right after the instruction.
34869 // We don't call erase from parent because we want to keep the
34870 // original instruction around.
34871 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34872 MachineInstrBuilder CallseqEnd =
34873 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
34874 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34876 return BB;
34879 MachineBasicBlock *
34880 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34881 MachineBasicBlock *BB) const {
34882 // This is pretty easy. We're taking the value that we received from
34883 // our load from the relocation, sticking it in either RDI (x86-64)
34884 // or EAX and doing an indirect call. The return value will then
34885 // be in the normal return register.
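// Schematically, the 64-bit sequence emitted below is:
//   movq _var@TLVP(%rip), %rdi   ; load the TLV descriptor address
//   callq *(%rdi)                ; call through the descriptor; result in %rax
// The 32-bit variants do the same through EAX, loading the descriptor either
// absolutely or relative to the PIC base register.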
34886 MachineFunction *F = BB->getParent();
34887 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34888 const MIMetadata MIMD(MI);
34890 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34891 assert(MI.getOperand(3).isGlobal() && "This should be a global");
34893 // Get a register mask for the lowered call.
34894 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34895 // proper register mask.
34896 const uint32_t *RegMask =
34897 Subtarget.is64Bit() ?
34898 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34899 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34900 if (Subtarget.is64Bit()) {
34901 MachineInstrBuilder MIB =
34902 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
34903 .addReg(X86::RIP)
34904 .addImm(0)
34905 .addReg(0)
34906 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34907 MI.getOperand(3).getTargetFlags())
34908 .addReg(0);
34909 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
34910 addDirectMem(MIB, X86::RDI);
34911 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34912 } else if (!isPositionIndependent()) {
34913 MachineInstrBuilder MIB =
34914 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
34915 .addReg(0)
34916 .addImm(0)
34917 .addReg(0)
34918 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34919 MI.getOperand(3).getTargetFlags())
34920 .addReg(0);
34921 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
34922 addDirectMem(MIB, X86::EAX);
34923 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34924 } else {
34925 MachineInstrBuilder MIB =
34926 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
34927 .addReg(TII->getGlobalBaseReg(F))
34928 .addImm(0)
34929 .addReg(0)
34930 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34931 MI.getOperand(3).getTargetFlags())
34932 .addReg(0);
34933 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
34934 addDirectMem(MIB, X86::EAX);
34935 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34938 MI.eraseFromParent(); // The pseudo instruction is gone now.
34939 return BB;
34942 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34943 switch (RPOpc) {
34944 case X86::INDIRECT_THUNK_CALL32:
34945 return X86::CALLpcrel32;
34946 case X86::INDIRECT_THUNK_CALL64:
34947 return X86::CALL64pcrel32;
34948 case X86::INDIRECT_THUNK_TCRETURN32:
34949 return X86::TCRETURNdi;
34950 case X86::INDIRECT_THUNK_TCRETURN64:
34951 return X86::TCRETURNdi64;
34953 llvm_unreachable("not indirect thunk opcode");
34956 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34957 unsigned Reg) {
34958 if (Subtarget.useRetpolineExternalThunk()) {
34959 // When using an external thunk for retpolines, we pick names that match the
34960 // names GCC happens to use as well. This helps simplify the implementation
34961 // of the thunks for kernels where they have no easy ability to create
34962 // aliases and are doing non-trivial configuration of the thunk's body. For
34963 // example, the Linux kernel will do boot-time hot patching of the thunk
34964 // bodies and cannot easily export aliases of these to loaded modules.
34966 // Note that at any point in the future, we may need to change the semantics
34967 // of how we implement retpolines and at that time will likely change the
34968 // name of the called thunk. Essentially, there is no hard guarantee that
34969 // LLVM will generate calls to specific thunks; we merely make a best-effort
34970 // attempt to help out kernels and other systems where duplicating the
34971 // thunks is costly.
34972 switch (Reg) {
34973 case X86::EAX:
34974 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34975 return "__x86_indirect_thunk_eax";
34976 case X86::ECX:
34977 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34978 return "__x86_indirect_thunk_ecx";
34979 case X86::EDX:
34980 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34981 return "__x86_indirect_thunk_edx";
34982 case X86::EDI:
34983 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34984 return "__x86_indirect_thunk_edi";
34985 case X86::R11:
34986 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34987 return "__x86_indirect_thunk_r11";
34989 llvm_unreachable("unexpected reg for external indirect thunk");
34992 if (Subtarget.useRetpolineIndirectCalls() ||
34993 Subtarget.useRetpolineIndirectBranches()) {
34994 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34995 switch (Reg) {
34996 case X86::EAX:
34997 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34998 return "__llvm_retpoline_eax";
34999 case X86::ECX:
35000 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35001 return "__llvm_retpoline_ecx";
35002 case X86::EDX:
35003 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35004 return "__llvm_retpoline_edx";
35005 case X86::EDI:
35006 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35007 return "__llvm_retpoline_edi";
35008 case X86::R11:
35009 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35010 return "__llvm_retpoline_r11";
35012 llvm_unreachable("unexpected reg for retpoline");
35015 if (Subtarget.useLVIControlFlowIntegrity()) {
35016 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35017 return "__llvm_lvi_thunk_r11";
35019 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35022 MachineBasicBlock *
35023 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35024 MachineBasicBlock *BB) const {
35025 // Copy the virtual register into the R11 physical register and
35026 // call the retpoline thunk.
35027 const MIMetadata MIMD(MI);
35028 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35029 Register CalleeVReg = MI.getOperand(0).getReg();
35030 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35032 // Find an available scratch register to hold the callee. On 64-bit, we can
35033 // just use R11, but we scan for uses anyway to ensure we don't generate
35034 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35035 // already a register use operand to the call to hold the callee. If none
35036 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35037 // register and ESI is the base pointer to realigned stack frames with VLAs.
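// The net effect is to rewrite, e.g.,
//   INDIRECT_THUNK_CALL64 %callee
// into
//   $r11 = COPY %callee
//   CALL64pcrel32 &__llvm_retpoline_r11, implicit killed $r11
// (or the external/LVI thunk name), so the indirect target is only ever
// reached through the thunk.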
35038 SmallVector<unsigned, 3> AvailableRegs;
35039 if (Subtarget.is64Bit())
35040 AvailableRegs.push_back(X86::R11);
35041 else
35042 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35044 // Zero out any registers that are already used.
35045 for (const auto &MO : MI.operands()) {
35046 if (MO.isReg() && MO.isUse())
35047 for (unsigned &Reg : AvailableRegs)
35048 if (Reg == MO.getReg())
35049 Reg = 0;
35052 // Choose the first remaining non-zero available register.
35053 unsigned AvailableReg = 0;
35054 for (unsigned MaybeReg : AvailableRegs) {
35055 if (MaybeReg) {
35056 AvailableReg = MaybeReg;
35057 break;
35060 if (!AvailableReg)
35061 report_fatal_error("calling convention incompatible with retpoline, no "
35062 "available registers");
35064 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35066 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35067 .addReg(CalleeVReg);
35068 MI.getOperand(0).ChangeToES(Symbol);
35069 MI.setDesc(TII->get(Opc));
35070 MachineInstrBuilder(*BB->getParent(), &MI)
35071 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35072 return BB;
35075 /// A SetJmp implies a future change of control flow when the corresponding
35076 /// LongJmp is called.
35077 /// Instead of using the 'return' instruction, the long jump fixes the stack and
35078 /// performs an indirect branch. To do so it uses the registers that were stored
35079 /// in the jump buffer (when calling SetJmp).
35080 /// If the shadow stack is enabled, we need to fix it as well, because some
35081 /// return addresses will be skipped.
35082 /// This function saves the SSP so that it can be fixed up later by
35083 /// emitLongJmpShadowStackFix.
35084 /// \sa emitLongJmpShadowStackFix
35085 /// \param [in] MI The temporary Machine Instruction for the builtin.
35086 /// \param [in] MBB The Machine Basic Block that will be modified.
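/// The pointer-sized slots of the buffer, as used by the SjLj lowering here,
/// are: slot 0 - frame pointer, slot 1 - resume address (label), slot 2 -
/// stack pointer, slot 3 - shadow stack pointer (written only when CET's
/// "cf-protection-return" is enabled).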
35087 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35088 MachineBasicBlock *MBB) const {
35089 const MIMetadata MIMD(MI);
35090 MachineFunction *MF = MBB->getParent();
35091 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35092 MachineRegisterInfo &MRI = MF->getRegInfo();
35093 MachineInstrBuilder MIB;
35095 // Memory Reference.
35096 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35097 MI.memoperands_end());
35099 // Initialize a register with zero.
35100 MVT PVT = getPointerTy(MF->getDataLayout());
35101 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35102 Register ZReg = MRI.createVirtualRegister(PtrRC);
35103 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35104 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35105 .addDef(ZReg)
35106 .addReg(ZReg, RegState::Undef)
35107 .addReg(ZReg, RegState::Undef);
35109 // Read the current SSP Register value to the zeroed register.
35110 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35111 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35112 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35114 // Write the SSP register value to slot 3 (offset 3 * PtrSize) of the input memory buffer.
35115 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35116 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35117 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35118 const unsigned MemOpndSlot = 1;
35119 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35120 if (i == X86::AddrDisp)
35121 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35122 else
35123 MIB.add(MI.getOperand(MemOpndSlot + i));
35125 MIB.addReg(SSPCopyReg);
35126 MIB.setMemRefs(MMOs);
35129 MachineBasicBlock *
35130 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35131 MachineBasicBlock *MBB) const {
35132 const MIMetadata MIMD(MI);
35133 MachineFunction *MF = MBB->getParent();
35134 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35135 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35136 MachineRegisterInfo &MRI = MF->getRegInfo();
35138 const BasicBlock *BB = MBB->getBasicBlock();
35139 MachineFunction::iterator I = ++MBB->getIterator();
35141 // Memory Reference
35142 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35143 MI.memoperands_end());
35145 unsigned DstReg;
35146 unsigned MemOpndSlot = 0;
35148 unsigned CurOp = 0;
35150 DstReg = MI.getOperand(CurOp++).getReg();
35151 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35152 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35153 (void)TRI;
35154 Register mainDstReg = MRI.createVirtualRegister(RC);
35155 Register restoreDstReg = MRI.createVirtualRegister(RC);
35157 MemOpndSlot = CurOp;
35159 MVT PVT = getPointerTy(MF->getDataLayout());
35160 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35161 "Invalid Pointer Size!");
35163 // For v = setjmp(buf), we generate
35165 // thisMBB:
35166 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35167 // SjLjSetup restoreMBB
35169 // mainMBB:
35170 // v_main = 0
35172 // sinkMBB:
35173 // v = phi(main, restore)
35175 // restoreMBB:
35176 // if base pointer being used, load it from frame
35177 // v_restore = 1
35179 MachineBasicBlock *thisMBB = MBB;
35180 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35181 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35182 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35183 MF->insert(I, mainMBB);
35184 MF->insert(I, sinkMBB);
35185 MF->push_back(restoreMBB);
35186 restoreMBB->setMachineBlockAddressTaken();
35188 MachineInstrBuilder MIB;
35190 // Transfer the remainder of BB and its successor edges to sinkMBB.
35191 sinkMBB->splice(sinkMBB->begin(), MBB,
35192 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35193 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35195 // thisMBB:
35196 unsigned PtrStoreOpc = 0;
35197 unsigned LabelReg = 0;
35198 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35199 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35200 !isPositionIndependent();
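// With the small code model and no PIC, the address of restoreMBB fits in a
// 32-bit immediate and can be stored into the buffer directly; otherwise it
// is first materialized into a register with an LEA below.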
35202 // Prepare IP either in reg or imm.
35203 if (!UseImmLabel) {
35204 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35205 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35206 LabelReg = MRI.createVirtualRegister(PtrRC);
35207 if (Subtarget.is64Bit()) {
35208 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35209 .addReg(X86::RIP)
35210 .addImm(0)
35211 .addReg(0)
35212 .addMBB(restoreMBB)
35213 .addReg(0);
35214 } else {
35215 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35216 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35217 .addReg(XII->getGlobalBaseReg(MF))
35218 .addImm(0)
35219 .addReg(0)
35220 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35221 .addReg(0);
35223 } else
35224 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35225 // Store IP
35226 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35227 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35228 if (i == X86::AddrDisp)
35229 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35230 else
35231 MIB.add(MI.getOperand(MemOpndSlot + i));
35233 if (!UseImmLabel)
35234 MIB.addReg(LabelReg);
35235 else
35236 MIB.addMBB(restoreMBB);
35237 MIB.setMemRefs(MMOs);
35239 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35240 emitSetJmpShadowStackFix(MI, thisMBB);
35243 // Setup
35244 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35245 .addMBB(restoreMBB);
35247 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35248 MIB.addRegMask(RegInfo->getNoPreservedMask());
35249 thisMBB->addSuccessor(mainMBB);
35250 thisMBB->addSuccessor(restoreMBB);
35252 // mainMBB:
35253 // EAX = 0
35254 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35255 mainMBB->addSuccessor(sinkMBB);
35257 // sinkMBB:
35258 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35259 .addReg(mainDstReg)
35260 .addMBB(mainMBB)
35261 .addReg(restoreDstReg)
35262 .addMBB(restoreMBB);
35264 // restoreMBB:
35265 if (RegInfo->hasBasePointer(*MF)) {
35266 const bool Uses64BitFramePtr =
35267 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35268 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35269 X86FI->setRestoreBasePointer(MF);
35270 Register FramePtr = RegInfo->getFrameRegister(*MF);
35271 Register BasePtr = RegInfo->getBaseRegister();
35272 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35273 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35274 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35275 .setMIFlag(MachineInstr::FrameSetup);
35277 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35278 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35279 restoreMBB->addSuccessor(sinkMBB);
35281 MI.eraseFromParent();
35282 return sinkMBB;
35285 /// Fix the shadow stack using the previously saved SSP pointer.
35286 /// \sa emitSetJmpShadowStackFix
35287 /// \param [in] MI The temporary Machine Instruction for the builtin.
35288 /// \param [in] MBB The Machine Basic Block that will be modified.
35289 /// \return The sink MBB that will perform the future indirect branch.
35290 MachineBasicBlock *
35291 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35292 MachineBasicBlock *MBB) const {
35293 const MIMetadata MIMD(MI);
35294 MachineFunction *MF = MBB->getParent();
35295 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35296 MachineRegisterInfo &MRI = MF->getRegInfo();
35298 // Memory Reference
35299 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35300 MI.memoperands_end());
35302 MVT PVT = getPointerTy(MF->getDataLayout());
35303 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35305 // checkSspMBB:
35306 // xor vreg1, vreg1
35307 // rdssp vreg1
35308 // test vreg1, vreg1
35309 // je sinkMBB # Jump if Shadow Stack is not supported
35310 // fallMBB:
35311 // mov buf+24/12(%rip), vreg2
35312 // sub vreg1, vreg2
35313 // jbe sinkMBB # No need to fix the Shadow Stack
35314 // fixShadowMBB:
35315 // shr 3/2, vreg2
35316 // incssp vreg2 # fix the SSP according to the lower 8 bits
35317 // shr 8, vreg2
35318 // je sinkMBB
35319 // fixShadowLoopPrepareMBB:
35320 // shl vreg2
35321 // mov 128, vreg3
35322 // fixShadowLoopMBB:
35323 // incssp vreg3
35324 // dec vreg2
35325 // jne fixShadowLoopMBB # Iterate until you finish fixing
35326 // # the Shadow Stack
35327 // sinkMBB:
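//
// For example, with PVT == i64 and a saved-minus-current delta of 0x1828
// bytes, vreg2 holds 0x1828 >> 3 == 0x305 shadow-stack entries after the
// first shift: the initial incssp advances by 0x305 & 0xff == 5 entries, the
// remaining 0x3 is shifted left into a loop count of 6, and each of the 6
// loop iterations advances by another 128 entries (6 * 128 + 5 == 0x305).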
35329 MachineFunction::iterator I = ++MBB->getIterator();
35330 const BasicBlock *BB = MBB->getBasicBlock();
35332 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35333 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35334 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35335 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35336 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35337 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35338 MF->insert(I, checkSspMBB);
35339 MF->insert(I, fallMBB);
35340 MF->insert(I, fixShadowMBB);
35341 MF->insert(I, fixShadowLoopPrepareMBB);
35342 MF->insert(I, fixShadowLoopMBB);
35343 MF->insert(I, sinkMBB);
35345 // Transfer the remainder of BB and its successor edges to sinkMBB.
35346 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35347 MBB->end());
35348 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35350 MBB->addSuccessor(checkSspMBB);
35352 // Initialize a register with zero.
35353 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35354 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35356 if (PVT == MVT::i64) {
35357 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35358 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35359 .addImm(0)
35360 .addReg(ZReg)
35361 .addImm(X86::sub_32bit);
35362 ZReg = TmpZReg;
35365 // Read the current SSP Register value to the zeroed register.
35366 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35367 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35368 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35370 // Check whether the result of the SSP register is zero and jump directly
35371 // to the sink.
35372 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35373 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35374 .addReg(SSPCopyReg)
35375 .addReg(SSPCopyReg);
35376 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35377 .addMBB(sinkMBB)
35378 .addImm(X86::COND_E);
35379 checkSspMBB->addSuccessor(sinkMBB);
35380 checkSspMBB->addSuccessor(fallMBB);
35382 // Reload the previously saved SSP register value.
35383 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35384 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35385 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35386 MachineInstrBuilder MIB =
35387 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35388 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35389 const MachineOperand &MO = MI.getOperand(i);
35390 if (i == X86::AddrDisp)
35391 MIB.addDisp(MO, SPPOffset);
35392 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35393 // preserve kill flags.
35394 MIB.addReg(MO.getReg());
35395 else
35396 MIB.add(MO);
35398 MIB.setMemRefs(MMOs);
35400 // Subtract the current SSP from the previous SSP.
35401 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35402 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35403 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35404 .addReg(PrevSSPReg)
35405 .addReg(SSPCopyReg);
35407 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35408 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35409 .addMBB(sinkMBB)
35410 .addImm(X86::COND_BE);
35411 fallMBB->addSuccessor(sinkMBB);
35412 fallMBB->addSuccessor(fixShadowMBB);
35414 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35415 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35416 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35417 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35418 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35419 .addReg(SspSubReg)
35420 .addImm(Offset);
35422 // Advance the SSP by the lower 8 bits of the delta (incssp only uses the low 8 bits of its operand).
35423 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35424 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35426 // Reset the lower 8 bits.
35427 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35428 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35429 .addReg(SspFirstShrReg)
35430 .addImm(8);
35432 // Jump if the result of the shift is zero.
35433 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35434 .addMBB(sinkMBB)
35435 .addImm(X86::COND_E);
35436 fixShadowMBB->addSuccessor(sinkMBB);
35437 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35439 // Do a single shift left.
35440 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35441 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35442 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35443 .addReg(SspSecondShrReg)
35444 .addImm(1);
35446 // Save the value 128 to a register (will be used next with incssp).
35447 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35448 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35449 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35450 .addImm(128);
35451 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35453 // Since incssp only looks at the lower 8 bits, we might need to do several
35454 // iterations of incssp until we finish fixing the shadow stack.
35455 Register DecReg = MRI.createVirtualRegister(PtrRC);
35456 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35457 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35458 .addReg(SspAfterShlReg)
35459 .addMBB(fixShadowLoopPrepareMBB)
35460 .addReg(DecReg)
35461 .addMBB(fixShadowLoopMBB);
35463 // Every iteration we increase the SSP by 128.
35464 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35466 // Every iteration we decrement the counter by 1.
35467 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35468 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35470 // Jump if the counter is not zero yet.
35471 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35472 .addMBB(fixShadowLoopMBB)
35473 .addImm(X86::COND_NE);
35474 fixShadowLoopMBB->addSuccessor(sinkMBB);
35475 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35477 return sinkMBB;
35480 MachineBasicBlock *
35481 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35482 MachineBasicBlock *MBB) const {
35483 const MIMetadata MIMD(MI);
35484 MachineFunction *MF = MBB->getParent();
35485 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35486 MachineRegisterInfo &MRI = MF->getRegInfo();
35488 // Memory Reference
35489 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35490 MI.memoperands_end());
35492 MVT PVT = getPointerTy(MF->getDataLayout());
35493 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35494 "Invalid Pointer Size!");
35496 const TargetRegisterClass *RC =
35497 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35498 Register Tmp = MRI.createVirtualRegister(RC);
35499 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35500 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35501 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35502 Register SP = RegInfo->getStackRegister();
35504 MachineInstrBuilder MIB;
35506 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35507 const int64_t SPOffset = 2 * PVT.getStoreSize();
35509 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35510 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
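// The expansion below is, roughly:
//   (optionally fix the shadow stack first)
//   FP  <- load [buf + 0]           ; reload the frame pointer
//   Tmp <- load [buf + 1*PtrSize]   ; reload the destination IP
//   SP  <- load [buf + 2*PtrSize]   ; reload the stack pointer
//   jmp *Tmp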
35512 MachineBasicBlock *thisMBB = MBB;
35514 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
35515 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35516 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35519 // Reload FP
35520 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
35521 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35522 const MachineOperand &MO = MI.getOperand(i);
35523 if (MO.isReg()) // Don't add the whole operand, we don't want to
35524 // preserve kill flags.
35525 MIB.addReg(MO.getReg());
35526 else
35527 MIB.add(MO);
35529 MIB.setMemRefs(MMOs);
35531 // Reload IP
35532 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
35533 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35534 const MachineOperand &MO = MI.getOperand(i);
35535 if (i == X86::AddrDisp)
35536 MIB.addDisp(MO, LabelOffset);
35537 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35538 // preserve kill flags.
35539 MIB.addReg(MO.getReg());
35540 else
35541 MIB.add(MO);
35543 MIB.setMemRefs(MMOs);
35545 // Reload SP
35546 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
35547 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35548 if (i == X86::AddrDisp)
35549 MIB.addDisp(MI.getOperand(i), SPOffset);
35550 else
35551 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35552 // the last instruction of the expansion.
35554 MIB.setMemRefs(MMOs);
35556 // Jump
35557 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
35559 MI.eraseFromParent();
35560 return thisMBB;
35563 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35564 MachineBasicBlock *MBB,
35565 MachineBasicBlock *DispatchBB,
35566 int FI) const {
35567 const MIMetadata MIMD(MI);
35568 MachineFunction *MF = MBB->getParent();
35569 MachineRegisterInfo *MRI = &MF->getRegInfo();
35570 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35572 MVT PVT = getPointerTy(MF->getDataLayout());
35573 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35575 unsigned Op = 0;
35576 unsigned VR = 0;
35578 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35579 !isPositionIndependent();
35581 if (UseImmLabel) {
35582 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35583 } else {
35584 const TargetRegisterClass *TRC =
35585 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35586 VR = MRI->createVirtualRegister(TRC);
35587 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35589 if (Subtarget.is64Bit())
35590 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
35591 .addReg(X86::RIP)
35592 .addImm(1)
35593 .addReg(0)
35594 .addMBB(DispatchBB)
35595 .addReg(0);
35596 else
35597 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
35598 .addReg(0) /* TII->getGlobalBaseReg(MF) */
35599 .addImm(1)
35600 .addReg(0)
35601 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35602 .addReg(0);
35605 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
35606 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35607 if (UseImmLabel)
35608 MIB.addMBB(DispatchBB);
35609 else
35610 MIB.addReg(VR);
35613 MachineBasicBlock *
35614 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35615 MachineBasicBlock *BB) const {
35616 const MIMetadata MIMD(MI);
35617 MachineFunction *MF = BB->getParent();
35618 MachineRegisterInfo *MRI = &MF->getRegInfo();
35619 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35620 int FI = MF->getFrameInfo().getFunctionContextIndex();
35622 // Get a mapping of the call site numbers to all of the landing pads they're
35623 // associated with.
35624 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35625 unsigned MaxCSNum = 0;
35626 for (auto &MBB : *MF) {
35627 if (!MBB.isEHPad())
35628 continue;
35630 MCSymbol *Sym = nullptr;
35631 for (const auto &MI : MBB) {
35632 if (MI.isDebugInstr())
35633 continue;
35635 assert(MI.isEHLabel() && "expected EH_LABEL");
35636 Sym = MI.getOperand(0).getMCSymbol();
35637 break;
35640 if (!MF->hasCallSiteLandingPad(Sym))
35641 continue;
35643 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35644 CallSiteNumToLPad[CSI].push_back(&MBB);
35645 MaxCSNum = std::max(MaxCSNum, CSI);
35649 // Get an ordered list of the machine basic blocks for the jump table.
35650 std::vector<MachineBasicBlock *> LPadList;
35651 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35652 LPadList.reserve(CallSiteNumToLPad.size());
35654 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35655 for (auto &LP : CallSiteNumToLPad[CSI]) {
35656 LPadList.push_back(LP);
35657 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35661 assert(!LPadList.empty() &&
35662 "No landing pad destinations for the dispatch jump table!");
35664 // Create the MBBs for the dispatch code.
35666 // Shove the dispatch's address into the return slot in the function context.
35667 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35668 DispatchBB->setIsEHPad(true);
35670 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35671 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
35672 DispatchBB->addSuccessor(TrapBB);
35674 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35675 DispatchBB->addSuccessor(DispContBB);
35677 // Insert MBBs.
35678 MF->push_back(DispatchBB);
35679 MF->push_back(DispContBB);
35680 MF->push_back(TrapBB);
35682 // Insert code into the entry block that creates and registers the function
35683 // context.
35684 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35686 // Create the jump table and associated information
35687 unsigned JTE = getJumpTableEncoding();
35688 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35689 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35691 const X86RegisterInfo &RI = TII->getRegisterInfo();
35692 // Add a register mask with no preserved registers. This results in all
35693 // registers being marked as clobbered.
35694 if (RI.hasBasePointer(*MF)) {
35695 const bool FPIs64Bit =
35696 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35697 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35698 MFI->setRestoreBasePointer(MF);
35700 Register FP = RI.getFrameRegister(*MF);
35701 Register BP = RI.getBaseRegister();
35702 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35703 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
35704 MFI->getRestoreBasePointerOffset())
35705 .addRegMask(RI.getNoPreservedMask());
35706 } else {
35707 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
35708 .addRegMask(RI.getNoPreservedMask());
35711 // IReg is used as an index in a memory operand and therefore can't be SP
35712 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35713 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
35714 Subtarget.is64Bit() ? 8 : 4);
35715 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
35716 .addReg(IReg)
35717 .addImm(LPadList.size());
35718 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
35719 .addMBB(TrapBB)
35720 .addImm(X86::COND_AE);
35722 if (Subtarget.is64Bit()) {
35723 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35724 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35726 // leaq .LJTI0_0(%rip), BReg
35727 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
35728 .addReg(X86::RIP)
35729 .addImm(1)
35730 .addReg(0)
35731 .addJumpTableIndex(MJTI)
35732 .addReg(0);
35733 // movzx IReg64, IReg
35734 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35735 .addImm(0)
35736 .addReg(IReg)
35737 .addImm(X86::sub_32bit);
35739 switch (JTE) {
35740 case MachineJumpTableInfo::EK_BlockAddress:
35741 // jmpq *(BReg,IReg64,8)
35742 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
35743 .addReg(BReg)
35744 .addImm(8)
35745 .addReg(IReg64)
35746 .addImm(0)
35747 .addReg(0);
35748 break;
35749 case MachineJumpTableInfo::EK_LabelDifference32: {
35750 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35751 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35752 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35754 // movl (BReg,IReg64,4), OReg
35755 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
35756 .addReg(BReg)
35757 .addImm(4)
35758 .addReg(IReg64)
35759 .addImm(0)
35760 .addReg(0);
35761 // movsx OReg64, OReg
35762 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
35763 .addReg(OReg);
35764 // addq BReg, OReg64, TReg
35765 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
35766 .addReg(OReg64)
35767 .addReg(BReg);
35768 // jmpq *TReg
35769 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
35770 break;
35772 default:
35773 llvm_unreachable("Unexpected jump table encoding");
35775 } else {
35776 // jmpl *.LJTI0_0(,IReg,4)
35777 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
35778 .addReg(0)
35779 .addImm(4)
35780 .addReg(IReg)
35781 .addJumpTableIndex(MJTI)
35782 .addReg(0);
35785 // Add the jump table entries as successors to the MBB.
35786 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35787 for (auto &LP : LPadList)
35788 if (SeenMBBs.insert(LP).second)
35789 DispContBB->addSuccessor(LP);
35791 // N.B. the order the invoke BBs are processed in doesn't matter here.
35792 SmallVector<MachineBasicBlock *, 64> MBBLPads;
35793 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35794 for (MachineBasicBlock *MBB : InvokeBBs) {
35795 // Remove the landing pad successor from the invoke block and replace it
35796 // with the new dispatch block.
35797 // Keep a copy of Successors since it's modified inside the loop.
35798 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35799 MBB->succ_rend());
35800 // FIXME: Avoid quadratic complexity.
35801 for (auto *MBBS : Successors) {
35802 if (MBBS->isEHPad()) {
35803 MBB->removeSuccessor(MBBS);
35804 MBBLPads.push_back(MBBS);
35805 }
35806 }
35808 MBB->addSuccessor(DispatchBB);
35810 // Find the invoke call and mark all of the callee-saved registers as
35811 // 'implicitly defined' so that they're spilled. This prevents code from
35812 // moving instructions to before the EH block, where they will never be
35813 // executed.
35814 for (auto &II : reverse(*MBB)) {
35815 if (!II.isCall())
35816 continue;
35818 DenseMap<unsigned, bool> DefRegs;
35819 for (auto &MOp : II.operands())
35820 if (MOp.isReg())
35821 DefRegs[MOp.getReg()] = true;
35823 MachineInstrBuilder MIB(*MF, &II);
35824 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35825 unsigned Reg = SavedRegs[RegIdx];
35826 if (!DefRegs[Reg])
35827 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35828 }
35830 break;
35831 }
35832 }
35834 // Mark all former landing pads as non-landing pads. The dispatch is the only
35835 // landing pad now.
35836 for (auto &LP : MBBLPads)
35837 LP->setIsEHPad(false);
35839 // The instruction is gone now.
35840 MI.eraseFromParent();
35841 return BB;
35842 }
35844 MachineBasicBlock *
35845 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35846 MachineBasicBlock *BB) const {
35847 MachineFunction *MF = BB->getParent();
35848 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35849 const MIMetadata MIMD(MI);
35851 auto TMMImmToTMMReg = [](unsigned Imm) {
35852 assert (Imm < 8 && "Illegal tmm index");
35853 return X86::TMM0 + Imm;
35854 };
35855 switch (MI.getOpcode()) {
35856 default: llvm_unreachable("Unexpected instr type to insert");
35857 case X86::TLS_addr32:
35858 case X86::TLS_addr64:
35859 case X86::TLS_addrX32:
35860 case X86::TLS_base_addr32:
35861 case X86::TLS_base_addr64:
35862 case X86::TLS_base_addrX32:
35863 return EmitLoweredTLSAddr(MI, BB);
35864 case X86::INDIRECT_THUNK_CALL32:
35865 case X86::INDIRECT_THUNK_CALL64:
35866 case X86::INDIRECT_THUNK_TCRETURN32:
35867 case X86::INDIRECT_THUNK_TCRETURN64:
35868 return EmitLoweredIndirectThunk(MI, BB);
35869 case X86::CATCHRET:
35870 return EmitLoweredCatchRet(MI, BB);
35871 case X86::SEG_ALLOCA_32:
35872 case X86::SEG_ALLOCA_64:
35873 return EmitLoweredSegAlloca(MI, BB);
35874 case X86::PROBED_ALLOCA_32:
35875 case X86::PROBED_ALLOCA_64:
35876 return EmitLoweredProbedAlloca(MI, BB);
35877 case X86::TLSCall_32:
35878 case X86::TLSCall_64:
35879 return EmitLoweredTLSCall(MI, BB);
35880 case X86::CMOV_FR16:
35881 case X86::CMOV_FR16X:
35882 case X86::CMOV_FR32:
35883 case X86::CMOV_FR32X:
35884 case X86::CMOV_FR64:
35885 case X86::CMOV_FR64X:
35886 case X86::CMOV_GR8:
35887 case X86::CMOV_GR16:
35888 case X86::CMOV_GR32:
35889 case X86::CMOV_RFP32:
35890 case X86::CMOV_RFP64:
35891 case X86::CMOV_RFP80:
35892 case X86::CMOV_VR64:
35893 case X86::CMOV_VR128:
35894 case X86::CMOV_VR128X:
35895 case X86::CMOV_VR256:
35896 case X86::CMOV_VR256X:
35897 case X86::CMOV_VR512:
35898 case X86::CMOV_VK1:
35899 case X86::CMOV_VK2:
35900 case X86::CMOV_VK4:
35901 case X86::CMOV_VK8:
35902 case X86::CMOV_VK16:
35903 case X86::CMOV_VK32:
35904 case X86::CMOV_VK64:
35905 return EmitLoweredSelect(MI, BB);
35907 case X86::FP80_ADDr:
35908 case X86::FP80_ADDm32: {
35909 // Change the floating point control register to use double extended
35910 // precision when performing the addition.
35911 int OrigCWFrameIdx =
35912 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35913 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
35914 OrigCWFrameIdx);
35916 // Load the old value of the control word...
35917 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35918 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
35919 OrigCWFrameIdx);
35921 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
35922 // precision.
35923 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35924 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
35925 .addReg(OldCW, RegState::Kill)
35926 .addImm(0x300);
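// Illustrative note (not from the original comment): bits 8-9 of the x87
// control word form the precision-control field (0b00 = 24-bit single,
// 0b10 = 53-bit double, 0b11 = 64-bit extended), so ORing in 0x300 selects
// 64-bit extended precision regardless of the previous setting.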
35928 // Extract to 16 bits.
35929 Register NewCW16 =
35930 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35931 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
35932 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35934 // Prepare memory for FLDCW.
35935 int NewCWFrameIdx =
35936 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35937 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
35938 NewCWFrameIdx)
35939 .addReg(NewCW16, RegState::Kill);
35941 // Reload the modified control word now...
35942 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
35943 NewCWFrameIdx);
35945 // Do the addition.
35946 if (MI.getOpcode() == X86::FP80_ADDr) {
35947 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
35948 .add(MI.getOperand(0))
35949 .add(MI.getOperand(1))
35950 .add(MI.getOperand(2));
35951 } else {
35952 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
35953 .add(MI.getOperand(0))
35954 .add(MI.getOperand(1))
35955 .add(MI.getOperand(2))
35956 .add(MI.getOperand(3))
35957 .add(MI.getOperand(4))
35958 .add(MI.getOperand(5))
35959 .add(MI.getOperand(6));
35960 }
35962 // Reload the original control word now.
35963 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
35964 OrigCWFrameIdx);
35966 MI.eraseFromParent(); // The pseudo instruction is gone now.
35967 return BB;
35968 }
35970 case X86::FP32_TO_INT16_IN_MEM:
35971 case X86::FP32_TO_INT32_IN_MEM:
35972 case X86::FP32_TO_INT64_IN_MEM:
35973 case X86::FP64_TO_INT16_IN_MEM:
35974 case X86::FP64_TO_INT32_IN_MEM:
35975 case X86::FP64_TO_INT64_IN_MEM:
35976 case X86::FP80_TO_INT16_IN_MEM:
35977 case X86::FP80_TO_INT32_IN_MEM:
35978 case X86::FP80_TO_INT64_IN_MEM: {
35979 // Change the floating point control register to use "round towards zero"
35980 // mode when truncating to an integer value.
35981 int OrigCWFrameIdx =
35982 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35983 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
35984 OrigCWFrameIdx);
35986 // Load the old value of the control word...
35987 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35988 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
35989 OrigCWFrameIdx);
35991 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
35992 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35993 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
35994 .addReg(OldCW, RegState::Kill).addImm(0xC00);
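// Illustrative note (not from the original comment): bits 10-11 of the x87
// control word form the rounding-control field (0b00 = nearest, 0b01 = down,
// 0b10 = up, 0b11 = toward zero), so ORing in 0xC00 selects truncation, which
// matches the C semantics of a float-to-integer cast.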
35996 // Extract to 16 bits.
35997 Register NewCW16 =
35998 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35999 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36000 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36002 // Prepare memory for FLDCW.
36003 int NewCWFrameIdx =
36004 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36005 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36006 NewCWFrameIdx)
36007 .addReg(NewCW16, RegState::Kill);
36009 // Reload the modified control word now...
36010 addFrameReference(BuildMI(*BB, MI, MIMD,
36011 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36013 // Get the X86 opcode to use.
36014 unsigned Opc;
36015 switch (MI.getOpcode()) {
36016 default: llvm_unreachable("illegal opcode!");
36017 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36018 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36019 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36020 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36021 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36022 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36023 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36024 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36025 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36026 }
36028 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36029 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36030 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36032 // Reload the original control word now.
36033 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36034 OrigCWFrameIdx);
36036 MI.eraseFromParent(); // The pseudo instruction is gone now.
36037 return BB;
36038 }
36040 // xbegin
36041 case X86::XBEGIN:
36042 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36044 case X86::VAARG_64:
36045 case X86::VAARG_X32:
36046 return EmitVAARGWithCustomInserter(MI, BB);
36048 case X86::EH_SjLj_SetJmp32:
36049 case X86::EH_SjLj_SetJmp64:
36050 return emitEHSjLjSetJmp(MI, BB);
36052 case X86::EH_SjLj_LongJmp32:
36053 case X86::EH_SjLj_LongJmp64:
36054 return emitEHSjLjLongJmp(MI, BB);
36056 case X86::Int_eh_sjlj_setup_dispatch:
36057 return EmitSjLjDispatchBlock(MI, BB);
36059 case TargetOpcode::STATEPOINT:
36060 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36061 // this point in the process. We diverge later.
36062 return emitPatchPoint(MI, BB);
36064 case TargetOpcode::STACKMAP:
36065 case TargetOpcode::PATCHPOINT:
36066 return emitPatchPoint(MI, BB);
36068 case TargetOpcode::PATCHABLE_EVENT_CALL:
36069 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36070 return BB;
36072 case X86::LCMPXCHG8B: {
36073 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36074 // In addition to the four E[ABCD] registers implied by the encoding,
36075 // CMPXCHG8B requires a memory operand. If the current architecture happens
36076 // to be i686 and the current function needs a base pointer
36077 // - which is ESI on i686 - the register allocator would not be able to
36078 // allocate registers for an address of the form X(%reg, %reg, Y):
36079 // there would never be enough unreserved registers during regalloc
36080 // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
36081 // We give the register allocator a hand by precomputing the address in
36082 // a new vreg using LEA.
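// Illustrative example (added for clarity, not part of the original comment):
// an operand of the form X(%reg1, %reg2, Y) needs two free registers besides
// E[ABCD], but with ESI reserved as the base pointer only EDI is left, so
// something like 'leal 12(%esi,%vreg,4), %tmp' followed by 'cmpxchg8b (%tmp)'
// is the only allocatable shape.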
36084 // If it is not i686 or there is no base pointer - nothing to do here.
36085 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36086 return BB;
36088 // Even though this code does not necessarily need the base pointer to
36089 // be ESI, we check for that. The reason: if this assert fails, some
36090 // changes have happened in the compiler's base pointer handling, which
36091 // most probably have to be addressed somehow here.
36092 assert(TRI->getBaseRegister() == X86::ESI &&
36093 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36094 "base pointer in mind");
36096 MachineRegisterInfo &MRI = MF->getRegInfo();
36097 MVT SPTy = getPointerTy(MF->getDataLayout());
36098 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36099 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36101 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36102 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36103 // does not use an index register.
36104 if (AM.IndexReg == X86::NoRegister)
36105 return BB;
36107 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36108 // four operand definitions that are E[ABCD] registers. We skip them and
36109 // then insert the LEA.
36110 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36111 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
36112 RMBBI->definesRegister(X86::EBX) ||
36113 RMBBI->definesRegister(X86::ECX) ||
36114 RMBBI->definesRegister(X86::EDX))) {
36115 ++RMBBI;
36116 }
36117 MachineBasicBlock::iterator MBBI(RMBBI);
36118 addFullAddress(
36119 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36121 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36123 return BB;
36124 }
36125 case X86::LCMPXCHG16B_NO_RBX: {
36126 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36127 Register BasePtr = TRI->getBaseRegister();
36128 if (TRI->hasBasePointer(*MF) &&
36129 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36130 if (!BB->isLiveIn(BasePtr))
36131 BB->addLiveIn(BasePtr);
36132 // Save RBX into a virtual register.
36133 Register SaveRBX =
36134 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36135 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36136 .addReg(X86::RBX);
36137 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36138 MachineInstrBuilder MIB =
36139 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36140 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36141 MIB.add(MI.getOperand(Idx));
36142 MIB.add(MI.getOperand(X86::AddrNumOperands));
36143 MIB.addReg(SaveRBX);
36144 } else {
36145 // Simple case, just copy the virtual register to RBX.
36146 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36147 .add(MI.getOperand(X86::AddrNumOperands));
36148 MachineInstrBuilder MIB =
36149 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36150 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36151 MIB.add(MI.getOperand(Idx));
36152 }
36153 MI.eraseFromParent();
36154 return BB;
36155 }
36156 case X86::MWAITX: {
36157 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36158 Register BasePtr = TRI->getBaseRegister();
36159 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36160 // If there is no need to save the base pointer, we generate MWAITXrrr;
36161 // otherwise we generate the MWAITX_SAVE_RBX pseudo.
36162 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36163 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36164 .addReg(MI.getOperand(0).getReg());
36165 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36166 .addReg(MI.getOperand(1).getReg());
36167 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36168 .addReg(MI.getOperand(2).getReg());
36169 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36170 MI.eraseFromParent();
36171 } else {
36172 if (!BB->isLiveIn(BasePtr)) {
36173 BB->addLiveIn(BasePtr);
36174 }
36175 // Parameters can be copied into ECX and EAX but not EBX yet.
36176 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36177 .addReg(MI.getOperand(0).getReg());
36178 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36179 .addReg(MI.getOperand(1).getReg());
36180 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36181 // Save RBX into a virtual register.
36182 Register SaveRBX =
36183 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36184 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36185 .addReg(X86::RBX);
36186 // Generate mwaitx pseudo.
36187 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36188 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36189 .addDef(Dst) // Destination tied in with SaveRBX.
36190 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36191 .addUse(SaveRBX); // Save of base pointer.
36192 MI.eraseFromParent();
36193 }
36194 return BB;
36195 }
36196 case TargetOpcode::PREALLOCATED_SETUP: {
36197 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36198 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36199 MFI->setHasPreallocatedCall(true);
36200 int64_t PreallocatedId = MI.getOperand(0).getImm();
36201 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36202 assert(StackAdjustment != 0 && "0 stack adjustment");
36203 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36204 << StackAdjustment << "\n");
36205 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36206 .addReg(X86::ESP)
36207 .addImm(StackAdjustment);
36208 MI.eraseFromParent();
36209 return BB;
36210 }
36211 case TargetOpcode::PREALLOCATED_ARG: {
36212 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36213 int64_t PreallocatedId = MI.getOperand(1).getImm();
36214 int64_t ArgIdx = MI.getOperand(2).getImm();
36215 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36216 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36217 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36218 << ", arg offset " << ArgOffset << "\n");
36219 // stack pointer + offset
36220 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36221 MI.getOperand(0).getReg()),
36222 X86::ESP, false, ArgOffset);
36223 MI.eraseFromParent();
36224 return BB;
36225 }
36226 case X86::PTDPBSSD:
36227 case X86::PTDPBSUD:
36228 case X86::PTDPBUSD:
36229 case X86::PTDPBUUD:
36230 case X86::PTDPBF16PS:
36231 case X86::PTDPFP16PS: {
36232 unsigned Opc;
36233 switch (MI.getOpcode()) {
36234 default: llvm_unreachable("illegal opcode!");
36235 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36236 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36237 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36238 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36239 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36240 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36241 }
36243 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36244 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36245 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36246 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36247 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36249 MI.eraseFromParent(); // The pseudo is gone now.
36250 return BB;
36251 }
36252 case X86::PTILEZERO: {
36253 unsigned Imm = MI.getOperand(0).getImm();
36254 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36255 MI.eraseFromParent(); // The pseudo is gone now.
36256 return BB;
36257 }
36258 case X86::PTILELOADD:
36259 case X86::PTILELOADDT1:
36260 case X86::PTILESTORED: {
36261 unsigned Opc;
36262 switch (MI.getOpcode()) {
36263 default: llvm_unreachable("illegal opcode!");
36264 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
36265 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
36266 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
36267 }
36269 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36270 unsigned CurOp = 0;
36271 if (Opc != X86::TILESTORED)
36272 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36273 RegState::Define);
36275 MIB.add(MI.getOperand(CurOp++)); // base
36276 MIB.add(MI.getOperand(CurOp++)); // scale
36277 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36278 MIB.add(MI.getOperand(CurOp++)); // displacement
36279 MIB.add(MI.getOperand(CurOp++)); // segment
36281 if (Opc == X86::TILESTORED)
36282 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36283 RegState::Undef);
36285 MI.eraseFromParent(); // The pseudo is gone now.
36286 return BB;
36287 }
36288 case X86::PTCMMIMFP16PS:
36289 case X86::PTCMMRLFP16PS: {
36290 const MIMetadata MIMD(MI);
36291 unsigned Opc;
36292 switch (MI.getOpcode()) {
36293 default: llvm_unreachable("Unexpected instruction!");
36294 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36295 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36296 }
36297 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36298 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36299 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36300 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36301 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36302 MI.eraseFromParent(); // The pseudo is gone now.
36303 return BB;
36304 }
36305 }
36306 }
36308 //===----------------------------------------------------------------------===//
36309 // X86 Optimization Hooks
36310 //===----------------------------------------------------------------------===//
36312 bool
36313 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36314 const APInt &DemandedBits,
36315 const APInt &DemandedElts,
36316 TargetLoweringOpt &TLO) const {
36317 EVT VT = Op.getValueType();
36318 unsigned Opcode = Op.getOpcode();
36319 unsigned EltSize = VT.getScalarSizeInBits();
36321 if (VT.isVector()) {
36322 // If the constant's active bits are all sign bits, then we should sign
36323 // extend it across the entire constant to allow it to act as a boolean
36324 // constant vector.
36325 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36326 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36327 return false;
36328 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36329 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36330 continue;
36331 const APInt &Val = V.getConstantOperandAPInt(i);
36332 if (Val.getBitWidth() > Val.getNumSignBits() &&
36333 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36334 return true;
36335 }
36336 return false;
36337 };
36338 // For vectors - if we have a constant, then try to sign extend.
36339 // TODO: Handle AND cases.
36340 unsigned ActiveBits = DemandedBits.getActiveBits();
36341 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36342 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36343 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36344 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36345 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36346 VT.getVectorNumElements());
36347 SDValue NewC =
36348 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36349 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36350 SDValue NewOp =
36351 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36352 return TLO.CombineTo(Op, NewOp);
36353 }
36354 return false;
36355 }
36357 // Only optimize Ands to prevent shrinking a constant that could be
36358 // matched by movzx.
36359 if (Opcode != ISD::AND)
36360 return false;
36362 // Make sure the RHS really is a constant.
36363 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36364 if (!C)
36365 return false;
36367 const APInt &Mask = C->getAPIntValue();
36369 // Clear all non-demanded bits initially.
36370 APInt ShrunkMask = Mask & DemandedBits;
36372 // Find the width of the shrunk mask.
36373 unsigned Width = ShrunkMask.getActiveBits();
36375 // If the mask is all 0s there's nothing to do here.
36376 if (Width == 0)
36377 return false;
36379 // Find the next power of 2 width, rounding up to a byte.
36380 Width = llvm::bit_ceil(std::max(Width, 8U));
36381 // Truncate the width to size to handle illegal types.
36382 Width = std::min(Width, EltSize);
36384 // Calculate a possible zero extend mask for this constant.
36385 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
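// Worked example (illustrative, not from the source): for 'and X, 0x1FF'
// where only the low 8 bits are demanded, ShrunkMask is 0xFF, Width rounds up
// to 8, and ZeroExtendMask becomes 0xFF, which the backend can match with a
// MOVZX.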
36387 // If we aren't changing the mask, just return true to keep it and prevent
36388 // the caller from optimizing.
36389 if (ZeroExtendMask == Mask)
36390 return true;
36392 // Make sure the new mask can be represented by a combination of mask bits
36393 // and non-demanded bits.
36394 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36395 return false;
36397 // Replace the constant with the zero extend mask.
36398 SDLoc DL(Op);
36399 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36400 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36401 return TLO.CombineTo(Op, NewOp);
36402 }
36404 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36405 KnownBits &Known,
36406 const APInt &DemandedElts,
36407 const SelectionDAG &DAG,
36408 unsigned Depth) const {
36409 unsigned BitWidth = Known.getBitWidth();
36410 unsigned NumElts = DemandedElts.getBitWidth();
36411 unsigned Opc = Op.getOpcode();
36412 EVT VT = Op.getValueType();
36413 assert((Opc >= ISD::BUILTIN_OP_END ||
36414 Opc == ISD::INTRINSIC_WO_CHAIN ||
36415 Opc == ISD::INTRINSIC_W_CHAIN ||
36416 Opc == ISD::INTRINSIC_VOID) &&
36417 "Should use MaskedValueIsZero if you don't know whether Op"
36418 " is a target node!");
36420 Known.resetAll();
36421 switch (Opc) {
36422 default: break;
36423 case X86ISD::MUL_IMM: {
36424 KnownBits Known2;
36425 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36426 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36427 Known = KnownBits::mul(Known, Known2);
36428 break;
36430 case X86ISD::SETCC:
36431 Known.Zero.setBitsFrom(1);
36432 break;
36433 case X86ISD::MOVMSK: {
36434 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36435 Known.Zero.setBitsFrom(NumLoBits);
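// e.g. a v4f32 MOVMSK writes a 4-bit mask into the low bits of the scalar
// result, so bits 4 and above are known to be zero.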
36436 break;
36438 case X86ISD::PEXTRB:
36439 case X86ISD::PEXTRW: {
36440 SDValue Src = Op.getOperand(0);
36441 EVT SrcVT = Src.getValueType();
36442 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36443 Op.getConstantOperandVal(1));
36444 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36445 Known = Known.anyextOrTrunc(BitWidth);
36446 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36447 break;
36449 case X86ISD::VSRAI:
36450 case X86ISD::VSHLI:
36451 case X86ISD::VSRLI: {
36452 unsigned ShAmt = Op.getConstantOperandVal(1);
36453 if (ShAmt >= VT.getScalarSizeInBits()) {
36454 // Out of range logical bit shifts are guaranteed to be zero.
36455 // Out of range arithmetic bit shifts splat the sign bit.
36456 if (Opc != X86ISD::VSRAI) {
36457 Known.setAllZero();
36458 break;
36461 ShAmt = VT.getScalarSizeInBits() - 1;
36464 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36465 if (Opc == X86ISD::VSHLI) {
36466 Known.Zero <<= ShAmt;
36467 Known.One <<= ShAmt;
36468 // Low bits are known zero.
36469 Known.Zero.setLowBits(ShAmt);
36470 } else if (Opc == X86ISD::VSRLI) {
36471 Known.Zero.lshrInPlace(ShAmt);
36472 Known.One.lshrInPlace(ShAmt);
36473 // High bits are known zero.
36474 Known.Zero.setHighBits(ShAmt);
36475 } else {
36476 Known.Zero.ashrInPlace(ShAmt);
36477 Known.One.ashrInPlace(ShAmt);
36479 break;
36481 case X86ISD::PACKUS: {
36482 // PACKUS is just a truncation if the upper half is zero.
36483 APInt DemandedLHS, DemandedRHS;
36484 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36486 Known.One = APInt::getAllOnes(BitWidth * 2);
36487 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36489 KnownBits Known2;
36490 if (!!DemandedLHS) {
36491 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36492 Known = Known.intersectWith(Known2);
36494 if (!!DemandedRHS) {
36495 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36496 Known = Known.intersectWith(Known2);
36499 if (Known.countMinLeadingZeros() < BitWidth)
36500 Known.resetAll();
36501 Known = Known.trunc(BitWidth);
36502 break;
36504 case X86ISD::VBROADCAST: {
36505 SDValue Src = Op.getOperand(0);
36506 if (!Src.getSimpleValueType().isVector()) {
36507 Known = DAG.computeKnownBits(Src, Depth + 1);
36508 return;
36510 break;
36512 case X86ISD::AND: {
36513 if (Op.getResNo() == 0) {
36514 KnownBits Known2;
36515 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36516 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36517 Known &= Known2;
36519 break;
36521 case X86ISD::ANDNP: {
36522 KnownBits Known2;
36523 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36524 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36526 // ANDNP = (~X & Y);
36527 Known.One &= Known2.Zero;
36528 Known.Zero |= Known2.One;
36529 break;
36531 case X86ISD::FOR: {
36532 KnownBits Known2;
36533 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36534 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36536 Known |= Known2;
36537 break;
36539 case X86ISD::PSADBW: {
36540 assert(VT.getScalarType() == MVT::i64 &&
36541 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36542 "Unexpected PSADBW types");
36544 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36545 Known.Zero.setBitsFrom(16);
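// (Each i64 lane sums eight absolute byte differences, so the maximum value
// is 8 * 255 = 2040, which comfortably fits in the low 16 bits.)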
36546 break;
36548 case X86ISD::PCMPGT:
36549 case X86ISD::PCMPEQ: {
36550 KnownBits KnownLhs =
36551 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36552 KnownBits KnownRhs =
36553 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36554 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
36555 ? KnownBits::eq(KnownLhs, KnownRhs)
36556 : KnownBits::sgt(KnownLhs, KnownRhs);
36557 if (Res) {
36558 if (*Res)
36559 Known.setAllOnes();
36560 else
36561 Known.setAllZero();
36563 break;
36565 case X86ISD::PMULUDQ: {
36566 KnownBits Known2;
36567 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36568 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36570 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36571 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36572 Known = KnownBits::mul(Known, Known2);
36573 break;
36575 case X86ISD::CMOV: {
36576 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36577 // If we don't know any bits, early out.
36578 if (Known.isUnknown())
36579 break;
36580 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36582 // Only known if known in both the LHS and RHS.
36583 Known = Known.intersectWith(Known2);
36584 break;
36586 case X86ISD::BEXTR:
36587 case X86ISD::BEXTRI: {
36588 SDValue Op0 = Op.getOperand(0);
36589 SDValue Op1 = Op.getOperand(1);
36591 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36592 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36593 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
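// (BEXTR's control operand packs the start bit in bits 7:0 and the length in
// bits 15:8; e.g. a control value of 0x0410 extracts 4 bits starting at bit
// 16.)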
36595 // If the length is 0, the result is 0.
36596 if (Length == 0) {
36597 Known.setAllZero();
36598 break;
36601 if ((Shift + Length) <= BitWidth) {
36602 Known = DAG.computeKnownBits(Op0, Depth + 1);
36603 Known = Known.extractBits(Length, Shift);
36604 Known = Known.zextOrTrunc(BitWidth);
36607 break;
36609 case X86ISD::PDEP: {
36610 KnownBits Known2;
36611 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36612 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36613 // Zeros are retained from the mask operand, but ones are not.
36614 Known.One.clearAllBits();
36615 // The result will have at least as many trailing zeros as the non-mask
36616 // operand since bits can only map to the same or higher bit position.
36617 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36618 break;
36620 case X86ISD::PEXT: {
36621 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36622 // The result has as many leading zeros as the number of zeroes in the mask.
36623 unsigned Count = Known.Zero.popcount();
36624 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36625 Known.One.clearAllBits();
36626 break;
36628 case X86ISD::VTRUNC:
36629 case X86ISD::VTRUNCS:
36630 case X86ISD::VTRUNCUS:
36631 case X86ISD::CVTSI2P:
36632 case X86ISD::CVTUI2P:
36633 case X86ISD::CVTP2SI:
36634 case X86ISD::CVTP2UI:
36635 case X86ISD::MCVTP2SI:
36636 case X86ISD::MCVTP2UI:
36637 case X86ISD::CVTTP2SI:
36638 case X86ISD::CVTTP2UI:
36639 case X86ISD::MCVTTP2SI:
36640 case X86ISD::MCVTTP2UI:
36641 case X86ISD::MCVTSI2P:
36642 case X86ISD::MCVTUI2P:
36643 case X86ISD::VFPROUND:
36644 case X86ISD::VMFPROUND:
36645 case X86ISD::CVTPS2PH:
36646 case X86ISD::MCVTPS2PH: {
36647 // Truncations/Conversions - upper elements are known zero.
36648 EVT SrcVT = Op.getOperand(0).getValueType();
36649 if (SrcVT.isVector()) {
36650 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36651 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
36652 Known.setAllZero();
36654 break;
36656 case X86ISD::STRICT_CVTTP2SI:
36657 case X86ISD::STRICT_CVTTP2UI:
36658 case X86ISD::STRICT_CVTSI2P:
36659 case X86ISD::STRICT_CVTUI2P:
36660 case X86ISD::STRICT_VFPROUND:
36661 case X86ISD::STRICT_CVTPS2PH: {
36662 // Strict Conversions - upper elements are known zero.
36663 EVT SrcVT = Op.getOperand(1).getValueType();
36664 if (SrcVT.isVector()) {
36665 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36666 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
36667 Known.setAllZero();
36669 break;
36671 case X86ISD::MOVQ2DQ: {
36672 // Move from MMX to XMM. Upper half of XMM should be 0.
36673 if (DemandedElts.countr_zero() >= (NumElts / 2))
36674 Known.setAllZero();
36675 break;
36677 case X86ISD::VBROADCAST_LOAD: {
36678 APInt UndefElts;
36679 SmallVector<APInt, 16> EltBits;
36680 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
36681 /*AllowWholeUndefs*/ false,
36682 /*AllowPartialUndefs*/ false)) {
36683 Known.Zero.setAllBits();
36684 Known.One.setAllBits();
36685 for (unsigned I = 0; I != NumElts; ++I) {
36686 if (!DemandedElts[I])
36687 continue;
36688 if (UndefElts[I]) {
36689 Known.resetAll();
36690 break;
36692 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
36693 Known = Known.intersectWith(Known2);
36695 return;
36697 break;
36698 }
36699 }
36701 // Handle target shuffles.
36702 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
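// Illustrative example: for a PSHUFD with mask {2, 2, 3, 3} and all result
// elements demanded, only source elements 2 and 3 are demanded, and the known
// bits of the result are the intersection of the known bits of those two
// elements.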
36703 if (isTargetShuffle(Opc)) {
36704 SmallVector<int, 64> Mask;
36705 SmallVector<SDValue, 2> Ops;
36706 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36707 unsigned NumOps = Ops.size();
36708 unsigned NumElts = VT.getVectorNumElements();
36709 if (Mask.size() == NumElts) {
36710 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36711 Known.Zero.setAllBits(); Known.One.setAllBits();
36712 for (unsigned i = 0; i != NumElts; ++i) {
36713 if (!DemandedElts[i])
36714 continue;
36715 int M = Mask[i];
36716 if (M == SM_SentinelUndef) {
36717 // For UNDEF elements, we don't know anything about the common state
36718 // of the shuffle result.
36719 Known.resetAll();
36720 break;
36722 if (M == SM_SentinelZero) {
36723 Known.One.clearAllBits();
36724 continue;
36726 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36727 "Shuffle index out of range");
36729 unsigned OpIdx = (unsigned)M / NumElts;
36730 unsigned EltIdx = (unsigned)M % NumElts;
36731 if (Ops[OpIdx].getValueType() != VT) {
36732 // TODO - handle target shuffle ops with different value types.
36733 Known.resetAll();
36734 break;
36736 DemandedOps[OpIdx].setBit(EltIdx);
36737 }
36738 // Known bits are the values that are shared by every demanded element.
36739 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36740 if (!DemandedOps[i])
36741 continue;
36742 KnownBits Known2 =
36743 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36744 Known = Known.intersectWith(Known2);
36745 }
36746 }
36747 }
36748 }
36749 }
36751 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36752 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36753 unsigned Depth) const {
36754 EVT VT = Op.getValueType();
36755 unsigned VTBits = VT.getScalarSizeInBits();
36756 unsigned Opcode = Op.getOpcode();
36757 switch (Opcode) {
36758 case X86ISD::SETCC_CARRY:
36759 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36760 return VTBits;
36762 case X86ISD::VTRUNC: {
36763 SDValue Src = Op.getOperand(0);
36764 MVT SrcVT = Src.getSimpleValueType();
36765 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36766 assert(VTBits < NumSrcBits && "Illegal truncation input type");
36767 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36768 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36769 if (Tmp > (NumSrcBits - VTBits))
36770 return Tmp - (NumSrcBits - VTBits);
36771 return 1;
36774 case X86ISD::PACKSS: {
36775 // PACKSS is just a truncation if the sign bits extend to the packed size.
36776 APInt DemandedLHS, DemandedRHS;
36777 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36778 DemandedRHS);
36780 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
36781 // patterns often used to compact vXi64 allsignbit patterns.
36782 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
36783 SDValue BC = peekThroughBitcasts(V);
36784 if (BC.getOpcode() == X86ISD::PACKSS &&
36785 BC.getScalarValueSizeInBits() == 16 &&
36786 V.getScalarValueSizeInBits() == 32) {
36787 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
36788 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
36789 if (BC0.getScalarValueSizeInBits() == 64 &&
36790 BC1.getScalarValueSizeInBits() == 64 &&
36791 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
36792 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
36793 return 32;
36795 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
36798 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36799 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36800 if (!!DemandedLHS)
36801 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
36802 if (!!DemandedRHS)
36803 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
36804 unsigned Tmp = std::min(Tmp0, Tmp1);
36805 if (Tmp > (SrcBits - VTBits))
36806 return Tmp - (SrcBits - VTBits);
36807 return 1;
36810 case X86ISD::VBROADCAST: {
36811 SDValue Src = Op.getOperand(0);
36812 if (!Src.getSimpleValueType().isVector())
36813 return DAG.ComputeNumSignBits(Src, Depth + 1);
36814 break;
36817 case X86ISD::VSHLI: {
36818 SDValue Src = Op.getOperand(0);
36819 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36820 if (ShiftVal.uge(VTBits))
36821 return VTBits; // Shifted all bits out --> zero.
36822 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36823 if (ShiftVal.uge(Tmp))
36824 return 1; // Shifted all sign bits out --> unknown.
36825 return Tmp - ShiftVal.getZExtValue();
36828 case X86ISD::VSRAI: {
36829 SDValue Src = Op.getOperand(0);
36830 APInt ShiftVal = Op.getConstantOperandAPInt(1);
36831 if (ShiftVal.uge(VTBits - 1))
36832 return VTBits; // Sign splat.
36833 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
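// An arithmetic right shift by ShiftVal adds ShiftVal copies of the sign bit;
// e.g. (illustrative) an i32 element with 3 known sign bits shifted right by
// 5 has min(3 + 5, 32) = 8 known sign bits.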
36834 ShiftVal += Tmp;
36835 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36838 case X86ISD::FSETCC:
36839 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36840 if (VT == MVT::f32 || VT == MVT::f64 ||
36841 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36842 return VTBits;
36843 break;
36845 case X86ISD::PCMPGT:
36846 case X86ISD::PCMPEQ:
36847 case X86ISD::CMPP:
36848 case X86ISD::VPCOM:
36849 case X86ISD::VPCOMU:
36850 // Vector compares return zero/all-bits result values.
36851 return VTBits;
36853 case X86ISD::ANDNP: {
36854 unsigned Tmp0 =
36855 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36856 if (Tmp0 == 1) return 1; // Early out.
36857 unsigned Tmp1 =
36858 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36859 return std::min(Tmp0, Tmp1);
36862 case X86ISD::CMOV: {
36863 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36864 if (Tmp0 == 1) return 1; // Early out.
36865 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36866 return std::min(Tmp0, Tmp1);
36867 }
36868 }
36870 // Handle target shuffles.
36871 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36872 if (isTargetShuffle(Opcode)) {
36873 SmallVector<int, 64> Mask;
36874 SmallVector<SDValue, 2> Ops;
36875 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36876 unsigned NumOps = Ops.size();
36877 unsigned NumElts = VT.getVectorNumElements();
36878 if (Mask.size() == NumElts) {
36879 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36880 for (unsigned i = 0; i != NumElts; ++i) {
36881 if (!DemandedElts[i])
36882 continue;
36883 int M = Mask[i];
36884 if (M == SM_SentinelUndef) {
36885 // For UNDEF elements, we don't know anything about the common state
36886 // of the shuffle result.
36887 return 1;
36888 } else if (M == SM_SentinelZero) {
36889 // Zero = all sign bits.
36890 continue;
36892 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36893 "Shuffle index out of range");
36895 unsigned OpIdx = (unsigned)M / NumElts;
36896 unsigned EltIdx = (unsigned)M % NumElts;
36897 if (Ops[OpIdx].getValueType() != VT) {
36898 // TODO - handle target shuffle ops with different value types.
36899 return 1;
36901 DemandedOps[OpIdx].setBit(EltIdx);
36903 unsigned Tmp0 = VTBits;
36904 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36905 if (!DemandedOps[i])
36906 continue;
36907 unsigned Tmp1 =
36908 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36909 Tmp0 = std::min(Tmp0, Tmp1);
36911 return Tmp0;
36912 }
36913 }
36914 }
36916 // Fallback case.
36917 return 1;
36918 }
36920 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36921 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36922 return N->getOperand(0);
36923 return N;
36924 }
36926 // Helper to look for a normal load that can be narrowed into a vzload with the
36927 // specified VT and memory VT. Returns SDValue() on failure.
36928 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36929 SelectionDAG &DAG) {
36930 // Can't if the load is volatile or atomic.
36931 if (!LN->isSimple())
36932 return SDValue();
36934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36936 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36937 LN->getPointerInfo(), LN->getOriginalAlign(),
36938 LN->getMemOperand()->getFlags());
36939 }
36941 // Attempt to match a combined shuffle mask against supported unary shuffle
36942 // instructions.
36943 // TODO: Investigate sharing more of this with shuffle lowering.
36944 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36945 bool AllowFloatDomain, bool AllowIntDomain,
36946 SDValue V1, const SelectionDAG &DAG,
36947 const X86Subtarget &Subtarget, unsigned &Shuffle,
36948 MVT &SrcVT, MVT &DstVT) {
36949 unsigned NumMaskElts = Mask.size();
36950 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36952 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36953 if (Mask[0] == 0 &&
36954 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36955 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36956 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36957 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36958 Shuffle = X86ISD::VZEXT_MOVL;
36959 if (MaskEltSize == 16)
36960 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36961 else
36962 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36963 return true;
36967 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
36968 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36969 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36970 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36971 unsigned MaxScale = 64 / MaskEltSize;
36972 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
36973 DAG.ComputeNumSignBits(V1) == MaskEltSize;
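// Illustrative example: a v8i16 mask {0, Z, 1, Z, 2, Z, 3, Z} (Z = zero) with
// Scale == 2 places source element i in lane i * 2 and zeroes the odd lanes,
// so it matches a ZERO_EXTEND_VECTOR_INREG of the low four i16 elements to
// v4i32.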
36974 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36975 bool MatchAny = true;
36976 bool MatchZero = true;
36977 bool MatchSign = UseSign;
36978 unsigned NumDstElts = NumMaskElts / Scale;
36979 for (unsigned i = 0;
36980 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
36981 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36982 MatchAny = MatchSign = MatchZero = false;
36983 break;
36985 unsigned Pos = (i * Scale) + 1;
36986 unsigned Len = Scale - 1;
36987 MatchAny &= isUndefInRange(Mask, Pos, Len);
36988 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
36989 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
36991 if (MatchAny || MatchSign || MatchZero) {
36992 assert((MatchSign || MatchZero) &&
36993 "Failed to match sext/zext but matched aext?");
36994 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36995 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
36996 : MVT::getIntegerVT(MaskEltSize);
36997 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36999 Shuffle = unsigned(
37000 MatchAny ? ISD::ANY_EXTEND
37001 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37002 if (SrcVT.getVectorNumElements() != NumDstElts)
37003 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37005 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37006 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37007 return true;
37012 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
37013 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37014 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37015 isUndefOrEqual(Mask[0], 0) &&
37016 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37017 Shuffle = X86ISD::VZEXT_MOVL;
37018 if (MaskEltSize == 16)
37019 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37020 else
37021 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37022 return true;
37025 // Check if we have SSE3, which will let us use MOVDDUP etc. These
37026 // instructions are no slower than UNPCKLPD but have the option to
37027 // fold the input operand, even from an unaligned memory load.
37028 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37029 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37030 Shuffle = X86ISD::MOVDDUP;
37031 SrcVT = DstVT = MVT::v2f64;
37032 return true;
37034 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37035 Shuffle = X86ISD::MOVSLDUP;
37036 SrcVT = DstVT = MVT::v4f32;
37037 return true;
37039 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37040 Shuffle = X86ISD::MOVSHDUP;
37041 SrcVT = DstVT = MVT::v4f32;
37042 return true;
37046 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37047 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37048 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37049 Shuffle = X86ISD::MOVDDUP;
37050 SrcVT = DstVT = MVT::v4f64;
37051 return true;
37053 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37054 V1)) {
37055 Shuffle = X86ISD::MOVSLDUP;
37056 SrcVT = DstVT = MVT::v8f32;
37057 return true;
37059 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37060 V1)) {
37061 Shuffle = X86ISD::MOVSHDUP;
37062 SrcVT = DstVT = MVT::v8f32;
37063 return true;
37067 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37068 assert(Subtarget.hasAVX512() &&
37069 "AVX512 required for 512-bit vector shuffles");
37070 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37071 V1)) {
37072 Shuffle = X86ISD::MOVDDUP;
37073 SrcVT = DstVT = MVT::v8f64;
37074 return true;
37076 if (isTargetShuffleEquivalent(
37077 MaskVT, Mask,
37078 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37079 Shuffle = X86ISD::MOVSLDUP;
37080 SrcVT = DstVT = MVT::v16f32;
37081 return true;
37083 if (isTargetShuffleEquivalent(
37084 MaskVT, Mask,
37085 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37086 Shuffle = X86ISD::MOVSHDUP;
37087 SrcVT = DstVT = MVT::v16f32;
37088 return true;
37092 return false;
37095 // Attempt to match a combined shuffle mask against supported unary immediate
37096 // permute instructions.
37097 // TODO: Investigate sharing more of this with shuffle lowering.
37098 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37099 const APInt &Zeroable,
37100 bool AllowFloatDomain, bool AllowIntDomain,
37101 const SelectionDAG &DAG,
37102 const X86Subtarget &Subtarget,
37103 unsigned &Shuffle, MVT &ShuffleVT,
37104 unsigned &PermuteImm) {
37105 unsigned NumMaskElts = Mask.size();
37106 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37107 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37108 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37109 bool ContainsZeros = isAnyZero(Mask);
37111 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37112 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37113 // Check for lane crossing permutes.
37114 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37115 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37116 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37117 Shuffle = X86ISD::VPERMI;
37118 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37119 PermuteImm = getV4X86ShuffleImm(Mask);
37120 return true;
37122 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37123 SmallVector<int, 4> RepeatedMask;
37124 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37125 Shuffle = X86ISD::VPERMI;
37126 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37127 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37128 return true;
37131 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37132 // VPERMILPD can permute with a non-repeating shuffle.
37133 Shuffle = X86ISD::VPERMILPI;
37134 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37135 PermuteImm = 0;
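// Illustrative example: a v4f64 mask {1, 0, 3, 2} swaps the elements within
// each 128-bit lane, giving PermuteImm = 0b0101.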
37136 for (int i = 0, e = Mask.size(); i != e; ++i) {
37137 int M = Mask[i];
37138 if (M == SM_SentinelUndef)
37139 continue;
37140 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37141 PermuteImm |= (M & 1) << i;
37143 return true;
37147 // We are checking for a shuffle match or a shift match. Loop twice so we
37148 // can order which we try to match first, depending on target preference.
37149 for (unsigned Order = 0; Order < 2; ++Order) {
37150 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37151 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37152 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37153 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37154 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37155 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37156 SmallVector<int, 4> RepeatedMask;
37157 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37158 // Narrow the repeated mask to create 32-bit element permutes.
37159 SmallVector<int, 4> WordMask = RepeatedMask;
37160 if (MaskScalarSizeInBits == 64)
37161 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37163 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37164 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37165 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37166 PermuteImm = getV4X86ShuffleImm(WordMask);
37167 return true;
37171 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37172 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37173 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37174 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37175 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37176 SmallVector<int, 4> RepeatedMask;
37177 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37178 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37179 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37181 // PSHUFLW: permute lower 4 elements only.
37182 if (isUndefOrInRange(LoMask, 0, 4) &&
37183 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37184 Shuffle = X86ISD::PSHUFLW;
37185 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37186 PermuteImm = getV4X86ShuffleImm(LoMask);
37187 return true;
37190 // PSHUFHW: permute upper 4 elements only.
37191 if (isUndefOrInRange(HiMask, 4, 8) &&
37192 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37193 // Offset the HiMask so that we can create the shuffle immediate.
37194 int OffsetHiMask[4];
37195 for (int i = 0; i != 4; ++i)
37196 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
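// e.g. HiMask {5, 4, 7, 6} becomes OffsetHiMask {1, 0, 3, 2}, which fits the
// 2-bits-per-lane shuffle immediate.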
37198 Shuffle = X86ISD::PSHUFHW;
37199 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37200 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37201 return true;
37205 } else {
37206 // Attempt to match against bit rotates.
37207 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37208 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37209 Subtarget.hasAVX512())) {
37210 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37211 Subtarget, Mask);
37212 if (0 < RotateAmt) {
37213 Shuffle = X86ISD::VROTLI;
37214 PermuteImm = (unsigned)RotateAmt;
37215 return true;
37219 // Attempt to match against byte/bit shifts.
37220 if (AllowIntDomain &&
37221 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37222 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37223 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37224 int ShiftAmt =
37225 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37226 Zeroable, Subtarget);
37227 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37228 32 <= ShuffleVT.getScalarSizeInBits())) {
37229 // Byte shifts can be slower so only match them on second attempt.
37230 if (Order == 0 &&
37231 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37232 continue;
37234 PermuteImm = (unsigned)ShiftAmt;
37235 return true;
37241 return false;
37244 // Attempt to match a combined unary shuffle mask against supported binary
37245 // shuffle instructions.
37246 // TODO: Investigate sharing more of this with shuffle lowering.
37247 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37248 bool AllowFloatDomain, bool AllowIntDomain,
37249 SDValue &V1, SDValue &V2, const SDLoc &DL,
37250 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37251 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37252 bool IsUnary) {
37253 unsigned NumMaskElts = Mask.size();
37254 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37255 unsigned SizeInBits = MaskVT.getSizeInBits();
37257 if (MaskVT.is128BitVector()) {
37258 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37259 AllowFloatDomain) {
37260 V2 = V1;
37261 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37262 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37263 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37264 return true;
37266 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37267 AllowFloatDomain) {
37268 V2 = V1;
37269 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37270 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37271 return true;
37273 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37274 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37275 std::swap(V1, V2);
37276 Shuffle = X86ISD::MOVSD;
37277 SrcVT = DstVT = MVT::v2f64;
37278 return true;
37280 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37281 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37282 Shuffle = X86ISD::MOVSS;
37283 SrcVT = DstVT = MVT::v4f32;
37284 return true;
37286 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37287 DAG) &&
37288 Subtarget.hasFP16()) {
37289 Shuffle = X86ISD::MOVSH;
37290 SrcVT = DstVT = MVT::v8f16;
37291 return true;
37295 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
37296 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37297 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37298 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37299 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37300 Subtarget)) {
37301 DstVT = MaskVT;
37302 return true;
37305 // TODO: Can we handle this inside matchShuffleWithPACK?
37306 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37307 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37308 V1.getScalarValueSizeInBits() == 64 &&
37309 V2.getScalarValueSizeInBits() == 64) {
37310 // Use (SSE41) PACKUSDW if the leading zero bits go to the lowest 16 bits.
37311 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37312 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37313 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37314 SrcVT = MVT::v4i32;
37315 DstVT = MVT::v8i16;
37316 Shuffle = X86ISD::PACKUS;
37317 return true;
37319 // Use PACKUSWB if the leading zero bits go to the lowest 8 bits.
37320 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37321 SrcVT = MVT::v8i16;
37322 DstVT = MVT::v16i8;
37323 Shuffle = X86ISD::PACKUS;
37324 return true;
37326 // Use PACKSSDW if the sign bits extend to the lowest 16 bits.
37327 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37328 SrcVT = MVT::v4i32;
37329 DstVT = MVT::v8i16;
37330 Shuffle = X86ISD::PACKSS;
37331 return true;
37335 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37336 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37337 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37338 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37339 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37340 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37341 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37342 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37343 Subtarget)) {
37344 SrcVT = DstVT = MaskVT;
37345 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37346 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37347 return true;
37351 // Attempt to match against an OR if we're performing a blend shuffle and the
37352 // non-blended source element is zero in each case.
37353 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
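// e.g. for Mask = {0, 5, 2, 7} element i comes from V1 for even i and from V2
// for odd i; if the elements not selected from each source are known to be
// zero, the whole blend is equivalent to (V1 | V2).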
37354 if (SizeInBits == V1.getValueSizeInBits() &&
37355 SizeInBits == V2.getValueSizeInBits() &&
37356 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37357 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37358 bool IsBlend = true;
37359 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37360 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37361 unsigned Scale1 = NumV1Elts / NumMaskElts;
37362 unsigned Scale2 = NumV2Elts / NumMaskElts;
37363 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37364 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37365 for (unsigned i = 0; i != NumMaskElts; ++i) {
37366 int M = Mask[i];
37367 if (M == SM_SentinelUndef)
37368 continue;
37369 if (M == SM_SentinelZero) {
37370 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37371 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37372 continue;
37374 if (M == (int)i) {
37375 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37376 continue;
37378 if (M == (int)(i + NumMaskElts)) {
37379 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37380 continue;
37382 IsBlend = false;
37383 break;
37385 if (IsBlend) {
37386 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37387 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37388 Shuffle = ISD::OR;
37389 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37390 return true;
37392 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37393 // FIXME: handle mismatched sizes?
37394 // TODO: investigate if `ISD::OR` handling in
37395 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37396 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37397 unsigned NumElts = V.getValueType().getVectorNumElements();
37398 KnownBits Known(NumElts);
37399 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37400 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37401 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37402 if (PeepholeKnown.isZero())
37403 Known.Zero.setBit(EltIdx);
37404 if (PeepholeKnown.isAllOnes())
37405 Known.One.setBit(EltIdx);
37407 return Known;
37410 KnownBits V1Known = computeKnownBitsElementWise(V1);
37411 KnownBits V2Known = computeKnownBitsElementWise(V2);
37413 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37414 int M = Mask[i];
37415 if (M == SM_SentinelUndef)
37416 continue;
37417 if (M == SM_SentinelZero) {
37418 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37419 continue;
37421 if (M == (int)i) {
37422 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37423 continue;
37425 if (M == (int)(i + NumMaskElts)) {
37426 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37427 continue;
37429 llvm_unreachable("will not get here.");
37431 if (IsBlend) {
37432 Shuffle = ISD::OR;
37433 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37434 return true;
37440 return false;
37443 static bool matchBinaryPermuteShuffle(
37444 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37445 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37446 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37447 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37448 unsigned NumMaskElts = Mask.size();
37449 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37451 // Attempt to match against VALIGND/VALIGNQ rotate.
37452 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37453 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37454 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37455 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37456 if (!isAnyZero(Mask)) {
37457 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37458 if (0 < Rotation) {
37459 Shuffle = X86ISD::VALIGN;
37460 if (EltSizeInBits == 64)
37461 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37462 else
37463 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37464 PermuteImm = Rotation;
37465 return true;
37470 // Attempt to match against PALIGNR byte rotate.
37471 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37472 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37473 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37474 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37475 if (0 < ByteRotation) {
37476 Shuffle = X86ISD::PALIGNR;
37477 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37478 PermuteImm = ByteRotation;
37479 return true;
37483 // Attempt to combine to X86ISD::BLENDI.
37484 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37485 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37486 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37487 uint64_t BlendMask = 0;
37488 bool ForceV1Zero = false, ForceV2Zero = false;
37489 SmallVector<int, 8> TargetMask(Mask);
37490 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
37491 ForceV2Zero, BlendMask)) {
37492 if (MaskVT == MVT::v16i16) {
37493 // We can only use v16i16 PBLENDW if the lanes are repeated.
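// The 8-bit PBLENDW immediate encodes one 8-element pattern that is applied
// to both 128-bit lanes, e.g. taking the odd elements from V2 gives
// RepeatedMask = {0, 9, 2, 11, 4, 13, 6, 15} and PermuteImm = 0xAA.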
37494 SmallVector<int, 8> RepeatedMask;
37495 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
37496 RepeatedMask)) {
37497 assert(RepeatedMask.size() == 8 &&
37498 "Repeated mask size doesn't match!");
37499 PermuteImm = 0;
37500 for (int i = 0; i < 8; ++i)
37501 if (RepeatedMask[i] >= 8)
37502 PermuteImm |= 1 << i;
37503 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37504 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37505 Shuffle = X86ISD::BLENDI;
37506 ShuffleVT = MaskVT;
37507 return true;
37509 } else {
37510 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37511 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37512 PermuteImm = (unsigned)BlendMask;
37513 Shuffle = X86ISD::BLENDI;
37514 ShuffleVT = MaskVT;
37515 return true;
37520 // Attempt to combine to INSERTPS, but only if it has elements that need to
37521 // be set to zero.
37522 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37523 MaskVT.is128BitVector() && isAnyZero(Mask) &&
37524 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37525 Shuffle = X86ISD::INSERTPS;
37526 ShuffleVT = MVT::v4f32;
37527 return true;
37530 // Attempt to combine to SHUFPD.
37531 if (AllowFloatDomain && EltSizeInBits == 64 &&
37532 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37533 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37534 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37535 bool ForceV1Zero = false, ForceV2Zero = false;
37536 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37537 PermuteImm, Mask, Zeroable)) {
37538 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37539 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37540 Shuffle = X86ISD::SHUFP;
37541 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37542 return true;
37546 // Attempt to combine to SHUFPS.
37547 if (AllowFloatDomain && EltSizeInBits == 32 &&
37548 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37549 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37550 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37551 SmallVector<int, 4> RepeatedMask;
37552 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37553 // Match each half of the repeated mask to determine whether it's just
37554 // referencing one of the vectors, is zeroable, or is entirely undef.
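// e.g. a repeated mask of {1, 3, 5, 7} takes elements 1,3 from V1 for the low
// half and elements 1,3 from V2 for the high half, giving SHUFPS with
// immediate 0xDD.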
37555 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37556 int M0 = RepeatedMask[Offset];
37557 int M1 = RepeatedMask[Offset + 1];
37559 if (isUndefInRange(RepeatedMask, Offset, 2)) {
37560 return DAG.getUNDEF(MaskVT);
37561 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37562 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37563 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37564 return getZeroVector(MaskVT, Subtarget, DAG, DL);
37565 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37566 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37567 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37568 return V1;
37569 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37570 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37571 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37572 return V2;
37575 return SDValue();
37578 int ShufMask[4] = {-1, -1, -1, -1};
37579 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37580 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37582 if (Lo && Hi) {
37583 V1 = Lo;
37584 V2 = Hi;
37585 Shuffle = X86ISD::SHUFP;
37586 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37587 PermuteImm = getV4X86ShuffleImm(ShufMask);
37588 return true;
37593 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37594 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37595 MaskVT.is128BitVector() &&
37596 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37597 Shuffle = X86ISD::INSERTPS;
37598 ShuffleVT = MVT::v4f32;
37599 return true;
37602 return false;
37605 static SDValue combineX86ShuffleChainWithExtract(
37606 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37607 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37608 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37609 const X86Subtarget &Subtarget);
37611 /// Combine an arbitrary chain of shuffles into a single instruction if
37612 /// possible.
37614 /// This is the leaf of the recursive combine below. When we have found some
37615 /// chain of single-use x86 shuffle instructions and accumulated the combined
37616 /// shuffle mask represented by them, this will try to pattern match that mask
37617 /// into either a single instruction if there is a special purpose instruction
37618 /// for this operation, or into a PSHUFB instruction, which is a fully general
37619 /// instruction but should only be used to replace chains over a certain depth.
37620 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37621 ArrayRef<int> BaseMask, int Depth,
37622 bool HasVariableMask,
37623 bool AllowVariableCrossLaneMask,
37624 bool AllowVariablePerLaneMask,
37625 SelectionDAG &DAG,
37626 const X86Subtarget &Subtarget) {
37627 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37628 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37629 "Unexpected number of shuffle inputs!");
37631 SDLoc DL(Root);
37632 MVT RootVT = Root.getSimpleValueType();
37633 unsigned RootSizeInBits = RootVT.getSizeInBits();
37634 unsigned NumRootElts = RootVT.getVectorNumElements();
37636 // Canonicalize shuffle input op to the requested type.
37637 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37638 if (VT.getSizeInBits() > Op.getValueSizeInBits())
37639 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
37640 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
37641 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37642 return DAG.getBitcast(VT, Op);
37645 // Find the inputs that enter the chain. Note that multiple uses are OK
37646 // here; we're not going to remove the operands we find.
37647 bool UnaryShuffle = (Inputs.size() == 1);
37648 SDValue V1 = peekThroughBitcasts(Inputs[0]);
37649 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37650 : peekThroughBitcasts(Inputs[1]));
37652 MVT VT1 = V1.getSimpleValueType();
37653 MVT VT2 = V2.getSimpleValueType();
37654 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
37655 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
37657 SDValue Res;
37659 unsigned NumBaseMaskElts = BaseMask.size();
37660 if (NumBaseMaskElts == 1) {
37661 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37662 return CanonicalizeShuffleInput(RootVT, V1);
37665 bool OptForSize = DAG.shouldOptForSize();
37666 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37667 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37668 (RootVT.isFloatingPoint() && Depth >= 1) ||
37669 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37671 // Don't combine if we are an AVX512/EVEX target and the mask element size
37672 // is different from the root element size - this would prevent writemasks
37673 // from being reused.
37674 bool IsMaskedShuffle = false;
37675 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
37676 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37677 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37678 IsMaskedShuffle = true;
37682 // If we are shuffling a splat (and not introducing zeros) then we can just
37683 // use it directly. This works for smaller elements as well, since they already
37684 // repeat across each mask element.
37685 if (UnaryShuffle && !isAnyZero(BaseMask) &&
37686 V1.getValueSizeInBits() >= RootSizeInBits &&
37687 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37688 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
37689 return CanonicalizeShuffleInput(RootVT, V1);
37692 SmallVector<int, 64> Mask(BaseMask);
37694 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37695 // etc. can be simplified.
37696 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37697 SmallVector<int> ScaledMask, IdentityMask;
37698 unsigned NumElts = VT1.getVectorNumElements();
37699 if (Mask.size() <= NumElts &&
37700 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37701 for (unsigned i = 0; i != NumElts; ++i)
37702 IdentityMask.push_back(i);
37703 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
37704 V2))
37705 return CanonicalizeShuffleInput(RootVT, V1);
37709 // Handle 128/256-bit lane shuffles of 512-bit vectors.
37710 if (RootVT.is512BitVector() &&
37711 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37712 // If the upper subvectors are zeroable, then an extract+insert is cheaper
37713 // than using X86ISD::SHUF128. The insertion is free, even if it has
37714 // to zero the upper subvectors.
37715 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37716 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37717 return SDValue(); // Nothing to do!
37718 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37719 "Unexpected lane shuffle");
37720 Res = CanonicalizeShuffleInput(RootVT, V1);
37721 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37722 bool UseZero = isAnyZero(Mask);
37723 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37724 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37727 // Narrow shuffle mask to v4x128.
37728 SmallVector<int, 4> ScaledMask;
37729 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37730 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37732 // Try to lower to vshuf64x2/vshuf32x4.
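// SHUF128 selects two 128-bit lanes from each source, so lanes 0-1 of the
// result must come from a single operand and lanes 2-3 from a single operand,
// e.g. a lane mask of {0, 1, 4, 5} becomes SHUF128(V1, V2) with PermMask
// {0, 1, 0, 1}.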
37733 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37734 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37735 SelectionDAG &DAG) {
37736 int PermMask[4] = {-1, -1, -1, -1};
37737 // Ensure elements came from the same Op.
37738 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37739 for (int i = 0; i < 4; ++i) {
37740 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37741 if (ScaledMask[i] < 0)
37742 continue;
37744 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37745 unsigned OpIndex = i / 2;
37746 if (Ops[OpIndex].isUndef())
37747 Ops[OpIndex] = Op;
37748 else if (Ops[OpIndex] != Op)
37749 return SDValue();
37751 PermMask[i] = ScaledMask[i] % 4;
37754 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37755 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37756 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37757 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
37760 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37761 // doesn't work because our mask is for 128 bits and we don't have an MVT
37762 // to match that.
37763 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37764 isUndefOrInRange(ScaledMask[1], 0, 2) &&
37765 isUndefOrInRange(ScaledMask[2], 2, 4) &&
37766 isUndefOrInRange(ScaledMask[3], 2, 4) &&
37767 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37768 ScaledMask[0] == (ScaledMask[2] % 2)) &&
37769 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37770 ScaledMask[1] == (ScaledMask[3] % 2));
37772 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37773 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37774 return SDValue(); // Nothing to do!
37775 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37776 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37777 return DAG.getBitcast(RootVT, V);
37781 // Handle 128-bit lane shuffles of 256-bit vectors.
37782 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37783 // If the upper half is zeroable, then an extract+insert is cheaper
37784 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
37785 // zero the upper half.
37786 if (isUndefOrZero(Mask[1])) {
37787 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37788 return SDValue(); // Nothing to do!
37789 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37790 Res = CanonicalizeShuffleInput(RootVT, V1);
37791 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37792 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37793 256);
37796 // If we're inserting the low subvector, an insert-subvector 'concat'
37797 // pattern is quicker than VPERM2X128.
37798 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37799 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37800 !Subtarget.hasAVX2()) {
37801 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37802 return SDValue(); // Nothing to do!
37803 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37804 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37805 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37806 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37809 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37810 return SDValue(); // Nothing to do!
37812 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37813 // we need to use the zeroing feature.
37814 // Prefer blends for sequential shuffles unless we are optimizing for size.
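// In the VPERM2X128 immediate each nibble selects a 128-bit source lane and
// bit 3 of the nibble zeroes that half of the result, e.g. Mask = {1,
// SM_SentinelZero} encodes as 0x81 (low half = lane 1, high half zeroed).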
37815 if (UnaryShuffle &&
37816 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37817 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
37818 unsigned PermMask = 0;
37819 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37820 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37821 return DAG.getNode(
37822 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37823 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37826 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37827 return SDValue(); // Nothing to do!
37829 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37830 if (!UnaryShuffle && !IsMaskedShuffle) {
37831 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37832 "Unexpected shuffle sentinel value");
37833 // Prefer blends to X86ISD::VPERM2X128.
37834 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37835 unsigned PermMask = 0;
37836 PermMask |= ((Mask[0] & 3) << 0);
37837 PermMask |= ((Mask[1] & 3) << 4);
37838 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37839 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37840 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37841 CanonicalizeShuffleInput(RootVT, LHS),
37842 CanonicalizeShuffleInput(RootVT, RHS),
37843 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37848 // For masks that have been widened to 128-bit elements or more,
37849 // narrow back down to 64-bit elements.
37850 if (BaseMaskEltSizeInBits > 64) {
37851 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37852 int MaskScale = BaseMaskEltSizeInBits / 64;
37853 SmallVector<int, 64> ScaledMask;
37854 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37855 Mask = std::move(ScaledMask);
37858 // For masked shuffles, we're trying to match the root width for better
37859 // writemask folding, so attempt to scale the mask.
37860 // TODO - variable shuffles might need this to be widened again.
37861 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37862 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37863 int MaskScale = NumRootElts / Mask.size();
37864 SmallVector<int, 64> ScaledMask;
37865 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37866 Mask = std::move(ScaledMask);
37869 unsigned NumMaskElts = Mask.size();
37870 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37872 // Determine the effective mask value type.
37873 FloatDomain &= (32 <= MaskEltSizeInBits);
37874 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37875 : MVT::getIntegerVT(MaskEltSizeInBits);
37876 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37878 // Only allow legal mask types.
37879 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37880 return SDValue();
37882 // Attempt to match the mask against known shuffle patterns.
37883 MVT ShuffleSrcVT, ShuffleVT;
37884 unsigned Shuffle, PermuteImm;
37886 // Which shuffle domains are permitted?
37887 // Permit domain crossing at higher combine depths.
37888 // TODO: Should we indicate which domain is preferred if both are allowed?
37889 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37890 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37891 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37893 // Determine zeroable mask elements.
37894 APInt KnownUndef, KnownZero;
37895 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37896 APInt Zeroable = KnownUndef | KnownZero;
37898 if (UnaryShuffle) {
37899 // Attempt to match against broadcast-from-vector.
37900 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37901 if ((Subtarget.hasAVX2() ||
37902 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37903 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37904 if (isUndefOrEqual(Mask, 0)) {
37905 if (V1.getValueType() == MaskVT &&
37906 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37907 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37908 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37909 return SDValue(); // Nothing to do!
37910 Res = V1.getOperand(0);
37911 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37912 return DAG.getBitcast(RootVT, Res);
37914 if (Subtarget.hasAVX2()) {
37915 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37916 return SDValue(); // Nothing to do!
37917 Res = CanonicalizeShuffleInput(MaskVT, V1);
37918 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37919 return DAG.getBitcast(RootVT, Res);
37924 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37925 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37926 (!IsMaskedShuffle ||
37927 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37928 if (Depth == 0 && Root.getOpcode() == Shuffle)
37929 return SDValue(); // Nothing to do!
37930 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37931 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37932 return DAG.getBitcast(RootVT, Res);
37935 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37936 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
37937 PermuteImm) &&
37938 (!IsMaskedShuffle ||
37939 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37940 if (Depth == 0 && Root.getOpcode() == Shuffle)
37941 return SDValue(); // Nothing to do!
37942 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37943 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37944 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37945 return DAG.getBitcast(RootVT, Res);
37949 // Attempt to combine to INSERTPS, but only if the inserted element has come
37950 // from a scalar.
37951 // TODO: Handle other insertions here as well?
37952 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37953 Subtarget.hasSSE41() &&
37954 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
37955 if (MaskEltSizeInBits == 32) {
37956 SDValue SrcV1 = V1, SrcV2 = V2;
37957 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37958 DAG) &&
37959 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37960 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37961 return SDValue(); // Nothing to do!
37962 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37963 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37964 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37965 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37966 return DAG.getBitcast(RootVT, Res);
37969 if (MaskEltSizeInBits == 64 &&
37970 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
37971 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37972 V2.getScalarValueSizeInBits() <= 32) {
37973 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37974 return SDValue(); // Nothing to do!
37975 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
37976 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37977 CanonicalizeShuffleInput(MVT::v4f32, V1),
37978 CanonicalizeShuffleInput(MVT::v4f32, V2),
37979 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37980 return DAG.getBitcast(RootVT, Res);
37984 SDValue NewV1 = V1; // Save operands in case early exit happens.
37985 SDValue NewV2 = V2;
37986 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37987 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37988 ShuffleVT, UnaryShuffle) &&
37989 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37990 if (Depth == 0 && Root.getOpcode() == Shuffle)
37991 return SDValue(); // Nothing to do!
37992 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37993 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37994 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37995 return DAG.getBitcast(RootVT, Res);
37998 NewV1 = V1; // Save operands in case early exit happens.
37999 NewV2 = V2;
38000 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38001 AllowIntDomain, NewV1, NewV2, DL, DAG,
38002 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38003 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38004 if (Depth == 0 && Root.getOpcode() == Shuffle)
38005 return SDValue(); // Nothing to do!
38006 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38007 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38008 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38009 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38010 return DAG.getBitcast(RootVT, Res);
38013 // Typically from here on, we need an integer version of MaskVT.
38014 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38015 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38017 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38018 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38019 uint64_t BitLen, BitIdx;
38020 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38021 Zeroable)) {
38022 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38023 return SDValue(); // Nothing to do!
38024 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38025 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38026 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38027 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38028 return DAG.getBitcast(RootVT, Res);
38031 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38032 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38033 return SDValue(); // Nothing to do!
38034 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38035 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38036 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38037 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38038 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38039 return DAG.getBitcast(RootVT, Res);
38043 // Match shuffle against TRUNCATE patterns.
38044 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38045 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38046 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38047 Subtarget)) {
38048 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38049 ShuffleSrcVT.getVectorNumElements();
38050 unsigned Opc =
38051 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38052 if (Depth == 0 && Root.getOpcode() == Opc)
38053 return SDValue(); // Nothing to do!
38054 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38055 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38056 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38057 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38058 return DAG.getBitcast(RootVT, Res);
38061 // Do we need a more general binary truncation pattern?
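// A mask of {0, 2, 4, 6, ...} across both sources is just a truncation of
// concat(V1, V2), so it can be formed as CONCAT_VECTORS + TRUNCATE below.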
38062 if (RootSizeInBits < 512 &&
38063 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38064 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38065 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38066 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38067 // Bail if this was already a truncation or PACK node.
38068 // We sometimes fail to match PACK if we demand known undef elements.
38069 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38070 Root.getOpcode() == X86ISD::PACKSS ||
38071 Root.getOpcode() == X86ISD::PACKUS))
38072 return SDValue(); // Nothing to do!
38073 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38074 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38075 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38076 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38077 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38078 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38079 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38080 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38081 return DAG.getBitcast(RootVT, Res);
38085 // Don't try to re-form single instruction chains under any circumstances now
38086 // that we've done encoding canonicalization for them.
38087 if (Depth < 1)
38088 return SDValue();
38090 // Depth threshold above which we can efficiently use variable mask shuffles.
38091 int VariableCrossLaneShuffleDepth =
38092 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38093 int VariablePerLaneShuffleDepth =
38094 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38095 AllowVariableCrossLaneMask &=
38096 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38097 AllowVariablePerLaneMask &=
38098 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38099 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38100 // higher depth before combining them.
38101 bool AllowBWIVPERMV3 =
38102 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38104 bool MaskContainsZeros = isAnyZero(Mask);
38106 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38107 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38108 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38109 if (Subtarget.hasAVX2() &&
38110 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38111 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38112 Res = CanonicalizeShuffleInput(MaskVT, V1);
38113 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38114 return DAG.getBitcast(RootVT, Res);
38116 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38117 if ((Subtarget.hasAVX512() &&
38118 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38119 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38120 (Subtarget.hasBWI() &&
38121 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38122 (Subtarget.hasVBMI() &&
38123 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38124 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38125 V2 = DAG.getUNDEF(MaskVT);
38126 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38127 return DAG.getBitcast(RootVT, Res);
38131 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38132 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38133 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38134 ((Subtarget.hasAVX512() &&
38135 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38136 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38137 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38138 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38139 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38140 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38141 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38142 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38143 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38144 for (unsigned i = 0; i != NumMaskElts; ++i)
38145 if (Mask[i] == SM_SentinelZero)
38146 Mask[i] = NumMaskElts + i;
38147 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38148 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38149 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38150 return DAG.getBitcast(RootVT, Res);
38153 // If that failed and either input is extracted then try to combine as a
38154 // shuffle with the larger type.
38155 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38156 Inputs, Root, BaseMask, Depth, HasVariableMask,
38157 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38158 Subtarget))
38159 return WideShuffle;
38161 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38162 // (non-VLX will pad to 512-bit shuffles).
38163 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38164 ((Subtarget.hasAVX512() &&
38165 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38166 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38167 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38168 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38169 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38170 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38171 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38172 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38173 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38174 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38175 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38176 return DAG.getBitcast(RootVT, Res);
38178 return SDValue();
38181 // See if we can combine a single input shuffle with zeros to a bit-mask,
38182 // which is much simpler than any shuffle.
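// e.g. Mask = {0, SM_SentinelZero, 2, SM_SentinelZero} keeps elements 0 and 2
// in place and clears the rest, which is just an AND with {-1, 0, -1, 0}.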
38183 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38184 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38185 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
38186 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38187 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38188 APInt UndefElts(NumMaskElts, 0);
38189 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38190 for (unsigned i = 0; i != NumMaskElts; ++i) {
38191 int M = Mask[i];
38192 if (M == SM_SentinelUndef) {
38193 UndefElts.setBit(i);
38194 continue;
38196 if (M == SM_SentinelZero)
38197 continue;
38198 EltBits[i] = AllOnes;
38200 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38201 Res = CanonicalizeShuffleInput(MaskVT, V1);
38202 unsigned AndOpcode =
38203 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38204 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38205 return DAG.getBitcast(RootVT, Res);
38208 // If we have a single input shuffle with different shuffle patterns in the
38209 // 128-bit lanes, lower to VPERMILPS with a variable mask.
38210 // TODO: Combine other mask types at higher depths.
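// e.g. a v8f32 mask of {1, 0, 3, 2, 4, 5, 6, 7} uses a different pattern in
// each 128-bit lane, which the immediate form of VPERMILPS can't express but
// the variable-mask form can (each index is used modulo 4 within its lane).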
38211 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38212 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38213 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38214 SmallVector<SDValue, 16> VPermIdx;
38215 for (int M : Mask) {
38216 SDValue Idx =
38217 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38218 VPermIdx.push_back(Idx);
38220 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38221 Res = CanonicalizeShuffleInput(MaskVT, V1);
38222 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38223 return DAG.getBitcast(RootVT, Res);
38226 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38227 // to VPERMIL2PD/VPERMIL2PS.
38228 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38229 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38230 MaskVT == MVT::v8f32)) {
38231 // VPERMIL2 Operation.
38232 // Bits[3] - Match Bit.
38233 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38234 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
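// A zeroable element is encoded by pushing selector value 8 (match bit set)
// and setting the M2Z immediate to 2; for 64-bit elements the index is
// doubled so that it lands in bits[2:1].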
38235 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38236 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38237 SmallVector<int, 8> VPerm2Idx;
38238 unsigned M2ZImm = 0;
38239 for (int M : Mask) {
38240 if (M == SM_SentinelUndef) {
38241 VPerm2Idx.push_back(-1);
38242 continue;
38244 if (M == SM_SentinelZero) {
38245 M2ZImm = 2;
38246 VPerm2Idx.push_back(8);
38247 continue;
38249 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38250 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38251 VPerm2Idx.push_back(Index);
38253 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38254 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38255 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38256 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38257 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38258 return DAG.getBitcast(RootVT, Res);
38261 // If we have 3 or more shuffle instructions or a chain involving a variable
38262 // mask, we can replace them with a single PSHUFB instruction profitably.
38263 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
38264 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
38265 // more aggressive.
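// In the PSHUFB mask a byte with bit 7 set (0x80) zeroes that destination
// byte, so SM_SentinelZero maps to 0x80 and undef elements stay undef.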
38266 if (UnaryShuffle && AllowVariablePerLaneMask &&
38267 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38268 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38269 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38270 SmallVector<SDValue, 16> PSHUFBMask;
38271 int NumBytes = RootVT.getSizeInBits() / 8;
38272 int Ratio = NumBytes / NumMaskElts;
38273 for (int i = 0; i < NumBytes; ++i) {
38274 int M = Mask[i / Ratio];
38275 if (M == SM_SentinelUndef) {
38276 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38277 continue;
38279 if (M == SM_SentinelZero) {
38280 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38281 continue;
38283 M = Ratio * M + i % Ratio;
38284 assert((M / 16) == (i / 16) && "Lane crossing detected");
38285 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38287 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38288 Res = CanonicalizeShuffleInput(ByteVT, V1);
38289 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38290 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38291 return DAG.getBitcast(RootVT, Res);
38294 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38295 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38296 // slower than PSHUFB on targets that support both.
38297 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38298 Subtarget.hasXOP()) {
38299 // VPPERM Mask Operation
38300 // Bits[4:0] - Byte Index (0 - 31)
38301 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
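// e.g. a selector byte of 0x12 copies byte 18 (byte 2 of the second source),
// while 0x80 (operation 4) zeroes the destination byte.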
38302 SmallVector<SDValue, 16> VPPERMMask;
38303 int NumBytes = 16;
38304 int Ratio = NumBytes / NumMaskElts;
38305 for (int i = 0; i < NumBytes; ++i) {
38306 int M = Mask[i / Ratio];
38307 if (M == SM_SentinelUndef) {
38308 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38309 continue;
38311 if (M == SM_SentinelZero) {
38312 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38313 continue;
38315 M = Ratio * M + i % Ratio;
38316 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38318 MVT ByteVT = MVT::v16i8;
38319 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38320 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38321 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38322 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38323 return DAG.getBitcast(RootVT, Res);
38326 // If that failed and either input is extracted then try to combine as a
38327 // shuffle with the larger type.
38328 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38329 Inputs, Root, BaseMask, Depth, HasVariableMask,
38330 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38331 return WideShuffle;
38333 // If we have a dual input shuffle then lower to VPERMV3,
38334 // (non-VLX will pad to 512-bit shuffles)
38335 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38336 ((Subtarget.hasAVX512() &&
38337 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38338 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38339 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38340 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38341 MaskVT == MVT::v16i32)) ||
38342 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38343 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38344 MaskVT == MVT::v32i16)) ||
38345 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38346 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38347 MaskVT == MVT::v64i8)))) {
38348 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38349 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38350 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38351 return DAG.getBitcast(RootVT, Res);
38354 // Failed to find any combines.
38355 return SDValue();
38358 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
38359 // instruction if possible.
38361 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38362 // type size to attempt to combine:
38363 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38364 // -->
38365 // extract_subvector(shuffle(x,y,m2),0)
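// e.g. a v4i32 shuffle of two v4i32 subvectors extracted from v8i32 sources
// becomes a v8i32 shuffle of the original sources (mask widened and padded
// with undefs), followed by an extract of the low 128 bits.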
38366 static SDValue combineX86ShuffleChainWithExtract(
38367 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38368 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38369 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38370 const X86Subtarget &Subtarget) {
38371 unsigned NumMaskElts = BaseMask.size();
38372 unsigned NumInputs = Inputs.size();
38373 if (NumInputs == 0)
38374 return SDValue();
38376 EVT RootVT = Root.getValueType();
38377 unsigned RootSizeInBits = RootVT.getSizeInBits();
38378 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38379 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38381 // Peek through extract_subvector to find widest legal vector.
38382 // TODO: Handle ISD::TRUNCATE
38383 unsigned WideSizeInBits = RootSizeInBits;
38384 for (unsigned I = 0; I != NumInputs; ++I) {
38385 SDValue Input = peekThroughBitcasts(Inputs[I]);
38386 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38387 Input = peekThroughBitcasts(Input.getOperand(0));
38388 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38389 WideSizeInBits < Input.getValueSizeInBits())
38390 WideSizeInBits = Input.getValueSizeInBits();
38393 // Bail if we fail to find a source larger than the existing root.
38394 unsigned Scale = WideSizeInBits / RootSizeInBits;
38395 if (WideSizeInBits <= RootSizeInBits ||
38396 (WideSizeInBits % RootSizeInBits) != 0)
38397 return SDValue();
38399 // Create new mask for larger type.
38400 SmallVector<int, 64> WideMask(BaseMask);
38401 for (int &M : WideMask) {
38402 if (M < 0)
38403 continue;
38404 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38406 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38408 // Attempt to peek through inputs and adjust mask when we extract from an
38409 // upper subvector.
38410 int AdjustedMasks = 0;
38411 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38412 for (unsigned I = 0; I != NumInputs; ++I) {
38413 SDValue &Input = WideInputs[I];
38414 Input = peekThroughBitcasts(Input);
38415 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38416 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
38417 uint64_t Idx = Input.getConstantOperandVal(1);
38418 if (Idx != 0) {
38419 ++AdjustedMasks;
38420 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
38421 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
38423 int lo = I * WideMask.size();
38424 int hi = (I + 1) * WideMask.size();
38425 for (int &M : WideMask)
38426 if (lo <= M && M < hi)
38427 M += Idx;
38429 Input = peekThroughBitcasts(Input.getOperand(0));
38433 // Remove unused/repeated shuffle source ops.
38434 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38435 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38437 // Bail if we're always extracting from the lowest subvectors
38438 // (combineX86ShuffleChain should match this for the current width), or if the
38439 // shuffle still references too many inputs.
38440 if (AdjustedMasks == 0 || WideInputs.size() > 2)
38441 return SDValue();
38443 // Minor canonicalization of the accumulated shuffle mask to make it easier
38444 // to match below. All this does is detect masks with sequential pairs of
38445 // elements, and shrink them to the half-width mask. It does this in a loop
38446 // so it will reduce the size of the mask to the minimal width mask which
38447 // performs an equivalent shuffle.
38448 while (WideMask.size() > 1) {
38449 SmallVector<int, 64> WidenedMask;
38450 if (!canWidenShuffleElements(WideMask, WidenedMask))
38451 break;
38452 WideMask = std::move(WidenedMask);
38455 // Canonicalization of binary shuffle masks to improve pattern matching by
38456 // commuting the inputs.
38457 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
38458 ShuffleVectorSDNode::commuteMask(WideMask);
38459 std::swap(WideInputs[0], WideInputs[1]);
38462 // Increase depth for every upper subvector we've peeked through.
38463 Depth += AdjustedMasks;
38465 // Attempt to combine wider chain.
38466 // TODO: Can we use a better Root?
38467 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38468 WideInputs.back().getValueSizeInBits()
38469 ? WideInputs.front()
38470 : WideInputs.back();
38471 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
38472 "WideRootSize mismatch");
38474 if (SDValue WideShuffle =
38475 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38476 HasVariableMask, AllowVariableCrossLaneMask,
38477 AllowVariablePerLaneMask, DAG, Subtarget)) {
38478 WideShuffle =
38479 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38480 return DAG.getBitcast(RootVT, WideShuffle);
38483 return SDValue();
38486 // Canonicalize the combined shuffle mask chain with horizontal ops.
38487 // NOTE: This may update the Ops and Mask.
38488 static SDValue canonicalizeShuffleMaskWithHorizOp(
38489 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38490 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38491 const X86Subtarget &Subtarget) {
38492 if (Mask.empty() || Ops.empty())
38493 return SDValue();
38495 SmallVector<SDValue> BC;
38496 for (SDValue Op : Ops)
38497 BC.push_back(peekThroughBitcasts(Op));
38499 // All ops must be the same horizop + type.
38500 SDValue BC0 = BC[0];
38501 EVT VT0 = BC0.getValueType();
38502 unsigned Opcode0 = BC0.getOpcode();
38503 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
38504 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
38506 return SDValue();
38508 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
38509 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
38510 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
38511 if (!isHoriz && !isPack)
38512 return SDValue();
38514 // Do all ops have a single use?
38515 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
38516 return Op.hasOneUse() &&
38517 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
38520 int NumElts = VT0.getVectorNumElements();
38521 int NumLanes = VT0.getSizeInBits() / 128;
38522 int NumEltsPerLane = NumElts / NumLanes;
38523 int NumHalfEltsPerLane = NumEltsPerLane / 2;
38524 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
38525 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38527 if (NumEltsPerLane >= 4 &&
38528 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38529 SmallVector<int> LaneMask, ScaledMask;
38530 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38531 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38532 // See if we can remove the shuffle by reordering the HOP chain so that
38533 // the HOP args are pre-shuffled.
38534 // TODO: Generalize to any sized/depth chain.
38535 // TODO: Add support for PACKSS/PACKUS.
38536 if (isHoriz) {
38537 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
38538 auto GetHOpSrc = [&](int M) {
38539 if (M == SM_SentinelUndef)
38540 return DAG.getUNDEF(VT0);
38541 if (M == SM_SentinelZero)
38542 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38543 SDValue Src0 = BC[M / 4];
38544 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38545 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38546 return Src1.getOperand(M % 2);
38547 return SDValue();
38549 SDValue M0 = GetHOpSrc(ScaledMask[0]);
38550 SDValue M1 = GetHOpSrc(ScaledMask[1]);
38551 SDValue M2 = GetHOpSrc(ScaledMask[2]);
38552 SDValue M3 = GetHOpSrc(ScaledMask[3]);
38553 if (M0 && M1 && M2 && M3) {
38554 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38555 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38556 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38559 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38560 if (Ops.size() >= 2) {
38561 SDValue LHS, RHS;
38562 auto GetHOpSrc = [&](int M, int &OutM) {
38563 // TODO: Support SM_SentinelZero
38564 if (M < 0)
38565 return M == SM_SentinelUndef;
38566 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38567 if (!LHS || LHS == Src) {
38568 LHS = Src;
38569 OutM = (M % 2);
38570 return true;
38572 if (!RHS || RHS == Src) {
38573 RHS = Src;
38574 OutM = (M % 2) + 2;
38575 return true;
38577 return false;
38579 int PostMask[4] = {-1, -1, -1, -1};
38580 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38581 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38582 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38583 GetHOpSrc(ScaledMask[3], PostMask[3])) {
38584 LHS = DAG.getBitcast(SrcVT, LHS);
38585 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38586 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38587 // Use SHUFPS for the permute so this will work on SSE2 targets,
38588 // shuffle combining and domain handling will simplify this later on.
38589 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38590 Res = DAG.getBitcast(ShuffleVT, Res);
38591 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38592 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38598 if (2 < Ops.size())
38599 return SDValue();
38601 SDValue BC1 = BC[BC.size() - 1];
38602 if (Mask.size() == VT0.getVectorNumElements()) {
38603 // Canonicalize binary shuffles of horizontal ops that use the
38604 // same sources to a unary shuffle.
38605 // TODO: Try to perform this fold even if the shuffle remains.
38606 if (Ops.size() == 2) {
38607 auto ContainsOps = [](SDValue HOp, SDValue Op) {
38608 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38610 // Commute if all BC0's ops are contained in BC1.
38611 if (ContainsOps(BC1, BC0.getOperand(0)) &&
38612 ContainsOps(BC1, BC0.getOperand(1))) {
38613 ShuffleVectorSDNode::commuteMask(Mask);
38614 std::swap(Ops[0], Ops[1]);
38615 std::swap(BC0, BC1);
38618 // If BC1 can be represented by BC0, then convert to unary shuffle.
38619 if (ContainsOps(BC0, BC1.getOperand(0)) &&
38620 ContainsOps(BC0, BC1.getOperand(1))) {
38621 for (int &M : Mask) {
38622 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38623 continue;
38624 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38625 M -= NumElts + (SubLane * NumHalfEltsPerLane);
38626 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38627 M += NumHalfEltsPerLane;
38632 // Canonicalize unary horizontal ops to only refer to lower halves.
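// e.g. for HADD(X, X) the lower and upper halves of each 128-bit lane hold
// the same results, so references to the upper half can be remapped to the
// lower half.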
38633 for (int i = 0; i != NumElts; ++i) {
38634 int &M = Mask[i];
38635 if (isUndefOrZero(M))
38636 continue;
38637 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38638 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38639 M -= NumHalfEltsPerLane;
38640 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38641 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38642 M -= NumHalfEltsPerLane;
38646 // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
38647 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38648 // represents the LHS/RHS inputs for the lower/upper halves.
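// e.g. shuffling HADD(A, B) and HADD(C, D) so that each lane keeps the first
// op's LHS results and the second op's RHS results can be combined into
// HADD(A, D).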
38649 SmallVector<int, 16> TargetMask128, WideMask128;
38650 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38651 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38652 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38653 bool SingleOp = (Ops.size() == 1);
38654 if (isPack || OneUseOps ||
38655 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38656 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38657 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38658 Lo = Lo.getOperand(WideMask128[0] & 1);
38659 Hi = Hi.getOperand(WideMask128[1] & 1);
38660 if (SingleOp) {
38661 SDValue Undef = DAG.getUNDEF(SrcVT);
38662 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38663 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38664 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38665 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38666 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38668 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38672 // If we are post-shuffling a 256-bit hop and not requiring the upper
38673 // elements, then try to narrow to a 128-bit hop directly.
38674 SmallVector<int, 16> WideMask64;
38675 if (Ops.size() == 1 && NumLanes == 2 &&
38676 scaleShuffleElements(Mask, 4, WideMask64) &&
38677 isUndefInRange(WideMask64, 2, 2)) {
38678 int M0 = WideMask64[0];
38679 int M1 = WideMask64[1];
38680 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
38681 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
38682 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
38683 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
38684 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
38685 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
38686 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
38687 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
38691 return SDValue();
38694 // Attempt to constant fold all of the constant source ops.
38695 // Returns the folded constant if the entire shuffle folds to a constant.
38696 // TODO: Extend this to merge multiple constant Ops and update the mask.
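// e.g. a shuffle whose sources are all constant build vectors can be replaced
// outright by a single reshuffled constant vector, with zeroable and undef
// mask elements becoming zero/undef lanes.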
38697 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38698 ArrayRef<int> Mask, SDValue Root,
38699 bool HasVariableMask,
38700 SelectionDAG &DAG,
38701 const X86Subtarget &Subtarget) {
38702 MVT VT = Root.getSimpleValueType();
38704 unsigned SizeInBits = VT.getSizeInBits();
38705 unsigned NumMaskElts = Mask.size();
38706 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38707 unsigned NumOps = Ops.size();
38709 // Extract constant bits from each source op.
38710 SmallVector<APInt, 16> UndefEltsOps(NumOps);
38711 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38712 for (unsigned I = 0; I != NumOps; ++I)
38713 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
38714 RawBitsOps[I]))
38715 return SDValue();
38717 // If we're optimizing for size, only fold if at least one of the constants is
38718 // only used once or the combined shuffle has included a variable mask
38719 // shuffle; this avoids constant pool bloat.
38720 bool IsOptimizingSize = DAG.shouldOptForSize();
38721 if (IsOptimizingSize && !HasVariableMask &&
38722 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
38723 return SDValue();
38725 // Shuffle the constant bits according to the mask.
38726 SDLoc DL(Root);
38727 APInt UndefElts(NumMaskElts, 0);
38728 APInt ZeroElts(NumMaskElts, 0);
38729 APInt ConstantElts(NumMaskElts, 0);
38730 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38731 APInt::getZero(MaskSizeInBits));
38732 for (unsigned i = 0; i != NumMaskElts; ++i) {
38733 int M = Mask[i];
38734 if (M == SM_SentinelUndef) {
38735 UndefElts.setBit(i);
38736 continue;
38737 } else if (M == SM_SentinelZero) {
38738 ZeroElts.setBit(i);
38739 continue;
38741 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38743 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38744 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38746 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38747 if (SrcUndefElts[SrcMaskIdx]) {
38748 UndefElts.setBit(i);
38749 continue;
38752 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38753 APInt &Bits = SrcEltBits[SrcMaskIdx];
38754 if (!Bits) {
38755 ZeroElts.setBit(i);
38756 continue;
38759 ConstantElts.setBit(i);
38760 ConstantBitData[i] = Bits;
38762 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38764 // Attempt to create a zero vector.
38765 if ((UndefElts | ZeroElts).isAllOnes())
38766 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38768 // Create the constant data.
38769 MVT MaskSVT;
38770 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38771 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38772 else
38773 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38775 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38776 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38777 return SDValue();
38779 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38780 return DAG.getBitcast(VT, CstOp);
38783 namespace llvm {
38784 namespace X86 {
38785 enum {
38786 MaxShuffleCombineDepth = 8
38788 } // namespace X86
38789 } // namespace llvm
38791 /// Fully generic combining of x86 shuffle instructions.
38793 /// This should be the last combine run over the x86 shuffle instructions. Once
38794 /// they have been fully optimized, this will recursively consider all chains
38795 /// of single-use shuffle instructions, build a generic model of the cumulative
38796 /// shuffle operation, and check for simpler instructions which implement this
38797 /// operation. We use this primarily for two purposes:
38799 /// 1) Collapse generic shuffles to specialized single instructions when
38800 /// equivalent. In most cases, this is just an encoding size win, but
38801 /// sometimes we will collapse multiple generic shuffles into a single
38802 /// special-purpose shuffle.
38803 /// 2) Look for sequences of shuffle instructions with 3 or more total
38804 /// instructions, and replace them with the slightly more expensive SSSE3
38805 /// PSHUFB instruction if available. We do this as the last combining step
38806 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
38807 /// a suitable short sequence of other instructions. The PSHUFB will either
38808 /// use a register or have to read from memory and so is slightly (but only
38809 /// slightly) more expensive than the other shuffle instructions.
38811 /// Because this is inherently a quadratic operation (for each shuffle in
38812 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38813 /// This should never be an issue in practice as the shuffle lowering doesn't
38814 /// produce sequences of more than 8 instructions.
38816 /// FIXME: We will currently miss some cases where the redundant shuffling
38817 /// would simplify under the threshold for PSHUFB formation because of
38818 /// combine-ordering. To fix this, we should do the redundant instruction
38819 /// combining in this recursive walk.
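///
/// As a small illustration, PSHUFD(PSHUFD(x, 0xB1), 0xB1) composes the mask
/// <1,0,3,2> with itself into the identity mask - exactly the kind of
/// redundancy this recursive walk is meant to expose and remove.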
38820 static SDValue combineX86ShufflesRecursively(
38821 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38822 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38823 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38824 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38825 const X86Subtarget &Subtarget) {
38826 assert(!RootMask.empty() &&
38827 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38828 "Illegal shuffle root mask");
38829 MVT RootVT = Root.getSimpleValueType();
38830 assert(RootVT.isVector() && "Shuffles operate on vector types!");
38831 unsigned RootSizeInBits = RootVT.getSizeInBits();
38833 // Bound the depth of our recursive combine because this is ultimately
38834 // quadratic in nature.
38835 if (Depth >= MaxDepth)
38836 return SDValue();
38838 // Directly rip through bitcasts to find the underlying operand.
38839 SDValue Op = SrcOps[SrcOpIndex];
38840 Op = peekThroughOneUseBitcasts(Op);
38842 EVT VT = Op.getValueType();
38843 if (!VT.isVector() || !VT.isSimple())
38844 return SDValue(); // Bail if we hit a non-simple non-vector.
38846 // FIXME: Just bail on f16 for now.
38847 if (VT.getVectorElementType() == MVT::f16)
38848 return SDValue();
38850 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38851 "Can only combine shuffles upto size of the root op.");
38853 // Create a demanded elts mask from the referenced elements of Op.
38854 APInt OpDemandedElts = APInt::getZero(RootMask.size());
38855 for (int M : RootMask) {
38856 int BaseIdx = RootMask.size() * SrcOpIndex;
38857 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
38858 OpDemandedElts.setBit(M - BaseIdx);
38860 if (RootSizeInBits != VT.getSizeInBits()) {
38861 // Op is smaller than Root - extract the demanded elts for the subvector.
38862 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
38863 unsigned NumOpMaskElts = RootMask.size() / Scale;
38864 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
38865 assert(OpDemandedElts
38866 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
38867 .isZero() &&
38868 "Out of range elements referenced in root mask");
38869 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
38871 OpDemandedElts =
38872 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
38874 // Extract target shuffle mask and resolve sentinels and inputs.
38875 SmallVector<int, 64> OpMask;
38876 SmallVector<SDValue, 2> OpInputs;
38877 APInt OpUndef, OpZero;
38878 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38879 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38880 OpZero, DAG, Depth, false)) {
38881 // Shuffle inputs must not be larger than the shuffle result.
38882 // TODO: Relax this for single input faux shuffles (e.g. trunc).
38883 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38884 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38886 return SDValue();
38887 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38888 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
38889 !isNullConstant(Op.getOperand(1))) {
38890 SDValue SrcVec = Op.getOperand(0);
38891 int ExtractIdx = Op.getConstantOperandVal(1);
38892 unsigned NumElts = VT.getVectorNumElements();
38893 OpInputs.assign({SrcVec});
38894 OpMask.assign(NumElts, SM_SentinelUndef);
38895 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
38896 OpZero = OpUndef = APInt::getZero(NumElts);
38897 } else {
38898 return SDValue();
38901 // If the shuffle result was smaller than the root, we need to adjust the
38902 // mask indices and pad the mask with undefs.
38903 if (RootSizeInBits > VT.getSizeInBits()) {
38904 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38905 unsigned OpMaskSize = OpMask.size();
38906 if (OpInputs.size() > 1) {
38907 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38908 for (int &M : OpMask) {
38909 if (M < 0)
38910 continue;
38911 int EltIdx = M % OpMaskSize;
38912 int OpIdx = M / OpMaskSize;
38913 M = (PaddedMaskSize * OpIdx) + EltIdx;
38916 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38917 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38918 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38921 SmallVector<int, 64> Mask;
38922 SmallVector<SDValue, 16> Ops;
38924 // We don't need to merge masks if the root is empty.
38925 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38926 if (EmptyRoot) {
38927 // Only resolve zeros if it will remove an input; otherwise we might end
38928 // up in an infinite loop.
38929 bool ResolveKnownZeros = true;
38930 if (!OpZero.isZero()) {
38931 APInt UsedInputs = APInt::getZero(OpInputs.size());
38932 for (int i = 0, e = OpMask.size(); i != e; ++i) {
38933 int M = OpMask[i];
38934 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38935 continue;
38936 UsedInputs.setBit(M / OpMask.size());
38937 if (UsedInputs.isAllOnes()) {
38938 ResolveKnownZeros = false;
38939 break;
38943 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38944 ResolveKnownZeros);
38946 Mask = OpMask;
38947 Ops.append(OpInputs.begin(), OpInputs.end());
38948 } else {
38949 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38951 // Add the inputs to the Ops list, avoiding duplicates.
38952 Ops.append(SrcOps.begin(), SrcOps.end());
38954 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38955 // Attempt to find an existing match.
38956 SDValue InputBC = peekThroughBitcasts(Input);
38957 for (int i = 0, e = Ops.size(); i < e; ++i)
38958 if (InputBC == peekThroughBitcasts(Ops[i]))
38959 return i;
38960 // Match failed - should we replace an existing Op?
38961 if (InsertionPoint >= 0) {
38962 Ops[InsertionPoint] = Input;
38963 return InsertionPoint;
38965 // Add to the end of the Ops list.
38966 Ops.push_back(Input);
38967 return Ops.size() - 1;
38970 SmallVector<int, 2> OpInputIdx;
38971 for (SDValue OpInput : OpInputs)
38972 OpInputIdx.push_back(
38973 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38975 assert(((RootMask.size() > OpMask.size() &&
38976 RootMask.size() % OpMask.size() == 0) ||
38977 (OpMask.size() > RootMask.size() &&
38978 OpMask.size() % RootMask.size() == 0) ||
38979 OpMask.size() == RootMask.size()) &&
38980 "The smaller number of elements must divide the larger.");
38982 // This function can be performance-critical, so we rely on the power-of-2
38983 // knowledge that we have about the mask sizes to replace div/rem ops with
38984 // bit-masks and shifts.
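// (E.g. i / RootRatio becomes i >> RootRatioLog2 below, and a remainder such
// as RootMaskedIdx % MaskWidth becomes RootMaskedIdx & (MaskWidth - 1).)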
38985 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
38986 "Non-power-of-2 shuffle mask sizes");
38987 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
38988 "Non-power-of-2 shuffle mask sizes");
38989 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
38990 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
38992 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38993 unsigned RootRatio =
38994 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38995 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38996 assert((RootRatio == 1 || OpRatio == 1) &&
38997 "Must not have a ratio for both incoming and op masks!");
38999 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39000 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39001 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39002 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39003 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39005 Mask.resize(MaskWidth, SM_SentinelUndef);
39007 // Merge this shuffle operation's mask into our accumulated mask. Note that
39008 // this shuffle's mask will be the first applied to the input, followed by
39009 // the root mask to get us all the way to the root value arrangement. The
39010 // reason for this order is that we are recursing up the operation chain.
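// Worked example: with a RootMask of size 4 and an OpMask of size 8,
// MaskWidth is 8, RootRatio is 2 and OpRatio is 1, so each root mask element
// expands to two adjacent op-granularity elements before being mapped
// through OpMask.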
39011 for (unsigned i = 0; i < MaskWidth; ++i) {
39012 unsigned RootIdx = i >> RootRatioLog2;
39013 if (RootMask[RootIdx] < 0) {
39014 // This is a zero or undef lane, we're done.
39015 Mask[i] = RootMask[RootIdx];
39016 continue;
39019 unsigned RootMaskedIdx =
39020 RootRatio == 1
39021 ? RootMask[RootIdx]
39022 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39024 // Just insert the scaled root mask value if it references an input other
39025 // than the SrcOp we're currently inserting.
39026 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39027 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39028 Mask[i] = RootMaskedIdx;
39029 continue;
39032 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39033 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39034 if (OpMask[OpIdx] < 0) {
39035 // The incoming lanes are zero or undef; it doesn't matter which ones we
39036 // are using.
39037 Mask[i] = OpMask[OpIdx];
39038 continue;
39041 // OK, we have non-zero lanes; map them through to one of the Op's inputs.
39042 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39043 : (OpMask[OpIdx] << OpRatioLog2) +
39044 (RootMaskedIdx & (OpRatio - 1));
39046 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39047 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39048 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39049 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39051 Mask[i] = OpMaskedIdx;
39055 // Peek through vector widenings and set out of bounds mask indices to undef.
39056 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39057 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39058 SDValue &Op = Ops[I];
39059 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39060 isNullConstant(Op.getOperand(2))) {
39061 Op = Op.getOperand(1);
39062 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39063 int Lo = I * Mask.size();
39064 int Hi = (I + 1) * Mask.size();
39065 int NewHi = Lo + (Mask.size() / Scale);
39066 for (int &M : Mask) {
39067 if (Lo <= M && NewHi <= M && M < Hi)
39068 M = SM_SentinelUndef;
39073 // Peek through any free extract_subvector nodes back to root size.
39074 for (SDValue &Op : Ops)
39075 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39076 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39077 isNullConstant(Op.getOperand(1)))
39078 Op = Op.getOperand(0);
39080 // Remove unused/repeated shuffle source ops.
39081 resolveTargetShuffleInputsAndMask(Ops, Mask);
39083 // Handle the all undef/zero/ones cases early.
39084 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39085 return DAG.getUNDEF(RootVT);
39086 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39087 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39088 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39089 !llvm::is_contained(Mask, SM_SentinelZero))
39090 return getOnesVector(RootVT, DAG, SDLoc(Root));
39092 assert(!Ops.empty() && "Shuffle with no inputs detected");
39093 HasVariableMask |= IsOpVariableMask;
39095 // Update the list of shuffle nodes that have been combined so far.
39096 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39097 SrcNodes.end());
39098 CombinedNodes.push_back(Op.getNode());
39100 // See if we can recurse into each shuffle source op (if it's a target
39101 // shuffle). The source op should only be generally combined if it either has
39102 // a single use (i.e. the current Op) or all its users have already been
39103 // combined; if not, we can still combine but should prevent generation of
39104 // variable shuffles to avoid constant pool bloat.
39105 // Don't recurse if we already have more source ops than we can combine in
39106 // the remaining recursion depth.
39107 if (Ops.size() < (MaxDepth - Depth)) {
39108 for (int i = 0, e = Ops.size(); i < e; ++i) {
39109 // For empty roots, we need to resolve zeroable elements before combining
39110 // them with other shuffles.
39111 SmallVector<int, 64> ResolvedMask = Mask;
39112 if (EmptyRoot)
39113 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39114 bool AllowCrossLaneVar = false;
39115 bool AllowPerLaneVar = false;
39116 if (Ops[i].getNode()->hasOneUse() ||
39117 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39118 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39119 AllowPerLaneVar = AllowVariablePerLaneMask;
39121 if (SDValue Res = combineX86ShufflesRecursively(
39122 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39123 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39124 Subtarget))
39125 return Res;
39129 // Attempt to constant fold all of the constant source ops.
39130 if (SDValue Cst = combineX86ShufflesConstants(
39131 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39132 return Cst;
39134 // If constant folding failed and we only have constants, then we have
39135 // multiple uses by a single non-variable shuffle - just bail.
39136 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39137 APInt UndefElts;
39138 SmallVector<APInt> RawBits;
39139 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39140 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39141 RawBits);
39142 })) {
39143 return SDValue();
39146 // Canonicalize the combined shuffle mask chain with horizontal ops.
39147 // NOTE: This will update the Ops and Mask.
39148 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39149 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39150 return DAG.getBitcast(RootVT, HOp);
39152 // Try to refine our inputs given our knowledge of target shuffle mask.
39153 for (auto I : enumerate(Ops)) {
39154 int OpIdx = I.index();
39155 SDValue &Op = I.value();
39157 // What range of shuffle mask element values results in picking from Op?
39158 int Lo = OpIdx * Mask.size();
39159 int Hi = Lo + Mask.size();
39161 // Which elements of Op do we demand, given the mask's granularity?
39162 APInt OpDemandedElts(Mask.size(), 0);
39163 for (int MaskElt : Mask) {
39164 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39165 int OpEltIdx = MaskElt - Lo;
39166 OpDemandedElts.setBit(OpEltIdx);
39170 // Is the shuffle result smaller than the root?
39171 if (Op.getValueSizeInBits() < RootSizeInBits) {
39172 // We padded the mask with undefs. But we now need to undo that.
39173 unsigned NumExpectedVectorElts = Mask.size();
39174 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39175 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39176 assert(!OpDemandedElts.extractBits(
39177 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39178 "Demanding the virtual undef widening padding?");
39179 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39182 // The Op itself may be of different VT, so we need to scale the mask.
39183 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39184 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39186 // Can this operand be simplified any further, given its demanded elements?
39187 if (SDValue NewOp =
39188 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
39189 Op, OpScaledDemandedElts, DAG))
39190 Op = NewOp;
39192 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39194 // Widen any subvector shuffle inputs we've collected.
39195 // TODO: Remove this to avoid generating temporary nodes; we should only
39196 // widen once combineX86ShuffleChain has found a match.
39197 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39198 return Op.getValueSizeInBits() < RootSizeInBits;
39199 })) {
39200 for (SDValue &Op : Ops)
39201 if (Op.getValueSizeInBits() < RootSizeInBits)
39202 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39203 RootSizeInBits);
39204 // Reresolve - we might have repeated subvector sources.
39205 resolveTargetShuffleInputsAndMask(Ops, Mask);
39208 // We can only combine unary and binary shuffle mask cases.
39209 if (Ops.size() <= 2) {
39210 // Minor canonicalization of the accumulated shuffle mask to make it easier
39211 // to match below. All this does is detect masks with sequential pairs of
39212 // elements, and shrink them to the half-width mask. It does this in a loop
39213 // so it will reduce the size of the mask to the minimal width mask which
39214 // performs an equivalent shuffle.
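// For example, the v8i16 mask <0,1,4,5,2,3,6,7> widens to the v4i32 mask
// <0,2,1,3> and then stops, since those elements no longer form sequential
// pairs.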
39215 while (Mask.size() > 1) {
39216 SmallVector<int, 64> WidenedMask;
39217 if (!canWidenShuffleElements(Mask, WidenedMask))
39218 break;
39219 Mask = std::move(WidenedMask);
39222 // Canonicalization of binary shuffle masks to improve pattern matching by
39223 // commuting the inputs.
39224 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39225 ShuffleVectorSDNode::commuteMask(Mask);
39226 std::swap(Ops[0], Ops[1]);
39229 // Try to combine into a single shuffle instruction.
39230 if (SDValue Shuffle = combineX86ShuffleChain(
39231 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39232 AllowVariablePerLaneMask, DAG, Subtarget))
39233 return Shuffle;
39235 // If all the operands come from the same larger vector, fall through and try
39236 // to use combineX86ShuffleChainWithExtract.
39237 SDValue LHS = peekThroughBitcasts(Ops.front());
39238 SDValue RHS = peekThroughBitcasts(Ops.back());
39239 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39240 (RootSizeInBits / Mask.size()) != 64 ||
39241 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39242 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39243 LHS.getOperand(0) != RHS.getOperand(0))
39244 return SDValue();
39247 // If that failed and any input is extracted, then try to combine as a
39248 // shuffle with the larger type.
39249 return combineX86ShuffleChainWithExtract(
39250 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39251 AllowVariablePerLaneMask, DAG, Subtarget);
39254 /// Helper entry wrapper to combineX86ShufflesRecursively.
39255 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39256 const X86Subtarget &Subtarget) {
39257 return combineX86ShufflesRecursively(
39258 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39259 /*HasVarMask*/ false,
39260 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39261 Subtarget);
39264 /// Get the PSHUF-style mask from a PSHUF node.
39266 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39267 /// PSHUF-style masks that can be reused with such instructions.
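///
/// For example, a PSHUFLW node with immediate 0x1B yields the low-word mask
/// <3,2,1,0> here; the untouched high words are dropped from the result.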
39268 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39269 MVT VT = N.getSimpleValueType();
39270 SmallVector<int, 4> Mask;
39271 SmallVector<SDValue, 2> Ops;
39272 bool HaveMask =
39273 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
39274 (void)HaveMask;
39275 assert(HaveMask);
39277 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
39278 // matter. Check that the upper masks are repeats and remove them.
39279 if (VT.getSizeInBits() > 128) {
39280 int LaneElts = 128 / VT.getScalarSizeInBits();
39281 #ifndef NDEBUG
39282 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39283 for (int j = 0; j < LaneElts; ++j)
39284 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39285 "Mask doesn't repeat in high 128-bit lanes!");
39286 #endif
39287 Mask.resize(LaneElts);
39290 switch (N.getOpcode()) {
39291 case X86ISD::PSHUFD:
39292 return Mask;
39293 case X86ISD::PSHUFLW:
39294 Mask.resize(4);
39295 return Mask;
39296 case X86ISD::PSHUFHW:
39297 Mask.erase(Mask.begin(), Mask.begin() + 4);
39298 for (int &M : Mask)
39299 M -= 4;
39300 return Mask;
39301 default:
39302 llvm_unreachable("No valid shuffle instruction found!");
39306 /// Search for a combinable shuffle across a chain ending in pshufd.
39308 /// We walk up the chain and look for a combinable shuffle, skipping over
39309 /// shuffles that we could hoist this shuffle's transformation past without
39310 /// altering anything.
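///
/// For instance, a PSHUFD that keeps the low dwords in place and only permutes
/// the high dwords can be hoisted past a PSHUFLW, giving it a chance to merge
/// with an earlier dword shuffle.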
39311 static SDValue
39312 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
39313 SelectionDAG &DAG) {
39314 assert(N.getOpcode() == X86ISD::PSHUFD &&
39315 "Called with something other than an x86 128-bit half shuffle!");
39316 SDLoc DL(N);
39318 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39319 // of the shuffles in the chain so that we can form a fresh chain to replace
39320 // this one.
39321 SmallVector<SDValue, 8> Chain;
39322 SDValue V = N.getOperand(0);
39323 for (; V.hasOneUse(); V = V.getOperand(0)) {
39324 switch (V.getOpcode()) {
39325 default:
39326 return SDValue(); // Nothing combined!
39328 case ISD::BITCAST:
39329 // Skip bitcasts as we always know the type for the target specific
39330 // instructions.
39331 continue;
39333 case X86ISD::PSHUFD:
39334 // Found another dword shuffle.
39335 break;
39337 case X86ISD::PSHUFLW:
39338 // Check that the low words (being shuffled) are the identity in the
39339 // dword shuffle, and the high words are self-contained.
39340 if (Mask[0] != 0 || Mask[1] != 1 ||
39341 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39342 return SDValue();
39344 Chain.push_back(V);
39345 continue;
39347 case X86ISD::PSHUFHW:
39348 // Check that the high words (being shuffled) are the identity in the
39349 // dword shuffle, and the low words are self-contained.
39350 if (Mask[2] != 2 || Mask[3] != 3 ||
39351 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39352 return SDValue();
39354 Chain.push_back(V);
39355 continue;
39357 case X86ISD::UNPCKL:
39358 case X86ISD::UNPCKH:
39359 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39360 // shuffle into a preceding word shuffle.
39361 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39362 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39363 return SDValue();
39365 // Search for a half-shuffle which we can combine with.
39366 unsigned CombineOp =
39367 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39368 if (V.getOperand(0) != V.getOperand(1) ||
39369 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39370 return SDValue();
39371 Chain.push_back(V);
39372 V = V.getOperand(0);
39373 do {
39374 switch (V.getOpcode()) {
39375 default:
39376 return SDValue(); // Nothing to combine.
39378 case X86ISD::PSHUFLW:
39379 case X86ISD::PSHUFHW:
39380 if (V.getOpcode() == CombineOp)
39381 break;
39383 Chain.push_back(V);
39385 [[fallthrough]];
39386 case ISD::BITCAST:
39387 V = V.getOperand(0);
39388 continue;
39390 break;
39391 } while (V.hasOneUse());
39392 break;
39394 // Break out of the loop if we break out of the switch.
39395 break;
39398 if (!V.hasOneUse())
39399 // We fell out of the loop without finding a viable combining instruction.
39400 return SDValue();
39402 // Merge this node's mask and our incoming mask.
39403 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39404 for (int &M : Mask)
39405 M = VMask[M];
39406 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39409 // Rebuild the chain around this new shuffle.
39410 while (!Chain.empty()) {
39411 SDValue W = Chain.pop_back_val();
39413 if (V.getValueType() != W.getOperand(0).getValueType())
39414 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39416 switch (W.getOpcode()) {
39417 default:
39418 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39420 case X86ISD::UNPCKL:
39421 case X86ISD::UNPCKH:
39422 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39423 break;
39425 case X86ISD::PSHUFD:
39426 case X86ISD::PSHUFLW:
39427 case X86ISD::PSHUFHW:
39428 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39429 break;
39432 if (V.getValueType() != N.getValueType())
39433 V = DAG.getBitcast(N.getValueType(), V);
39435 // Return the new chain to replace N.
39436 return V;
39439 // Attempt to commute shufps LHS loads:
39440 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
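// Note that commuting the SHUFP swaps the two nibbles of its immediate, and
// the surrounding VPERMILPI/SHUFP immediates are adjusted below so the same
// elements are still selected.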
39441 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39442 SelectionDAG &DAG) {
39443 // TODO: Add vXf64 support.
39444 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39445 return SDValue();
39447 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39448 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39449 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39450 return SDValue();
39451 SDValue N0 = V.getOperand(0);
39452 SDValue N1 = V.getOperand(1);
39453 unsigned Imm = V.getConstantOperandVal(2);
39454 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
39455 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39456 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39457 return SDValue();
39458 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39459 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39460 DAG.getTargetConstant(Imm, DL, MVT::i8));
39463 switch (N.getOpcode()) {
39464 case X86ISD::VPERMILPI:
39465 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39466 unsigned Imm = N.getConstantOperandVal(1);
39467 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39468 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39470 break;
39471 case X86ISD::SHUFP: {
39472 SDValue N0 = N.getOperand(0);
39473 SDValue N1 = N.getOperand(1);
39474 unsigned Imm = N.getConstantOperandVal(2);
39475 if (N0 == N1) {
39476 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39477 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39478 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39479 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39480 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39481 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39482 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39483 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39484 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39486 break;
39490 return SDValue();
39493 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
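// For example, PSHUFD(ADD(X, SplatC)) can become
// ADD(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the splat constant is
// expected to fold away, so the total number of shuffles does not grow.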
39494 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
39495 const SDLoc &DL) {
39496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39497 EVT ShuffleVT = N.getValueType();
39499 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
39500 // AllZeros/AllOnes constants are freely shuffled and will peek through
39501 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
39502 // merge with target shuffles if it has one use so shuffle combining is
39503 // likely to kick in. Shuffles of splats are expected to be removed.
39504 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
39505 ISD::isBuildVectorAllZeros(Op.getNode()) ||
39506 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
39507 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
39508 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
39509 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
39510 (FoldLoad && isShuffleFoldableLoad(Op)) ||
39511 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
39513 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
39514 // Ensure we only shuffle whole vector src elements, unless it's a logical
39515 // binop where we can more aggressively move shuffles from dst to src.
39516 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
39517 BinOp == X86ISD::ANDNP ||
39518 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
39521 unsigned Opc = N.getOpcode();
39522 switch (Opc) {
39523 // Unary and Unary+Permute Shuffles.
39524 case X86ISD::PSHUFB: {
39525 // Don't merge PSHUFB if it contains zeroed elements.
39526 SmallVector<int> Mask;
39527 SmallVector<SDValue> Ops;
39528 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
39529 Mask))
39530 break;
39531 [[fallthrough]];
39533 case X86ISD::VBROADCAST:
39534 case X86ISD::MOVDDUP:
39535 case X86ISD::PSHUFD:
39536 case X86ISD::PSHUFHW:
39537 case X86ISD::PSHUFLW:
39538 case X86ISD::VPERMI:
39539 case X86ISD::VPERMILPI: {
39540 if (N.getOperand(0).getValueType() == ShuffleVT &&
39541 N->isOnlyUserOf(N.getOperand(0).getNode())) {
39542 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39543 unsigned SrcOpcode = N0.getOpcode();
39544 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
39545 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39546 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39547 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
39548 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
39549 SDValue LHS, RHS;
39550 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39551 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39552 if (N.getNumOperands() == 2) {
39553 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
39554 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
39555 } else {
39556 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
39557 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
39559 EVT OpVT = N0.getValueType();
39560 return DAG.getBitcast(ShuffleVT,
39561 DAG.getNode(SrcOpcode, DL, OpVT,
39562 DAG.getBitcast(OpVT, LHS),
39563 DAG.getBitcast(OpVT, RHS)));
39567 break;
39569 // Binary and Binary+Permute Shuffles.
39570 case X86ISD::INSERTPS: {
39571 // Don't merge INSERTPS if it contains zeroed elements.
39572 unsigned InsertPSMask = N.getConstantOperandVal(2);
39573 unsigned ZeroMask = InsertPSMask & 0xF;
39574 if (ZeroMask != 0)
39575 break;
39576 [[fallthrough]];
39578 case X86ISD::MOVSD:
39579 case X86ISD::MOVSS:
39580 case X86ISD::BLENDI:
39581 case X86ISD::SHUFP:
39582 case X86ISD::UNPCKH:
39583 case X86ISD::UNPCKL: {
39584 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
39585 N->isOnlyUserOf(N.getOperand(1).getNode())) {
39586 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39587 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
39588 unsigned SrcOpcode = N0.getOpcode();
39589 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
39590 N0.getValueType() == N1.getValueType() &&
39591 IsSafeToMoveShuffle(N0, SrcOpcode) &&
39592 IsSafeToMoveShuffle(N1, SrcOpcode)) {
39593 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39594 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
39595 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39596 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
39597 // Ensure the total number of shuffles doesn't increase by folding this
39598 // shuffle through to the source ops.
39599 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
39600 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
39601 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
39602 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
39603 SDValue LHS, RHS;
39604 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39605 Op10 = DAG.getBitcast(ShuffleVT, Op10);
39606 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39607 Op11 = DAG.getBitcast(ShuffleVT, Op11);
39608 if (N.getNumOperands() == 3) {
39609 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
39610 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
39611 } else {
39612 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
39613 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
39615 EVT OpVT = N0.getValueType();
39616 return DAG.getBitcast(ShuffleVT,
39617 DAG.getNode(SrcOpcode, DL, OpVT,
39618 DAG.getBitcast(OpVT, LHS),
39619 DAG.getBitcast(OpVT, RHS)));
39623 break;
39626 return SDValue();
39629 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
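/// For example, vperm2f128(movddup(x), movddup(y), imm) can be rewritten as
/// movddup(vperm2f128(x, y, imm)); the same idea applies to per-lane shifts
/// and PSHUFD when both sides use the same immediate.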
39630 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39631 SelectionDAG &DAG,
39632 const SDLoc &DL) {
39633 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39635 MVT VT = V.getSimpleValueType();
39636 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39637 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39638 unsigned SrcOpc0 = Src0.getOpcode();
39639 unsigned SrcOpc1 = Src1.getOpcode();
39640 EVT SrcVT0 = Src0.getValueType();
39641 EVT SrcVT1 = Src1.getValueType();
39643 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39644 return SDValue();
39646 switch (SrcOpc0) {
39647 case X86ISD::MOVDDUP: {
39648 SDValue LHS = Src0.getOperand(0);
39649 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39650 SDValue Res =
39651 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39652 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39653 return DAG.getBitcast(VT, Res);
39655 case X86ISD::VPERMILPI:
39656 // TODO: Handle v4f64 permutes with different low/high lane masks.
39657 if (SrcVT0 == MVT::v4f64) {
39658 uint64_t Mask = Src0.getConstantOperandVal(1);
39659 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39660 break;
39662 [[fallthrough]];
39663 case X86ISD::VSHLI:
39664 case X86ISD::VSRLI:
39665 case X86ISD::VSRAI:
39666 case X86ISD::PSHUFD:
39667 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39668 SDValue LHS = Src0.getOperand(0);
39669 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39670 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39671 V.getOperand(2));
39672 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39673 return DAG.getBitcast(VT, Res);
39675 break;
39678 return SDValue();
39681 /// Try to combine x86 target specific shuffles.
39682 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39683 TargetLowering::DAGCombinerInfo &DCI,
39684 const X86Subtarget &Subtarget) {
39685 SDLoc DL(N);
39686 MVT VT = N.getSimpleValueType();
39687 SmallVector<int, 4> Mask;
39688 unsigned Opcode = N.getOpcode();
39690 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39691 return R;
39693 // Handle specific target shuffles.
39694 switch (Opcode) {
39695 case X86ISD::MOVDDUP: {
39696 SDValue Src = N.getOperand(0);
39697 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39698 if (VT == MVT::v2f64 && Src.hasOneUse() &&
39699 ISD::isNormalLoad(Src.getNode())) {
39700 LoadSDNode *LN = cast<LoadSDNode>(Src);
39701 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39702 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39703 DCI.CombineTo(N.getNode(), Movddup);
39704 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39705 DCI.recursivelyDeleteUnusedNodes(LN);
39706 return N; // Return N so it doesn't get rechecked!
39710 return SDValue();
39712 case X86ISD::VBROADCAST: {
39713 SDValue Src = N.getOperand(0);
39714 SDValue BC = peekThroughBitcasts(Src);
39715 EVT SrcVT = Src.getValueType();
39716 EVT BCVT = BC.getValueType();
39718 // If broadcasting from another shuffle, attempt to simplify it.
39719 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39720 if (isTargetShuffle(BC.getOpcode()) &&
39721 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39722 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
39723 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39724 SM_SentinelUndef);
39725 for (unsigned i = 0; i != Scale; ++i)
39726 DemandedMask[i] = i;
39727 if (SDValue Res = combineX86ShufflesRecursively(
39728 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39729 X86::MaxShuffleCombineDepth,
39730 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39731 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39732 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39733 DAG.getBitcast(SrcVT, Res));
39736 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39737 // 32-bit targets have to bitcast i64 to f64, so it's better to bitcast upward.
39738 if (Src.getOpcode() == ISD::BITCAST &&
39739 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39740 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39741 FixedVectorType::isValidElementType(
39742 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39743 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39744 VT.getVectorNumElements());
39745 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39748 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
39749 // If we're re-broadcasting a smaller type, then broadcast with that type and
39750 // bitcast.
39751 // TODO: Do this for any splat?
39752 if (Src.getOpcode() == ISD::BITCAST &&
39753 (BC.getOpcode() == X86ISD::VBROADCAST ||
39754 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
39755 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
39756 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
39757 MVT NewVT =
39758 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
39759 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
39760 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39763 // Reduce the broadcast source vector to its lowest 128 bits.
39764 if (SrcVT.getSizeInBits() > 128)
39765 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39766 extract128BitVector(Src, 0, DAG, DL));
39768 // broadcast(scalar_to_vector(x)) -> broadcast(x).
39769 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39770 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
39771 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39773 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39774 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39775 isNullConstant(Src.getOperand(1)) &&
39776 Src.getValueType() ==
39777 Src.getOperand(0).getValueType().getScalarType() &&
39778 DAG.getTargetLoweringInfo().isTypeLegal(
39779 Src.getOperand(0).getValueType()))
39780 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39782 // Share broadcast with the longest vector and extract low subvector (free).
39783 // Ensure the same SDValue from the SDNode use is being used.
39784 for (SDNode *User : Src->uses())
39785 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39786 Src == User->getOperand(0) &&
39787 User->getValueSizeInBits(0).getFixedValue() >
39788 VT.getFixedSizeInBits()) {
39789 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39790 VT.getSizeInBits());
39793 // vbroadcast(scalarload X) -> vbroadcast_load X
39794 // For float loads, extract other uses of the scalar from the broadcast.
39795 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39796 ISD::isNormalLoad(Src.getNode())) {
39797 LoadSDNode *LN = cast<LoadSDNode>(Src);
39798 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39799 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39800 SDValue BcastLd =
39801 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39802 LN->getMemoryVT(), LN->getMemOperand());
39803 // If the load value is used only by N, replace it via CombineTo N.
39804 bool NoReplaceExtract = Src.hasOneUse();
39805 DCI.CombineTo(N.getNode(), BcastLd);
39806 if (NoReplaceExtract) {
39807 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39808 DCI.recursivelyDeleteUnusedNodes(LN);
39809 } else {
39810 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39811 DAG.getIntPtrConstant(0, DL));
39812 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39814 return N; // Return N so it doesn't get rechecked!
39817 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39818 // i16. So shrink it ourselves if we can make a broadcast_load.
39819 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39820 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39821 assert(Subtarget.hasAVX2() && "Expected AVX2");
39822 SDValue TruncIn = Src.getOperand(0);
39824 // If this is a truncate of a non-extending load, we can just narrow it to
39825 // use a broadcast_load.
39826 if (ISD::isNormalLoad(TruncIn.getNode())) {
39827 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
39828 // Unless it's volatile or atomic.
39829 if (LN->isSimple()) {
39830 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39831 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39832 SDValue BcastLd = DAG.getMemIntrinsicNode(
39833 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39834 LN->getPointerInfo(), LN->getOriginalAlign(),
39835 LN->getMemOperand()->getFlags());
39836 DCI.CombineTo(N.getNode(), BcastLd);
39837 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39838 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39839 return N; // Return N so it doesn't get rechecked!
39843 // If this is a truncate of an i16 extload, we can directly replace it.
39844 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39845 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39846 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39847 if (LN->getMemoryVT().getSizeInBits() == 16) {
39848 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39849 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39850 SDValue BcastLd =
39851 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39852 LN->getMemoryVT(), LN->getMemOperand());
39853 DCI.CombineTo(N.getNode(), BcastLd);
39854 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39855 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39856 return N; // Return N so it doesn't get rechecked!
39860 // If this is a truncate of a load that has been shifted right, we can
39861 // offset the pointer and use a narrower load.
39862 if (TruncIn.getOpcode() == ISD::SRL &&
39863 TruncIn.getOperand(0).hasOneUse() &&
39864 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39865 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39866 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39867 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39868 // Make sure the shift amount and the load size are divisible by 16.
39869 // Don't do this if the load is volatile or atomic.
39870 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39871 LN->isSimple()) {
39872 unsigned Offset = ShiftAmt / 8;
39873 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39874 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39875 TypeSize::Fixed(Offset), DL);
39876 SDValue Ops[] = { LN->getChain(), Ptr };
39877 SDValue BcastLd = DAG.getMemIntrinsicNode(
39878 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39879 LN->getPointerInfo().getWithOffset(Offset),
39880 LN->getOriginalAlign(),
39881 LN->getMemOperand()->getFlags());
39882 DCI.CombineTo(N.getNode(), BcastLd);
39883 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39884 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39885 return N; // Return N so it doesn't get rechecked!
39890 // vbroadcast(vzload X) -> vbroadcast_load X
39891 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39892 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39893 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39894 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39895 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39896 SDValue BcastLd =
39897 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39898 LN->getMemoryVT(), LN->getMemOperand());
39899 DCI.CombineTo(N.getNode(), BcastLd);
39900 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39901 DCI.recursivelyDeleteUnusedNodes(LN);
39902 return N; // Return N so it doesn't get rechecked!
39906 // vbroadcast(vector load X) -> vbroadcast_load
39907 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39908 SrcVT == MVT::v4i32) &&
39909 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39910 LoadSDNode *LN = cast<LoadSDNode>(Src);
39911 // Unless the load is volatile or atomic.
39912 if (LN->isSimple()) {
39913 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39914 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39915 SDValue BcastLd = DAG.getMemIntrinsicNode(
39916 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39917 LN->getPointerInfo(), LN->getOriginalAlign(),
39918 LN->getMemOperand()->getFlags());
39919 DCI.CombineTo(N.getNode(), BcastLd);
39920 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39921 DCI.recursivelyDeleteUnusedNodes(LN);
39922 return N; // Return N so it doesn't get rechecked!
39926 return SDValue();
39928 case X86ISD::VZEXT_MOVL: {
39929 SDValue N0 = N.getOperand(0);
39931 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
39932 // the load is volatile.
39933 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39934 auto *LN = cast<LoadSDNode>(N0);
39935 if (SDValue VZLoad =
39936 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39937 DCI.CombineTo(N.getNode(), VZLoad);
39938 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39939 DCI.recursivelyDeleteUnusedNodes(LN);
39940 return N;
39944 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
39945 // and can just use a VZEXT_LOAD.
39946 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39947 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39948 auto *LN = cast<MemSDNode>(N0);
39949 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39950 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39951 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39952 SDValue VZLoad =
39953 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39954 LN->getMemoryVT(), LN->getMemOperand());
39955 DCI.CombineTo(N.getNode(), VZLoad);
39956 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39957 DCI.recursivelyDeleteUnusedNodes(LN);
39958 return N;
39962 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39963 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39964 // if the upper bits of the i64 are zero.
39965 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39966 N0.getOperand(0).hasOneUse() &&
39967 N0.getOperand(0).getValueType() == MVT::i64) {
39968 SDValue In = N0.getOperand(0);
39969 APInt Mask = APInt::getHighBitsSet(64, 32);
39970 if (DAG.MaskedValueIsZero(In, Mask)) {
39971 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39972 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39973 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39974 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39975 return DAG.getBitcast(VT, Movl);
39979 // Load a scalar integer constant directly to XMM instead of transferring an
39980 // immediate value from a GPR.
39981 // vzext_movl (scalar_to_vector C) --> load [C,0...]
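// For example, (v4i32 (vzext_movl (scalar_to_vector (i32 42)))) becomes a
// constant-pool load of <42, 0, 0, 0>.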
39982 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39983 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39984 // Create a vector constant - scalar constant followed by zeros.
39985 EVT ScalarVT = N0.getOperand(0).getValueType();
39986 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39987 unsigned NumElts = VT.getVectorNumElements();
39988 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39989 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39990 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39992 // Load the vector constant from constant pool.
39993 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39994 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39995 MachinePointerInfo MPI =
39996 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39997 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39998 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39999 MachineMemOperand::MOLoad);
40003 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40004 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40005 // scalar_to_vectors, where the 256/512-bit forms are canonicalized to an
40006 // insert and a 128-bit scalar_to_vector. This reduces the number of isel patterns.
40007 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40008 SDValue V = peekThroughOneUseBitcasts(N0);
40010 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40011 isNullConstant(V.getOperand(2))) {
40012 SDValue In = V.getOperand(1);
40013 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40014 In.getValueSizeInBits() /
40015 VT.getScalarSizeInBits());
40016 In = DAG.getBitcast(SubVT, In);
40017 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40018 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40019 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40020 V.getOperand(2));
40024 return SDValue();
40026 case X86ISD::BLENDI: {
40027 SDValue N0 = N.getOperand(0);
40028 SDValue N1 = N.getOperand(1);
40030 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40031 // TODO: Handle MVT::v16i16 repeated blend mask.
40032 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40033 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40034 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40035 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40036 SrcVT.getScalarSizeInBits() >= 32) {
40037 unsigned BlendMask = N.getConstantOperandVal(2);
40038 unsigned Size = VT.getVectorNumElements();
40039 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
40040 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
40041 return DAG.getBitcast(
40042 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40043 N1.getOperand(0),
40044 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
40047 return SDValue();
40049 case X86ISD::SHUFP: {
40050 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40051 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40052 // TODO: Support types other than v4f32.
40053 if (VT == MVT::v4f32) {
40054 bool Updated = false;
40055 SmallVector<int> Mask;
40056 SmallVector<SDValue> Ops;
40057 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
40058 Ops.size() == 2) {
40059 for (int i = 0; i != 2; ++i) {
40060 SmallVector<SDValue> SubOps;
40061 SmallVector<int> SubMask, SubScaledMask;
40062 SDValue Sub = peekThroughBitcasts(Ops[i]);
40063 // TODO: Scaling might be easier if we specify the demanded elts.
40064 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40065 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40066 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40067 int Ofs = i * 2;
40068 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40069 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40070 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40071 Updated = true;
40075 if (Updated) {
40076 for (int &M : Mask)
40077 M %= 4;
40078 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40079 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40082 return SDValue();
40084 case X86ISD::VPERMI: {
40085 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40086 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40087 SDValue N0 = N.getOperand(0);
40088 SDValue N1 = N.getOperand(1);
40089 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40090 if (N0.getOpcode() == ISD::BITCAST &&
40091 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40092 SDValue Src = N0.getOperand(0);
40093 EVT SrcVT = Src.getValueType();
40094 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40095 return DAG.getBitcast(VT, Res);
40097 return SDValue();
40099 case X86ISD::VPERM2X128: {
40100 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40101 SDValue LHS = N->getOperand(0);
40102 SDValue RHS = N->getOperand(1);
40103 if (LHS.getOpcode() == ISD::BITCAST &&
40104 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40105 EVT SrcVT = LHS.getOperand(0).getValueType();
40106 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40107 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40108 DAG.getBitcast(SrcVT, LHS),
40109 DAG.getBitcast(SrcVT, RHS),
40110 N->getOperand(2)));
40114 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40115 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40116 return Res;
40118 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40119 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
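// For example, with the immediate decoded as below (the low nibble picks the
// low result half, the high nibble the high half; values 0-1 address the two
// halves of the first source and 2-3 those of the second):
// vperm2x128(concat(A,B),concat(C,D),0x31) picks half 1 (B) and half 3 (D),
// so it folds to concat(B,D).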
40120 auto FindSubVector128 = [&](unsigned Idx) {
40121 if (Idx > 3)
40122 return SDValue();
40123 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40124 SmallVector<SDValue> SubOps;
40125 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40126 return SubOps[Idx & 1];
40127 unsigned NumElts = Src.getValueType().getVectorNumElements();
40128 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40129 Src.getOperand(1).getValueSizeInBits() == 128 &&
40130 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40131 return Src.getOperand(1);
40133 return SDValue();
40135 unsigned Imm = N.getConstantOperandVal(2);
40136 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40137 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40138 MVT SubVT = VT.getHalfNumVectorElementsVT();
40139 SubLo = DAG.getBitcast(SubVT, SubLo);
40140 SubHi = DAG.getBitcast(SubVT, SubHi);
40141 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40144 return SDValue();
40146 case X86ISD::PSHUFD:
40147 case X86ISD::PSHUFLW:
40148 case X86ISD::PSHUFHW: {
40149 SDValue N0 = N.getOperand(0);
40150 SDValue N1 = N.getOperand(1);
40151 if (N0->hasOneUse()) {
40152 SDValue V = peekThroughOneUseBitcasts(N0);
40153 switch (V.getOpcode()) {
40154 case X86ISD::VSHL:
40155 case X86ISD::VSRL:
40156 case X86ISD::VSRA:
40157 case X86ISD::VSHLI:
40158 case X86ISD::VSRLI:
40159 case X86ISD::VSRAI:
40160 case X86ISD::VROTLI:
40161 case X86ISD::VROTRI: {
40162 MVT InnerVT = V.getSimpleValueType();
40163 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40164 SDValue Res = DAG.getNode(Opcode, DL, VT,
40165 DAG.getBitcast(VT, V.getOperand(0)), N1);
40166 Res = DAG.getBitcast(InnerVT, Res);
40167 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40168 return DAG.getBitcast(VT, Res);
40170 break;
40175 Mask = getPSHUFShuffleMask(N);
40176 assert(Mask.size() == 4);
40177 break;
40179 case X86ISD::MOVSD:
40180 case X86ISD::MOVSH:
40181 case X86ISD::MOVSS: {
40182 SDValue N0 = N.getOperand(0);
40183 SDValue N1 = N.getOperand(1);
40185 // Canonicalize scalar FPOps:
40186 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40187 // If commutable, allow OP(N1[0], N0[0]).
40188 unsigned Opcode1 = N1.getOpcode();
40189 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40190 Opcode1 == ISD::FDIV) {
40191 SDValue N10 = N1.getOperand(0);
40192 SDValue N11 = N1.getOperand(1);
40193 if (N10 == N0 ||
40194 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40195 if (N10 != N0)
40196 std::swap(N10, N11);
40197 MVT SVT = VT.getVectorElementType();
40198 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40199 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40200 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40201 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40202 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40203 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40207 return SDValue();
40209 case X86ISD::INSERTPS: {
40210 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40211 SDValue Op0 = N.getOperand(0);
40212 SDValue Op1 = N.getOperand(1);
40213 unsigned InsertPSMask = N.getConstantOperandVal(2);
40214 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40215 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40216 unsigned ZeroMask = InsertPSMask & 0xF;
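// For reference, the insertps immediate packs: bits [7:6] = source lane,
// bits [5:4] = destination lane, bits [3:0] = zero mask. E.g. 0xD8
// (0b11011000) copies Op1[3] into lane 1 of the result and zeroes lane 3.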
40218 // If we zero out all elements from Op0 then we don't need to reference it.
40219 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40220 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40221 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40223 // If we zero out the element from Op1 then we don't need to reference it.
40224 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40225 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40226 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40228 // Attempt to merge insertps Op1 with an inner target shuffle node.
40229 SmallVector<int, 8> TargetMask1;
40230 SmallVector<SDValue, 2> Ops1;
40231 APInt KnownUndef1, KnownZero1;
40232 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40233 KnownZero1)) {
40234 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40235 // Zero/UNDEF insertion - zero out element and remove dependency.
40236 InsertPSMask |= (1u << DstIdx);
40237 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40238 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40240 // Update insertps mask srcidx and reference the source input directly.
40241 int M = TargetMask1[SrcIdx];
40242 assert(0 <= M && M < 8 && "Shuffle index out of range");
40243 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40244 Op1 = Ops1[M < 4 ? 0 : 1];
40245 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40246 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40249 // Attempt to merge insertps Op0 with an inner target shuffle node.
40250 SmallVector<int, 8> TargetMask0;
40251 SmallVector<SDValue, 2> Ops0;
40252 APInt KnownUndef0, KnownZero0;
40253 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
40254 KnownZero0)) {
40255 bool Updated = false;
40256 bool UseInput00 = false;
40257 bool UseInput01 = false;
40258 for (int i = 0; i != 4; ++i) {
40259 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
40260 // No change if element is already zero or the inserted element.
40261 continue;
40264 if (KnownUndef0[i] || KnownZero0[i]) {
40265 // If the target mask is undef/zero then we must zero the element.
40266 InsertPSMask |= (1u << i);
40267 Updated = true;
40268 continue;
40271 // The input vector element must stay in place (lane i of either input).
40272 int M = TargetMask0[i];
40273 if (M != i && M != (i + 4))
40274 return SDValue();
40276 // Determine which inputs of the target shuffle we're using.
40277 UseInput00 |= (0 <= M && M < 4);
40278 UseInput01 |= (4 <= M);
40281 // If we're not using both inputs of the target shuffle then use the
40282 // referenced input directly.
40283 if (UseInput00 && !UseInput01) {
40284 Updated = true;
40285 Op0 = Ops0[0];
40286 } else if (!UseInput00 && UseInput01) {
40287 Updated = true;
40288 Op0 = Ops0[1];
40291 if (Updated)
40292 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40293 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40296 // If we're inserting an element from a vbroadcast load, fold the
40297 // load into the X86insertps instruction. We need to convert the scalar
40298 // load to a vector and clear the source lane of the INSERTPS control.
40299 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
40300 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
40301 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
40302 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
40303 MemIntr->getBasePtr(),
40304 MemIntr->getMemOperand());
40305 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
40306 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
40307 Load),
40308 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
40309 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40310 return Insert;
40314 return SDValue();
40316 default:
40317 return SDValue();
40320 // Nuke no-op shuffles that show up after combining.
40321 if (isNoopShuffleMask(Mask))
40322 return N.getOperand(0);
40324 // Look for simplifications involving one or two shuffle instructions.
40325 SDValue V = N.getOperand(0);
40326 switch (N.getOpcode()) {
40327 default:
40328 break;
40329 case X86ISD::PSHUFLW:
40330 case X86ISD::PSHUFHW:
40331 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
40333 // See if this reduces to a PSHUFD which is no more expensive and can
40334 // combine with more operations. Note that it has to at least flip the
40335 // dwords as otherwise it would have been removed as a no-op.
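// For example, pshuflw <2,3,0,1> swaps words 0/1 with words 2/3 as whole
// pairs, which is exactly pshufd <1,0,2,3> on the value bitcast to v4i32.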
40336 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
40337 int DMask[] = {0, 1, 2, 3};
40338 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
40339 DMask[DOffset + 0] = DOffset + 1;
40340 DMask[DOffset + 1] = DOffset + 0;
40341 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
40342 V = DAG.getBitcast(DVT, V);
40343 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
40344 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
40345 return DAG.getBitcast(VT, V);
40348 // Look for shuffle patterns which can be implemented as a single unpack.
40349 // FIXME: This doesn't handle the location of the PSHUFD generically, and
40350 // only works when we have a PSHUFD followed by two half-shuffles.
40351 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
40352 (V.getOpcode() == X86ISD::PSHUFLW ||
40353 V.getOpcode() == X86ISD::PSHUFHW) &&
40354 V.getOpcode() != N.getOpcode() &&
40355 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
40356 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
40357 if (D.getOpcode() == X86ISD::PSHUFD) {
40358 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40359 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
40360 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40361 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40362 int WordMask[8];
40363 for (int i = 0; i < 4; ++i) {
40364 WordMask[i + NOffset] = Mask[i] + NOffset;
40365 WordMask[i + VOffset] = VMask[i] + VOffset;
40367 // Map the word mask through the DWord mask.
40368 int MappedMask[8];
40369 for (int i = 0; i < 8; ++i)
40370 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
40371 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
40372 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
40373 // We can replace all three shuffles with an unpack.
40374 V = DAG.getBitcast(VT, D.getOperand(0));
40375 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
40376 : X86ISD::UNPCKH,
40377 DL, VT, V, V);
40382 break;
40384 case X86ISD::PSHUFD:
40385 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
40386 return NewN;
40388 break;
40391 return SDValue();
40394 /// Checks if the shuffle mask takes subsequent elements
40395 /// alternately from two vectors.
40396 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
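/// E.g. for <0, 5, 2, 7> the even lanes all come from operand 0 and the odd
/// lanes all come from operand 1, so \p Op0Even is set to true.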
40397 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
40399 int ParitySrc[2] = {-1, -1};
40400 unsigned Size = Mask.size();
40401 for (unsigned i = 0; i != Size; ++i) {
40402 int M = Mask[i];
40403 if (M < 0)
40404 continue;
40406 // Make sure we are using the matching element from the input.
40407 if ((M % Size) != i)
40408 return false;
40410 // Make sure we use the same input for all elements of the same parity.
40411 int Src = M / Size;
40412 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40413 return false;
40414 ParitySrc[i % 2] = Src;
40417 // Make sure each input is used.
40418 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40419 return false;
40421 Op0Even = ParitySrc[0] == 0;
40422 return true;
40425 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
40426 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
40427 /// are written to the parameters \p Opnd0 and \p Opnd1.
40429 /// We combine shuffles into ADDSUB(SUBADD) directly on the abstract vector
40430 /// shuffle nodes so that they are easier to match generically. We also insert
40431 /// dummy vector shuffle nodes for the operands which explicitly discard the
40432 /// lanes unused by this operation, so that the fact that they are unused can
40433 /// flow through the rest of the combiner.
40434 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
40435 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
40436 bool &IsSubAdd) {
40438 EVT VT = N->getValueType(0);
40439 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40440 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
40441 !VT.getSimpleVT().isFloatingPoint())
40442 return false;
40444 // We only handle target-independent shuffles.
40445 // FIXME: It would be easy and harmless to use the target shuffle mask
40446 // extraction tool to support more.
40447 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40448 return false;
40450 SDValue V1 = N->getOperand(0);
40451 SDValue V2 = N->getOperand(1);
40453 // Make sure we have an FADD and an FSUB.
40454 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
40455 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
40456 V1.getOpcode() == V2.getOpcode())
40457 return false;
40459 // If there are other uses of these operations we can't fold them.
40460 if (!V1->hasOneUse() || !V2->hasOneUse())
40461 return false;
40463 // Ensure that both operations have the same operands. Note that we can
40464 // commute the FADD operands.
40465 SDValue LHS, RHS;
40466 if (V1.getOpcode() == ISD::FSUB) {
40467 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
40468 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
40469 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
40470 return false;
40471 } else {
40472 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
40473 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
40474 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
40475 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
40476 return false;
40479 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40480 bool Op0Even;
40481 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40482 return false;
40484 // It's a subadd if the vector in the even parity is an FADD.
40485 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
40486 : V2->getOpcode() == ISD::FADD;
40488 Opnd0 = LHS;
40489 Opnd1 = RHS;
40490 return true;
40493 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
40494 static SDValue combineShuffleToFMAddSub(SDNode *N,
40495 const X86Subtarget &Subtarget,
40496 SelectionDAG &DAG) {
40497 // We only handle target-independent shuffles.
40498 // FIXME: It would be easy and harmless to use the target shuffle mask
40499 // extraction tool to support more.
40500 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40501 return SDValue();
40503 MVT VT = N->getSimpleValueType(0);
40504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40505 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
40506 return SDValue();
40508 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
40509 SDValue Op0 = N->getOperand(0);
40510 SDValue Op1 = N->getOperand(1);
40511 SDValue FMAdd = Op0, FMSub = Op1;
40512 if (FMSub.getOpcode() != X86ISD::FMSUB)
40513 std::swap(FMAdd, FMSub);
40515 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
40516 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
40517 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
40518 FMAdd.getOperand(2) != FMSub.getOperand(2))
40519 return SDValue();
40521 // Check for correct shuffle mask.
40522 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40523 bool Op0Even;
40524 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40525 return SDValue();
40527 // FMAddSub takes its zeroth operand from the FMSub node.
40528 SDLoc DL(N);
40529 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
40530 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40531 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
40532 FMAdd.getOperand(2));
40535 /// Try to combine a shuffle into a target-specific add-sub or
40536 /// mul-add-sub node.
40537 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
40538 const X86Subtarget &Subtarget,
40539 SelectionDAG &DAG) {
40540 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
40541 return V;
40543 SDValue Opnd0, Opnd1;
40544 bool IsSubAdd;
40545 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
40546 return SDValue();
40548 MVT VT = N->getSimpleValueType(0);
40549 SDLoc DL(N);
40551 // Try to generate X86ISD::FMADDSUB node here.
40552 SDValue Opnd2;
40553 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
40554 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40555 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
40558 if (IsSubAdd)
40559 return SDValue();
40561 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
40562 // the ADDSUB idiom has been successfully recognized. There are no known
40563 // X86 targets with 512-bit ADDSUB instructions!
40564 if (VT.is512BitVector())
40565 return SDValue();
40567 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
40568 // the ADDSUB idiom has been successfully recognized. There are no known
40569 // X86 targets with FP16 ADDSUB instructions!
40570 if (VT.getVectorElementType() == MVT::f16)
40571 return SDValue();
40573 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
40576 // We are looking for a shuffle where both sources are concatenated with undef
40577 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
40578 // if we can express this as a single-source shuffle, that's preferable.
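// For example, for v8i32 sources concat(t1, undef) and concat(t2, undef), a
// mask element referring to lane 8 (t2's lane 0) is rewritten as lane 4 of
// the combined concat(t1, t2) built below.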
40579 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
40580 const X86Subtarget &Subtarget) {
40581 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
40582 return SDValue();
40584 EVT VT = N->getValueType(0);
40586 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
40587 if (!VT.is128BitVector() && !VT.is256BitVector())
40588 return SDValue();
40590 if (VT.getVectorElementType() != MVT::i32 &&
40591 VT.getVectorElementType() != MVT::i64 &&
40592 VT.getVectorElementType() != MVT::f32 &&
40593 VT.getVectorElementType() != MVT::f64)
40594 return SDValue();
40596 SDValue N0 = N->getOperand(0);
40597 SDValue N1 = N->getOperand(1);
40599 // Check that both sources are concats with undef.
40600 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
40601 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
40602 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
40603 !N1.getOperand(1).isUndef())
40604 return SDValue();
40606 // Construct the new shuffle mask. Elements from the first source retain their
40607 // index, but elements from the second source no longer need to skip an undef.
40608 SmallVector<int, 8> Mask;
40609 int NumElts = VT.getVectorNumElements();
40611 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
40612 for (int Elt : SVOp->getMask())
40613 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
40615 SDLoc DL(N);
40616 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
40617 N1.getOperand(0));
40618 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
40621 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
40622 /// low half of each source vector and does not set any high half elements in
40623 /// the destination vector, narrow the shuffle to half its original size.
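/// For example, a v8f64 shuffle with mask <0, 9, 1, 8, u, u, u, u> reads only
/// the low v4f64 half of each input and leaves the upper half of the result
/// undef, so it can (roughly) be rewritten as a v4f64 shuffle <0, 5, 1, 4>
/// whose result is widened for free with a subregister insert.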
40624 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
40625 EVT VT = Shuf->getValueType(0);
40626 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
40627 return SDValue();
40628 if (!VT.is256BitVector() && !VT.is512BitVector())
40629 return SDValue();
40631 // See if we can ignore all of the high elements of the shuffle.
40632 ArrayRef<int> Mask = Shuf->getMask();
40633 if (!isUndefUpperHalf(Mask))
40634 return SDValue();
40636 // Check if the shuffle mask accesses only the low half of each input vector
40637 // (half-index output is 0 or 2).
40638 int HalfIdx1, HalfIdx2;
40639 SmallVector<int, 8> HalfMask(Mask.size() / 2);
40640 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
40641 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
40642 return SDValue();
40644 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
40645 // The trick is knowing that all of the insert/extract are actually free
40646 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
40647 // of narrow inputs into a narrow output, and that is always cheaper than
40648 // the wide shuffle that we started with.
40649 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
40650 Shuf->getOperand(1), HalfMask, HalfIdx1,
40651 HalfIdx2, false, DAG, /*UseConcat*/ true);
40654 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
40655 TargetLowering::DAGCombinerInfo &DCI,
40656 const X86Subtarget &Subtarget) {
40657 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
40658 if (SDValue V = narrowShuffle(Shuf, DAG))
40659 return V;
40661 // If we have legalized the vector types, look for blends of FADD and FSUB
40662 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
40663 SDLoc dl(N);
40664 EVT VT = N->getValueType(0);
40665 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40666 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
40667 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40668 return AddSub;
40670 // Attempt to combine into a vector load/broadcast.
40671 if (SDValue LD = combineToConsecutiveLoads(
40672 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40673 return LD;
40675 // For AVX2, we sometimes want to combine
40676 // (vector_shuffle <mask> (concat_vectors t1, undef)
40677 // (concat_vectors t2, undef))
40678 // Into:
40679 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
40680 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
40681 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40682 return ShufConcat;
40684 if (isTargetShuffle(N->getOpcode())) {
40685 SDValue Op(N, 0);
40686 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40687 return Shuffle;
40689 // Try recursively combining arbitrary sequences of x86 shuffle
40690 // instructions into higher-order shuffles. We do this after combining
40691 // specific PSHUF instruction sequences into their minimal form so that we
40692 // can evaluate how many specialized shuffle instructions are involved in
40693 // a particular chain.
40694 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40695 return Res;
40697 // Simplify source operands based on shuffle mask.
40698 // TODO - merge this into combineX86ShufflesRecursively.
40699 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40700 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
40701 return SDValue(N, 0);
40703 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40704 // Perform this after other shuffle combines to allow inner shuffles to be
40705 // combined away first.
40706 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
40707 return BinOp;
40710 return SDValue();
40713 // Simplify variable target shuffle masks based on the demanded elements.
40714 // TODO: Handle DemandedBits in mask indices as well?
40715 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40716 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40717 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
40718 // If we're demanding all elements, don't bother trying to simplify the mask.
40719 unsigned NumElts = DemandedElts.getBitWidth();
40720 if (DemandedElts.isAllOnes())
40721 return false;
40723 SDValue Mask = Op.getOperand(MaskIndex);
40724 if (!Mask.hasOneUse())
40725 return false;
40727 // Attempt to generically simplify the variable shuffle mask.
40728 APInt MaskUndef, MaskZero;
40729 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40730 Depth + 1))
40731 return true;
40733 // Attempt to extract+simplify a (constant pool load) shuffle mask.
40734 // TODO: Support other types from getTargetShuffleMaskIndices?
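// For example, if only the low half of a PSHUFB result is demanded and its
// control vector is a constant-pool load, the undemanded upper half of that
// constant can be rewritten as undef and a load of the new constant used
// instead, as done below.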
40735 SDValue BC = peekThroughOneUseBitcasts(Mask);
40736 EVT BCVT = BC.getValueType();
40737 auto *Load = dyn_cast<LoadSDNode>(BC);
40738 if (!Load)
40739 return false;
40741 const Constant *C = getTargetConstantFromNode(Load);
40742 if (!C)
40743 return false;
40745 Type *CTy = C->getType();
40746 if (!CTy->isVectorTy() ||
40747 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40748 return false;
40750 // Handle scaling for i64 elements on 32-bit targets.
40751 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40752 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40753 return false;
40754 unsigned Scale = NumCstElts / NumElts;
40756 // Simplify mask if we have an undemanded element that is not undef.
40757 bool Simplified = false;
40758 SmallVector<Constant *, 32> ConstVecOps;
40759 for (unsigned i = 0; i != NumCstElts; ++i) {
40760 Constant *Elt = C->getAggregateElement(i);
40761 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40762 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40763 Simplified = true;
40764 continue;
40766 ConstVecOps.push_back(Elt);
40768 if (!Simplified)
40769 return false;
40771 // Generate new constant pool entry + legalize immediately for the load.
40772 SDLoc DL(Op);
40773 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40774 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40775 SDValue NewMask = TLO.DAG.getLoad(
40776 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40777 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40778 Load->getAlign());
40779 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40782 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40783 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40784 TargetLoweringOpt &TLO, unsigned Depth) const {
40785 int NumElts = DemandedElts.getBitWidth();
40786 unsigned Opc = Op.getOpcode();
40787 EVT VT = Op.getValueType();
40789 // Handle special case opcodes.
40790 switch (Opc) {
40791 case X86ISD::PMULDQ:
40792 case X86ISD::PMULUDQ: {
40793 APInt LHSUndef, LHSZero;
40794 APInt RHSUndef, RHSZero;
40795 SDValue LHS = Op.getOperand(0);
40796 SDValue RHS = Op.getOperand(1);
40797 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40798 Depth + 1))
40799 return true;
40800 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40801 Depth + 1))
40802 return true;
40803 // Multiply by zero.
40804 KnownZero = LHSZero | RHSZero;
40805 break;
40807 case X86ISD::VPMADDWD: {
40808 APInt LHSUndef, LHSZero;
40809 APInt RHSUndef, RHSZero;
40810 SDValue LHS = Op.getOperand(0);
40811 SDValue RHS = Op.getOperand(1);
40812 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40814 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40815 Depth + 1))
40816 return true;
40817 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40818 Depth + 1))
40819 return true;
40821 // TODO: Multiply by zero.
40823 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
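// (Each i32 result lane is LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1], so a
// known-zero lane on one side makes the matching lane of the other side
// irrelevant.)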
40824 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40825 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40826 Depth + 1))
40827 return true;
40828 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40829 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40830 Depth + 1))
40831 return true;
40832 break;
40834 case X86ISD::PSADBW: {
40835 SDValue LHS = Op.getOperand(0);
40836 SDValue RHS = Op.getOperand(1);
40837 assert(VT.getScalarType() == MVT::i64 &&
40838 LHS.getValueType() == RHS.getValueType() &&
40839 LHS.getValueType().getScalarType() == MVT::i8 &&
40840 "Unexpected PSADBW types");
40842 // Aggressively peek through ops to get at the demanded elts.
40843 if (!DemandedElts.isAllOnes()) {
40844 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40845 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40846 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40847 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40848 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40849 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40850 if (NewLHS || NewRHS) {
40851 NewLHS = NewLHS ? NewLHS : LHS;
40852 NewRHS = NewRHS ? NewRHS : RHS;
40853 return TLO.CombineTo(
40854 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40857 break;
40859 case X86ISD::VSHL:
40860 case X86ISD::VSRL:
40861 case X86ISD::VSRA: {
40862 // We only need the bottom 64-bits of the (128-bit) shift amount.
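// (The SSE/AVX per-vector shifts read the count as a single scalar held in
// the low 64 bits of the 128-bit operand, so the upper elements never
// matter.)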
40863 SDValue Amt = Op.getOperand(1);
40864 MVT AmtVT = Amt.getSimpleValueType();
40865 assert(AmtVT.is128BitVector() && "Unexpected value type");
40867 // If the shift amount is only ever reused as an SSE shift amount then we know
40868 // that only the bottom 64-bits are ever used.
40869 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40870 unsigned UseOpc = Use->getOpcode();
40871 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40872 UseOpc == X86ISD::VSRA) &&
40873 Use->getOperand(0) != Amt;
40876 APInt AmtUndef, AmtZero;
40877 unsigned NumAmtElts = AmtVT.getVectorNumElements();
40878 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40879 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40880 Depth + 1, AssumeSingleUse))
40881 return true;
40882 [[fallthrough]];
40884 case X86ISD::VSHLI:
40885 case X86ISD::VSRLI:
40886 case X86ISD::VSRAI: {
40887 SDValue Src = Op.getOperand(0);
40888 APInt SrcUndef;
40889 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40890 Depth + 1))
40891 return true;
40893 // Fold shift(0,x) -> 0
40894 if (DemandedElts.isSubsetOf(KnownZero))
40895 return TLO.CombineTo(
40896 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
40898 // Aggressively peek through ops to get at the demanded elts.
40899 if (!DemandedElts.isAllOnes())
40900 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40901 Src, DemandedElts, TLO.DAG, Depth + 1))
40902 return TLO.CombineTo(
40903 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40904 break;
40906 case X86ISD::VPSHA:
40907 case X86ISD::VPSHL:
40908 case X86ISD::VSHLV:
40909 case X86ISD::VSRLV:
40910 case X86ISD::VSRAV: {
40911 APInt LHSUndef, LHSZero;
40912 APInt RHSUndef, RHSZero;
40913 SDValue LHS = Op.getOperand(0);
40914 SDValue RHS = Op.getOperand(1);
40915 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40916 Depth + 1))
40917 return true;
40919 // Fold shift(0,x) -> 0
40920 if (DemandedElts.isSubsetOf(LHSZero))
40921 return TLO.CombineTo(
40922 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
40924 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40925 Depth + 1))
40926 return true;
40928 KnownZero = LHSZero;
40929 break;
40931 case X86ISD::KSHIFTL: {
40932 SDValue Src = Op.getOperand(0);
40933 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40934 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40935 unsigned ShiftAmt = Amt->getZExtValue();
40937 if (ShiftAmt == 0)
40938 return TLO.CombineTo(Op, Src);
40940 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40941 // single shift. We can do this if the bottom bits (which are shifted
40942 // out) are never demanded.
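// For example, kshiftl(kshiftr(X, 4), 6) becomes kshiftl(X, 2) when none of
// the low 6 result elements are demanded (Diff = 6 - 4 = 2 below).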
40943 if (Src.getOpcode() == X86ISD::KSHIFTR) {
40944 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40945 unsigned C1 = Src.getConstantOperandVal(1);
40946 unsigned NewOpc = X86ISD::KSHIFTL;
40947 int Diff = ShiftAmt - C1;
40948 if (Diff < 0) {
40949 Diff = -Diff;
40950 NewOpc = X86ISD::KSHIFTR;
40953 SDLoc dl(Op);
40954 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40955 return TLO.CombineTo(
40956 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40960 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40961 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40962 Depth + 1))
40963 return true;
40965 KnownUndef <<= ShiftAmt;
40966 KnownZero <<= ShiftAmt;
40967 KnownZero.setLowBits(ShiftAmt);
40968 break;
40970 case X86ISD::KSHIFTR: {
40971 SDValue Src = Op.getOperand(0);
40972 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40973 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40974 unsigned ShiftAmt = Amt->getZExtValue();
40976 if (ShiftAmt == 0)
40977 return TLO.CombineTo(Op, Src);
40979 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40980 // single shift. We can do this if the top bits (which are shifted
40981 // out) are never demanded.
40982 if (Src.getOpcode() == X86ISD::KSHIFTL) {
40983 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40984 unsigned C1 = Src.getConstantOperandVal(1);
40985 unsigned NewOpc = X86ISD::KSHIFTR;
40986 int Diff = ShiftAmt - C1;
40987 if (Diff < 0) {
40988 Diff = -Diff;
40989 NewOpc = X86ISD::KSHIFTL;
40992 SDLoc dl(Op);
40993 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40994 return TLO.CombineTo(
40995 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40999 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41000 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41001 Depth + 1))
41002 return true;
41004 KnownUndef.lshrInPlace(ShiftAmt);
41005 KnownZero.lshrInPlace(ShiftAmt);
41006 KnownZero.setHighBits(ShiftAmt);
41007 break;
41009 case X86ISD::ANDNP: {
41010 // ANDNP = (~LHS & RHS);
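// If one operand is constant we can tighten what we demand from the other:
// a known-zero RHS lane forces that result lane to zero whatever LHS holds,
// and an all-ones LHS lane does the same, so such lanes (and their bits)
// need not be demanded from the opposite operand. The helper below computes
// these per-operand demanded bits/elements.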
41011 SDValue LHS = Op.getOperand(0);
41012 SDValue RHS = Op.getOperand(1);
41014 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41015 APInt UndefElts;
41016 SmallVector<APInt> EltBits;
41017 int NumElts = VT.getVectorNumElements();
41018 int EltSizeInBits = VT.getScalarSizeInBits();
41019 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41020 APInt OpElts = DemandedElts;
41021 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41022 EltBits)) {
41023 OpBits.clearAllBits();
41024 OpElts.clearAllBits();
41025 for (int I = 0; I != NumElts; ++I) {
41026 if (!DemandedElts[I])
41027 continue;
41028 if (UndefElts[I]) {
41029 // We can't assume an undef src element gives an undef dst - the
41030 // other src might be zero.
41031 OpBits.setAllBits();
41032 OpElts.setBit(I);
41033 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41034 (!Invert && !EltBits[I].isZero())) {
41035 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41036 OpElts.setBit(I);
41040 return std::make_pair(OpBits, OpElts);
41042 APInt BitsLHS, EltsLHS;
41043 APInt BitsRHS, EltsRHS;
41044 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41045 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41047 APInt LHSUndef, LHSZero;
41048 APInt RHSUndef, RHSZero;
41049 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41050 Depth + 1))
41051 return true;
41052 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41053 Depth + 1))
41054 return true;
41056 if (!DemandedElts.isAllOnes()) {
41057 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41058 TLO.DAG, Depth + 1);
41059 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41060 TLO.DAG, Depth + 1);
41061 if (NewLHS || NewRHS) {
41062 NewLHS = NewLHS ? NewLHS : LHS;
41063 NewRHS = NewRHS ? NewRHS : RHS;
41064 return TLO.CombineTo(
41065 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41068 break;
41070 case X86ISD::CVTSI2P:
41071 case X86ISD::CVTUI2P: {
41072 SDValue Src = Op.getOperand(0);
41073 MVT SrcVT = Src.getSimpleValueType();
41074 APInt SrcUndef, SrcZero;
41075 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41076 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41077 Depth + 1))
41078 return true;
41079 break;
41081 case X86ISD::PACKSS:
41082 case X86ISD::PACKUS: {
41083 SDValue N0 = Op.getOperand(0);
41084 SDValue N1 = Op.getOperand(1);
41086 APInt DemandedLHS, DemandedRHS;
41087 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41089 APInt LHSUndef, LHSZero;
41090 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41091 Depth + 1))
41092 return true;
41093 APInt RHSUndef, RHSZero;
41094 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41095 Depth + 1))
41096 return true;
41098 // TODO - pass on known zero/undef.
41100 // Aggressively peek through ops to get at the demanded elts.
41101 // TODO - we should do this for all target/faux shuffles ops.
41102 if (!DemandedElts.isAllOnes()) {
41103 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41104 TLO.DAG, Depth + 1);
41105 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41106 TLO.DAG, Depth + 1);
41107 if (NewN0 || NewN1) {
41108 NewN0 = NewN0 ? NewN0 : N0;
41109 NewN1 = NewN1 ? NewN1 : N1;
41110 return TLO.CombineTo(Op,
41111 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41114 break;
41116 case X86ISD::HADD:
41117 case X86ISD::HSUB:
41118 case X86ISD::FHADD:
41119 case X86ISD::FHSUB: {
41120 SDValue N0 = Op.getOperand(0);
41121 SDValue N1 = Op.getOperand(1);
41123 APInt DemandedLHS, DemandedRHS;
41124 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41126 APInt LHSUndef, LHSZero;
41127 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41128 Depth + 1))
41129 return true;
41130 APInt RHSUndef, RHSZero;
41131 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41132 Depth + 1))
41133 return true;
41135 // TODO - pass on known zero/undef.
41137 // Aggressively peek through ops to get at the demanded elts.
41138 // TODO: Handle repeated operands.
41139 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41140 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41141 TLO.DAG, Depth + 1);
41142 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41143 TLO.DAG, Depth + 1);
41144 if (NewN0 || NewN1) {
41145 NewN0 = NewN0 ? NewN0 : N0;
41146 NewN1 = NewN1 ? NewN1 : N1;
41147 return TLO.CombineTo(Op,
41148 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41151 break;
41153 case X86ISD::VTRUNC:
41154 case X86ISD::VTRUNCS:
41155 case X86ISD::VTRUNCUS: {
41156 SDValue Src = Op.getOperand(0);
41157 MVT SrcVT = Src.getSimpleValueType();
41158 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41159 APInt SrcUndef, SrcZero;
41160 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41161 Depth + 1))
41162 return true;
41163 KnownZero = SrcZero.zextOrTrunc(NumElts);
41164 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41165 break;
41167 case X86ISD::BLENDV: {
41168 APInt SelUndef, SelZero;
41169 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41170 SelZero, TLO, Depth + 1))
41171 return true;
41173 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41174 APInt LHSUndef, LHSZero;
41175 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41176 LHSZero, TLO, Depth + 1))
41177 return true;
41179 APInt RHSUndef, RHSZero;
41180 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41181 RHSZero, TLO, Depth + 1))
41182 return true;
41184 KnownZero = LHSZero & RHSZero;
41185 KnownUndef = LHSUndef & RHSUndef;
41186 break;
41188 case X86ISD::VZEXT_MOVL: {
41189 // If upper demanded elements are already zero then we have nothing to do.
41190 SDValue Src = Op.getOperand(0);
41191 APInt DemandedUpperElts = DemandedElts;
41192 DemandedUpperElts.clearLowBits(1);
41193 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41194 return TLO.CombineTo(Op, Src);
41195 break;
41197 case X86ISD::VBROADCAST: {
41198 SDValue Src = Op.getOperand(0);
41199 MVT SrcVT = Src.getSimpleValueType();
41200 if (!SrcVT.isVector())
41201 break;
41202 // Don't bother broadcasting if we just need the 0'th element.
41203 if (DemandedElts == 1) {
41204 if (Src.getValueType() != VT)
41205 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41206 SDLoc(Op));
41207 return TLO.CombineTo(Op, Src);
41209 APInt SrcUndef, SrcZero;
41210 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41211 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41212 Depth + 1))
41213 return true;
41214 // Aggressively peek through src to get at the demanded elt.
41215 // TODO - we should do this for all target/faux shuffles ops.
41216 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41217 Src, SrcElts, TLO.DAG, Depth + 1))
41218 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41219 break;
41221 case X86ISD::VPERMV:
41222 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
41223 Depth))
41224 return true;
41225 break;
41226 case X86ISD::PSHUFB:
41227 case X86ISD::VPERMV3:
41228 case X86ISD::VPERMILPV:
41229 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
41230 Depth))
41231 return true;
41232 break;
41233 case X86ISD::VPPERM:
41234 case X86ISD::VPERMIL2:
41235 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
41236 Depth))
41237 return true;
41238 break;
41241 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
41242 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
41243 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
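// For example, if only the low two elements of a v8i64 VSHLI are demanded,
// the shift can be done as a v2i64 VSHLI on the low 128 bits and the result
// reinserted into an undef vector, as the cases below do.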
41244 if ((VT.is256BitVector() || VT.is512BitVector()) &&
41245 DemandedElts.lshr(NumElts / 2) == 0) {
41246 unsigned SizeInBits = VT.getSizeInBits();
41247 unsigned ExtSizeInBits = SizeInBits / 2;
41249 // See if 512-bit ops only use the bottom 128-bits.
41250 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
41251 ExtSizeInBits = SizeInBits / 4;
41253 switch (Opc) {
41254 // Scalar broadcast.
41255 case X86ISD::VBROADCAST: {
41256 SDLoc DL(Op);
41257 SDValue Src = Op.getOperand(0);
41258 if (Src.getValueSizeInBits() > ExtSizeInBits)
41259 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
41260 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41261 ExtSizeInBits / VT.getScalarSizeInBits());
41262 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
41263 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41264 TLO.DAG, DL, ExtSizeInBits));
41266 case X86ISD::VBROADCAST_LOAD: {
41267 SDLoc DL(Op);
41268 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41269 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41270 ExtSizeInBits / VT.getScalarSizeInBits());
41271 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
41272 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
41273 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
41274 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
41275 MemIntr->getMemOperand());
41276 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41277 Bcst.getValue(1));
41278 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41279 TLO.DAG, DL, ExtSizeInBits));
41281 // Subvector broadcast.
41282 case X86ISD::SUBV_BROADCAST_LOAD: {
41283 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41284 EVT MemVT = MemIntr->getMemoryVT();
41285 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
41286 SDLoc DL(Op);
41287 SDValue Ld =
41288 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
41289 MemIntr->getBasePtr(), MemIntr->getMemOperand());
41290 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41291 Ld.getValue(1));
41292 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
41293 TLO.DAG, DL, ExtSizeInBits));
41294 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
41295 SDLoc DL(Op);
41296 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41297 ExtSizeInBits / VT.getScalarSizeInBits());
41298 if (SDValue BcstLd =
41299 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
41300 return TLO.CombineTo(Op,
41301 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
41302 TLO.DAG, DL, ExtSizeInBits));
41304 break;
41306 // Byte shifts by immediate.
41307 case X86ISD::VSHLDQ:
41308 case X86ISD::VSRLDQ:
41309 // Shift by uniform.
41310 case X86ISD::VSHL:
41311 case X86ISD::VSRL:
41312 case X86ISD::VSRA:
41313 // Shift by immediate.
41314 case X86ISD::VSHLI:
41315 case X86ISD::VSRLI:
41316 case X86ISD::VSRAI: {
41317 SDLoc DL(Op);
41318 SDValue Ext0 =
41319 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
41320 SDValue ExtOp =
41321 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
41322 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41323 SDValue Insert =
41324 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41325 return TLO.CombineTo(Op, Insert);
41327 case X86ISD::VPERMI: {
41328 // Simplify PERMPD/PERMQ to extract_subvector.
41329 // TODO: This should be done in shuffle combining.
41330 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
41331 SmallVector<int, 4> Mask;
41332 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
41333 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
41334 SDLoc DL(Op);
41335 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
41336 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41337 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
41338 return TLO.CombineTo(Op, Insert);
41341 break;
41343 case X86ISD::VPERM2X128: {
41344 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
41345 SDLoc DL(Op);
41346 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
41347 if (LoMask & 0x8)
41348 return TLO.CombineTo(
41349 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
41350 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
41351 unsigned SrcIdx = (LoMask & 0x2) >> 1;
41352 SDValue ExtOp =
41353 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
41354 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41355 SDValue Insert =
41356 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41357 return TLO.CombineTo(Op, Insert);
41359 // Zero upper elements.
41360 case X86ISD::VZEXT_MOVL:
41361 // Target unary shuffles by immediate:
41362 case X86ISD::PSHUFD:
41363 case X86ISD::PSHUFLW:
41364 case X86ISD::PSHUFHW:
41365 case X86ISD::VPERMILPI:
41366 // (Non-Lane Crossing) Target Shuffles.
41367 case X86ISD::VPERMILPV:
41368 case X86ISD::VPERMIL2:
41369 case X86ISD::PSHUFB:
41370 case X86ISD::UNPCKL:
41371 case X86ISD::UNPCKH:
41372 case X86ISD::BLENDI:
41373 // Integer ops.
41374 case X86ISD::PACKSS:
41375 case X86ISD::PACKUS:
41376 // Horizontal Ops.
41377 case X86ISD::HADD:
41378 case X86ISD::HSUB:
41379 case X86ISD::FHADD:
41380 case X86ISD::FHSUB: {
41381 SDLoc DL(Op);
41382 SmallVector<SDValue, 4> Ops;
41383 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
41384 SDValue SrcOp = Op.getOperand(i);
41385 EVT SrcVT = SrcOp.getValueType();
41386 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
41387 "Unsupported vector size");
41388 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
41389 ExtSizeInBits)
41390 : SrcOp);
41392 MVT ExtVT = VT.getSimpleVT();
41393 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
41394 ExtSizeInBits / ExtVT.getScalarSizeInBits());
41395 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
41396 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41397 SDValue Insert =
41398 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41399 return TLO.CombineTo(Op, Insert);
41404 // For splats, unless we *only* demand the 0'th element, stop attempts at
41405 // simplification here; we aren't going to improve things, and this is
41406 // better than any potential shuffle.
41407 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
41408 return false;
41410 // Get target/faux shuffle mask.
41411 APInt OpUndef, OpZero;
41412 SmallVector<int, 64> OpMask;
41413 SmallVector<SDValue, 2> OpInputs;
41414 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
41415 OpZero, TLO.DAG, Depth, false))
41416 return false;
41418 // Shuffle inputs must be the same size as the result.
41419 if (OpMask.size() != (unsigned)NumElts ||
41420 llvm::any_of(OpInputs, [VT](SDValue V) {
41421 return VT.getSizeInBits() != V.getValueSizeInBits() ||
41422 !V.getValueType().isVector();
41424 return false;
41426 KnownZero = OpZero;
41427 KnownUndef = OpUndef;
41429 // Check if shuffle mask can be simplified to undef/zero/identity.
41430 int NumSrcs = OpInputs.size();
41431 for (int i = 0; i != NumElts; ++i)
41432 if (!DemandedElts[i])
41433 OpMask[i] = SM_SentinelUndef;
41435 if (isUndefInRange(OpMask, 0, NumElts)) {
41436 KnownUndef.setAllBits();
41437 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
41439 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
41440 KnownZero.setAllBits();
41441 return TLO.CombineTo(
41442 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41444 for (int Src = 0; Src != NumSrcs; ++Src)
41445 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
41446 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
41448 // Attempt to simplify inputs.
41449 for (int Src = 0; Src != NumSrcs; ++Src) {
41450 // TODO: Support inputs of different types.
41451 if (OpInputs[Src].getValueType() != VT)
41452 continue;
41454 int Lo = Src * NumElts;
41455 APInt SrcElts = APInt::getZero(NumElts);
41456 for (int i = 0; i != NumElts; ++i)
41457 if (DemandedElts[i]) {
41458 int M = OpMask[i] - Lo;
41459 if (0 <= M && M < NumElts)
41460 SrcElts.setBit(M);
41463 // TODO - Propagate input undef/zero elts.
41464 APInt SrcUndef, SrcZero;
41465 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
41466 TLO, Depth + 1))
41467 return true;
41470 // If we don't demand all elements, then attempt to combine to a simpler
41471 // shuffle.
41472 // We need to convert the depth to something combineX86ShufflesRecursively
41473 // can handle - so pretend its Depth == 0 again, and reduce the max depth
41474 // to match. This prevents combineX86ShuffleChain from returning a
41475 // combined shuffle that's the same as the original root, causing an
41476 // infinite loop.
41477 if (!DemandedElts.isAllOnes()) {
41478 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
41480 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
41481 for (int i = 0; i != NumElts; ++i)
41482 if (DemandedElts[i])
41483 DemandedMask[i] = i;
41485 SDValue NewShuffle = combineX86ShufflesRecursively(
41486 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
41487 /*HasVarMask*/ false,
41488 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
41489 Subtarget);
41490 if (NewShuffle)
41491 return TLO.CombineTo(Op, NewShuffle);
41494 return false;
41497 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
41498 SDValue Op, const APInt &OriginalDemandedBits,
41499 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
41500 unsigned Depth) const {
41501 EVT VT = Op.getValueType();
41502 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
41503 unsigned Opc = Op.getOpcode();
41504 switch(Opc) {
41505 case X86ISD::VTRUNC: {
41506 KnownBits KnownOp;
41507 SDValue Src = Op.getOperand(0);
41508 MVT SrcVT = Src.getSimpleValueType();
41510 // Simplify the input, using demanded bit information.
41511 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
41512 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
41513 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
41514 return true;
41515 break;
41517 case X86ISD::PMULDQ:
41518 case X86ISD::PMULUDQ: {
41519 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41520 KnownBits KnownLHS, KnownRHS;
41521 SDValue LHS = Op.getOperand(0);
41522 SDValue RHS = Op.getOperand(1);
41524 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
41525 // FIXME: Can we bound this better?
41526 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41527 APInt DemandedMaskLHS = APInt::getAllOnes(64);
41528 APInt DemandedMaskRHS = APInt::getAllOnes(64);
41530 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
41531 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
41532 DemandedMaskLHS = DemandedMask;
41533 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
41534 DemandedMaskRHS = DemandedMask;
41536 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
41537 KnownLHS, TLO, Depth + 1))
41538 return true;
41539 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
41540 KnownRHS, TLO, Depth + 1))
41541 return true;
41543 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
41544 KnownRHS = KnownRHS.trunc(32);
41545 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
41546 KnownRHS.getConstant().isOne()) {
41547 SDLoc DL(Op);
41548 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
41549 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
41552 // Aggressively peek through ops to get at the demanded low bits.
41553 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
41554 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
41555 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
41556 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
41557 if (DemandedLHS || DemandedRHS) {
41558 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
41559 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
41560 return TLO.CombineTo(
41561 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
41563 break;
41565 case X86ISD::ANDNP: {
41566 KnownBits Known2;
41567 SDValue Op0 = Op.getOperand(0);
41568 SDValue Op1 = Op.getOperand(1);
41570 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
41571 Known, TLO, Depth + 1))
41572 return true;
41573 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41575 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
41576 OriginalDemandedElts, Known2, TLO, Depth + 1))
41577 return true;
41578 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
41580 // If the RHS is a constant, see if we can simplify it.
41581 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
41582 OriginalDemandedElts, TLO))
41583 return true;
41585 // ANDNP = (~Op0 & Op1);
41586 Known.One &= Known2.Zero;
41587 Known.Zero |= Known2.One;
41588 break;
41590 case X86ISD::VSHLI: {
41591 SDValue Op0 = Op.getOperand(0);
41593 unsigned ShAmt = Op.getConstantOperandVal(1);
41594 if (ShAmt >= BitWidth)
41595 break;
41597 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
41599 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41600 // single shift. We can do this if the bottom bits (which are shifted
41601 // out) are never demanded.
41602 if (Op0.getOpcode() == X86ISD::VSRLI &&
41603 OriginalDemandedBits.countr_zero() >= ShAmt) {
41604 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
41605 if (Shift2Amt < BitWidth) {
41606 int Diff = ShAmt - Shift2Amt;
41607 if (Diff == 0)
41608 return TLO.CombineTo(Op, Op0.getOperand(0));
41610 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
41611 SDValue NewShift = TLO.DAG.getNode(
41612 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
41613 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
41614 return TLO.CombineTo(Op, NewShift);
41618 // If we are only demanding sign bits then we can use the shift source directly.
41619 unsigned NumSignBits =
41620 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
41621 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
41622 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41623 return TLO.CombineTo(Op, Op0);
41625 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41626 TLO, Depth + 1))
41627 return true;
41629 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41630 Known.Zero <<= ShAmt;
41631 Known.One <<= ShAmt;
41633 // Low bits known zero.
41634 Known.Zero.setLowBits(ShAmt);
41635 return false;
41637 case X86ISD::VSRLI: {
41638 unsigned ShAmt = Op.getConstantOperandVal(1);
41639 if (ShAmt >= BitWidth)
41640 break;
41642 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41644 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
41645 OriginalDemandedElts, Known, TLO, Depth + 1))
41646 return true;
41648 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41649 Known.Zero.lshrInPlace(ShAmt);
41650 Known.One.lshrInPlace(ShAmt);
41652 // High bits known zero.
41653 Known.Zero.setHighBits(ShAmt);
41654 return false;
41656 case X86ISD::VSRAI: {
41657 SDValue Op0 = Op.getOperand(0);
41658 SDValue Op1 = Op.getOperand(1);
41660 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
41661 if (ShAmt >= BitWidth)
41662 break;
41664 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41666 // If we just want the sign bit then we don't need to shift it.
41667 if (OriginalDemandedBits.isSignMask())
41668 return TLO.CombineTo(Op, Op0);
41670 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
41671 if (Op0.getOpcode() == X86ISD::VSHLI &&
41672 Op.getOperand(1) == Op0.getOperand(1)) {
41673 SDValue Op00 = Op0.getOperand(0);
41674 unsigned NumSignBits =
41675 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
41676 if (ShAmt < NumSignBits)
41677 return TLO.CombineTo(Op, Op00);
41680 // If any of the demanded bits are produced by the sign extension, we also
41681 // demand the input sign bit.
41682 if (OriginalDemandedBits.countl_zero() < ShAmt)
41683 DemandedMask.setSignBit();
41685 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41686 TLO, Depth + 1))
41687 return true;
41689 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41690 Known.Zero.lshrInPlace(ShAmt);
41691 Known.One.lshrInPlace(ShAmt);
41693 // If the input sign bit is known to be zero, or if none of the top bits
41694 // are demanded, turn this into an unsigned shift right.
41695 if (Known.Zero[BitWidth - ShAmt - 1] ||
41696 OriginalDemandedBits.countl_zero() >= ShAmt)
41697 return TLO.CombineTo(
41698 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
41700 // High bits are known one.
41701 if (Known.One[BitWidth - ShAmt - 1])
41702 Known.One.setHighBits(ShAmt);
41703 return false;
41705 case X86ISD::BLENDV: {
41706 SDValue Sel = Op.getOperand(0);
41707 SDValue LHS = Op.getOperand(1);
41708 SDValue RHS = Op.getOperand(2);
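// BLENDV only tests the sign bit of each selector element, so the sign bit
// is all we demand from Sel.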
41710 APInt SignMask = APInt::getSignMask(BitWidth);
41711 SDValue NewSel = SimplifyMultipleUseDemandedBits(
41712 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41713 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
41714 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41715 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
41716 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41718 if (NewSel || NewLHS || NewRHS) {
41719 NewSel = NewSel ? NewSel : Sel;
41720 NewLHS = NewLHS ? NewLHS : LHS;
41721 NewRHS = NewRHS ? NewRHS : RHS;
41722 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
41723 NewSel, NewLHS, NewRHS));
41725 break;
41727 case X86ISD::PEXTRB:
41728 case X86ISD::PEXTRW: {
41729 SDValue Vec = Op.getOperand(0);
41730 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
41731 MVT VecVT = Vec.getSimpleValueType();
41732 unsigned NumVecElts = VecVT.getVectorNumElements();
41734 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
41735 unsigned Idx = CIdx->getZExtValue();
41736 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
41738 // If we demand no bits from the vector then we must have demanded
41739 // bits from the implicit zext - simplify to zero.
41740 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
41741 if (DemandedVecBits == 0)
41742 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41744 APInt KnownUndef, KnownZero;
41745 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
41746 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
41747 KnownZero, TLO, Depth + 1))
41748 return true;
41750 KnownBits KnownVec;
41751 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
41752 KnownVec, TLO, Depth + 1))
41753 return true;
41755 if (SDValue V = SimplifyMultipleUseDemandedBits(
41756 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
41757 return TLO.CombineTo(
41758 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
41760 Known = KnownVec.zext(BitWidth);
41761 return false;
41763 break;
41765 case X86ISD::PINSRB:
41766 case X86ISD::PINSRW: {
41767 SDValue Vec = Op.getOperand(0);
41768 SDValue Scl = Op.getOperand(1);
41769 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41770 MVT VecVT = Vec.getSimpleValueType();
41772 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
41773 unsigned Idx = CIdx->getZExtValue();
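// If the inserted element isn't demanded, the insertion is a no-op and we
// can use the base vector directly.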
41774 if (!OriginalDemandedElts[Idx])
41775 return TLO.CombineTo(Op, Vec);
41777 KnownBits KnownVec;
41778 APInt DemandedVecElts(OriginalDemandedElts);
41779 DemandedVecElts.clearBit(Idx);
41780 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41781 KnownVec, TLO, Depth + 1))
41782 return true;
41784 KnownBits KnownScl;
41785 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41786 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41787 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41788 return true;
41790 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41791 Known = KnownVec.intersectWith(KnownScl);
41792 return false;
41794 break;
41796 case X86ISD::PACKSS:
41797 // PACKSS saturates to MIN/MAX integer values. So if we just want the
41798 // sign bit then we can just ask for the source operand's sign bit.
41799 // TODO - add known bits handling.
41800 if (OriginalDemandedBits.isSignMask()) {
41801 APInt DemandedLHS, DemandedRHS;
41802 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41804 KnownBits KnownLHS, KnownRHS;
41805 APInt SignMask = APInt::getSignMask(BitWidth * 2);
41806 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41807 KnownLHS, TLO, Depth + 1))
41808 return true;
41809 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41810 KnownRHS, TLO, Depth + 1))
41811 return true;
41813 // Attempt to avoid multi-use ops if we don't need anything from them.
41814 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41815 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41816 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41817 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41818 if (DemandedOp0 || DemandedOp1) {
41819 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41820 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41821 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41824 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41825 break;
41826 case X86ISD::VBROADCAST: {
41827 SDValue Src = Op.getOperand(0);
41828 MVT SrcVT = Src.getSimpleValueType();
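// A broadcast only reads element 0 of its source, so that is the only
// element we need to demand.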
41829 APInt DemandedElts = APInt::getOneBitSet(
41830 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41831 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41832 TLO, Depth + 1))
41833 return true;
41834 // If we don't need the upper bits, attempt to narrow the broadcast source.
41835 // Don't attempt this on AVX512 as it might affect broadcast folding.
41836 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41837 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41838 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
41839 Src->hasOneUse()) {
41840 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41841 SDValue NewSrc =
41842 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41843 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41844 SDValue NewBcst =
41845 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41846 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41848 break;
41850 case X86ISD::PCMPGT:
41851 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41852 // iff we only need the sign bit then we can use R directly.
41853 if (OriginalDemandedBits.isSignMask() &&
41854 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41855 return TLO.CombineTo(Op, Op.getOperand(1));
41856 break;
41857 case X86ISD::MOVMSK: {
41858 SDValue Src = Op.getOperand(0);
41859 MVT SrcVT = Src.getSimpleValueType();
41860 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41861 unsigned NumElts = SrcVT.getVectorNumElements();
41863 // If we don't need the sign bits at all just return zero.
41864 if (OriginalDemandedBits.countr_zero() >= NumElts)
41865 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41867 // See if we only demand bits from the lower 128-bit vector.
41868 if (SrcVT.is256BitVector() &&
41869 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41870 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41871 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41874 // Only demand the vector elements of the sign bits we need.
41875 APInt KnownUndef, KnownZero;
41876 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41877 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41878 TLO, Depth + 1))
41879 return true;
41881 Known.Zero = KnownZero.zext(BitWidth);
41882 Known.Zero.setHighBits(BitWidth - NumElts);
41884 // MOVMSK only uses the MSB from each vector element.
41885 KnownBits KnownSrc;
41886 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41887 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41888 Depth + 1))
41889 return true;
41891 if (KnownSrc.One[SrcBits - 1])
41892 Known.One.setLowBits(NumElts);
41893 else if (KnownSrc.Zero[SrcBits - 1])
41894 Known.Zero.setLowBits(NumElts);
41896 // Attempt to avoid a multi-use op if we don't need anything from it.
41897 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41898 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41899 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41900 return false;
41902 case X86ISD::TESTP: {
41903 SDValue Op0 = Op.getOperand(0);
41904 SDValue Op1 = Op.getOperand(1);
41905 MVT OpVT = Op0.getSimpleValueType();
41906 assert((OpVT.getVectorElementType() == MVT::f32 ||
41907 OpVT.getVectorElementType() == MVT::f64) &&
41908 "Illegal vector type for X86ISD::TESTP");
41910 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
41911 KnownBits KnownSrc;
41912 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
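// If both operands are the same node and this is its only user, we may
// simplify it as if it were single-use.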
41913 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
41914 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
41915 AssumeSingleUse) ||
41916 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
41917 AssumeSingleUse);
41919 case X86ISD::BEXTR:
41920 case X86ISD::BEXTRI: {
41921 SDValue Op0 = Op.getOperand(0);
41922 SDValue Op1 = Op.getOperand(1);
41924 // Only the bottom 16 bits of the control bits are required.
41925 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41926 // NOTE: SimplifyDemandedBits won't do this for constants.
41927 uint64_t Val1 = Cst1->getZExtValue();
41928 uint64_t MaskedVal1 = Val1 & 0xFFFF;
41929 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41930 SDLoc DL(Op);
41931 return TLO.CombineTo(
41932 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41933 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41936 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41937 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41939 // If the length is 0, the result is 0.
41940 if (Length == 0) {
41941 Known.setAllZero();
41942 return false;
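// BEXTR extracts Length bits starting at bit Shift, so those are the only
// bits we demand from the source.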
41945 if ((Shift + Length) <= BitWidth) {
41946 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41947 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41948 return true;
41950 Known = Known.extractBits(Length, Shift);
41951 Known = Known.zextOrTrunc(BitWidth);
41952 return false;
41954 } else {
41955 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41956 KnownBits Known1;
41957 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41958 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41959 return true;
41961 // If the length is 0, replace with 0.
41962 KnownBits LengthBits = Known1.extractBits(8, 8);
41963 if (LengthBits.isZero())
41964 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41967 break;
41969 case X86ISD::PDEP: {
41970 SDValue Op0 = Op.getOperand(0);
41971 SDValue Op1 = Op.getOperand(1);
41973 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
41974 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41976 // If the demanded bits have leading zeroes, we don't demand those from the
41977 // mask.
41978 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41979 return true;
41981 // The number of possible 1s in the mask determines the number of LSBs of
41982 // operand 0 used. Undemanded bits from the mask don't matter so filter
41983 // them before counting.
41984 KnownBits Known2;
41985 uint64_t Count = (~Known.Zero & LoMask).popcount();
41986 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41987 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41988 return true;
41990 // Zeroes are retained from the mask, but not ones.
41991 Known.One.clearAllBits();
41992 // The result will have at least as many trailing zeros as the non-mask
41993 // operand since bits can only map to the same or higher bit position.
41994 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41995 return false;
41999 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42000 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42003 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42004 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42005 SelectionDAG &DAG, unsigned Depth) const {
42006 int NumElts = DemandedElts.getBitWidth();
42007 unsigned Opc = Op.getOpcode();
42008 EVT VT = Op.getValueType();
42010 switch (Opc) {
42011 case X86ISD::PINSRB:
42012 case X86ISD::PINSRW: {
42013 // If we don't demand the inserted element, return the base vector.
42014 SDValue Vec = Op.getOperand(0);
42015 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42016 MVT VecVT = Vec.getSimpleValueType();
42017 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42018 !DemandedElts[CIdx->getZExtValue()])
42019 return Vec;
42020 break;
42022 case X86ISD::VSHLI: {
42023 // If we are only demanding sign bits then we can use the shift source
42024 // directly.
42025 SDValue Op0 = Op.getOperand(0);
42026 unsigned ShAmt = Op.getConstantOperandVal(1);
42027 unsigned BitWidth = DemandedBits.getBitWidth();
42028 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42029 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42030 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42031 return Op0;
42032 break;
42034 case X86ISD::VSRAI:
42035 // iff we only need the sign bit then we can use the source directly.
42036 // TODO: generalize where we only demand extended signbits.
42037 if (DemandedBits.isSignMask())
42038 return Op.getOperand(0);
42039 break;
42040 case X86ISD::PCMPGT:
42041 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42042 // iff we only need the sign bit then we can use R directly.
42043 if (DemandedBits.isSignMask() &&
42044 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42045 return Op.getOperand(1);
42046 break;
42047 case X86ISD::BLENDV: {
42048 // BLENDV: Cond (MSB) ? LHS : RHS
42049 SDValue Cond = Op.getOperand(0);
42050 SDValue LHS = Op.getOperand(1);
42051 SDValue RHS = Op.getOperand(2);
42053 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42054 if (CondKnown.isNegative())
42055 return LHS;
42056 if (CondKnown.isNonNegative())
42057 return RHS;
42058 break;
42060 case X86ISD::ANDNP: {
42061 // ANDNP = (~LHS & RHS);
42062 SDValue LHS = Op.getOperand(0);
42063 SDValue RHS = Op.getOperand(1);
42065 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42066 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42068 // If every demanded bit is known zero on either the LHS or the RHS, then the
42069 // result bit always equals the RHS bit (a zero RHS bit gives a zero result,
42070 // and a zero LHS bit makes ~LHS one), so return RHS.
42071 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42072 return RHS;
42073 break;
42077 APInt ShuffleUndef, ShuffleZero;
42078 SmallVector<int, 16> ShuffleMask;
42079 SmallVector<SDValue, 2> ShuffleOps;
42080 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42081 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42082 // If all the demanded elts are from one operand and are inline,
42083 // then we can use the operand directly.
42084 int NumOps = ShuffleOps.size();
42085 if (ShuffleMask.size() == (unsigned)NumElts &&
42086 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42087 return VT.getSizeInBits() == V.getValueSizeInBits();
42088 })) {
42090 if (DemandedElts.isSubsetOf(ShuffleUndef))
42091 return DAG.getUNDEF(VT);
42092 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42093 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42095 // Bitmask that indicates which ops have only been accessed 'inline'.
42096 APInt IdentityOp = APInt::getAllOnes(NumOps);
42097 for (int i = 0; i != NumElts; ++i) {
42098 int M = ShuffleMask[i];
42099 if (!DemandedElts[i] || ShuffleUndef[i])
42100 continue;
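// Decode which shuffle operand and which lane within it this mask entry
// refers to; any element that isn't in place disqualifies the identity match.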
42101 int OpIdx = M / NumElts;
42102 int EltIdx = M % NumElts;
42103 if (M < 0 || EltIdx != i) {
42104 IdentityOp.clearAllBits();
42105 break;
42107 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42108 if (IdentityOp == 0)
42109 break;
42111 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42112 "Multiple identity shuffles detected");
42114 if (IdentityOp != 0)
42115 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42119 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42120 Op, DemandedBits, DemandedElts, DAG, Depth);
42123 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42124 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42125 bool PoisonOnly, unsigned Depth) const {
42126 unsigned EltsBits = Op.getScalarValueSizeInBits();
42127 unsigned NumElts = DemandedElts.getBitWidth();
42129 // TODO: Add more target shuffles.
42130 switch (Op.getOpcode()) {
42131 case X86ISD::PSHUFD:
42132 case X86ISD::VPERMILPI: {
42133 SmallVector<int, 8> Mask;
42134 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
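// Map each demanded result element back to the source element it is
// shuffled from.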
42136 APInt DemandedSrcElts = APInt::getZero(NumElts);
42137 for (unsigned I = 0; I != NumElts; ++I)
42138 if (DemandedElts[I])
42139 DemandedSrcElts.setBit(Mask[I]);
42141 return DAG.isGuaranteedNotToBeUndefOrPoison(
42142 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
42145 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42146 Op, DemandedElts, DAG, PoisonOnly, Depth);
42149 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42150 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42151 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42153 // TODO: Add more target shuffles.
42154 switch (Op.getOpcode()) {
42155 case X86ISD::PSHUFD:
42156 case X86ISD::VPERMILPI:
42157 return false;
42159 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42160 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42163 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42164 const APInt &DemandedElts,
42165 APInt &UndefElts,
42166 const SelectionDAG &DAG,
42167 unsigned Depth) const {
42168 unsigned NumElts = DemandedElts.getBitWidth();
42169 unsigned Opc = Op.getOpcode();
42171 switch (Opc) {
42172 case X86ISD::VBROADCAST:
42173 case X86ISD::VBROADCAST_LOAD:
42174 UndefElts = APInt::getZero(NumElts);
42175 return true;
42178 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42179 DAG, Depth);
42182 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
42183 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42184 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42185 bool AllowTruncate) {
42186 switch (Src.getOpcode()) {
42187 case ISD::TRUNCATE:
42188 if (!AllowTruncate)
42189 return false;
42190 [[fallthrough]];
42191 case ISD::SETCC:
42192 return Src.getOperand(0).getValueSizeInBits() == Size;
42193 case ISD::AND:
42194 case ISD::XOR:
42195 case ISD::OR:
42196 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
42197 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
42198 case ISD::SELECT:
42199 case ISD::VSELECT:
42200 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
42201 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
42202 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
42203 case ISD::BUILD_VECTOR:
42204 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
42205 ISD::isBuildVectorAllOnes(Src.getNode());
42207 return false;
42210 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
42211 static unsigned getAltBitOpcode(unsigned Opcode) {
42212 switch(Opcode) {
42213 case ISD::AND: return X86ISD::FAND;
42214 case ISD::OR: return X86ISD::FOR;
42215 case ISD::XOR: return X86ISD::FXOR;
42216 case X86ISD::ANDNP: return X86ISD::FANDN;
42218 llvm_unreachable("Unknown bitwise opcode");
42221 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
42222 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
42223 const SDLoc &DL) {
42224 EVT SrcVT = Src.getValueType();
42225 if (SrcVT != MVT::v4i1)
42226 return SDValue();
42228 switch (Src.getOpcode()) {
42229 case ISD::SETCC:
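// A v4i32 signed compare against zero only tests the sign bits, which is
// exactly what MOVMSKPS reads from the equivalent v4f32 value.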
42230 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
42231 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
42232 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
42233 SDValue Op0 = Src.getOperand(0);
42234 if (ISD::isNormalLoad(Op0.getNode()))
42235 return DAG.getBitcast(MVT::v4f32, Op0);
42236 if (Op0.getOpcode() == ISD::BITCAST &&
42237 Op0.getOperand(0).getValueType() == MVT::v4f32)
42238 return Op0.getOperand(0);
42240 break;
42241 case ISD::AND:
42242 case ISD::XOR:
42243 case ISD::OR: {
42244 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
42245 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
42246 if (Op0 && Op1)
42247 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
42248 Op1);
42249 break;
42252 return SDValue();
42255 // Helper to push sign extension of vXi1 SETCC result through bitops.
42256 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
42257 SDValue Src, const SDLoc &DL) {
42258 switch (Src.getOpcode()) {
42259 case ISD::SETCC:
42260 case ISD::TRUNCATE:
42261 case ISD::BUILD_VECTOR:
42262 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
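// For logic ops, push the sign extension through to both operands so the
// bitwise operation is performed in the wider type.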
42263 case ISD::AND:
42264 case ISD::XOR:
42265 case ISD::OR:
42266 return DAG.getNode(
42267 Src.getOpcode(), DL, SExtVT,
42268 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
42269 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
42270 case ISD::SELECT:
42271 case ISD::VSELECT:
42272 return DAG.getSelect(
42273 DL, SExtVT, Src.getOperand(0),
42274 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
42275 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
42277 llvm_unreachable("Unexpected node type for vXi1 sign extension");
42280 // Try to match patterns such as
42281 // (i16 bitcast (v16i1 x))
42282 // ->
42283 // (i16 movmsk (v16i8 sext (v16i1 x)))
42284 // before the illegal vector is scalarized on subtargets that don't have legal
42285 // vxi1 types.
42286 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
42287 const SDLoc &DL,
42288 const X86Subtarget &Subtarget) {
42289 EVT SrcVT = Src.getValueType();
42290 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
42291 return SDValue();
42293 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
42294 // legalization destroys the v4i32 type.
42295 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
42296 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
42297 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
42298 DAG.getBitcast(MVT::v4f32, V));
42299 return DAG.getZExtOrTrunc(V, DL, VT);
42303 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
42304 // movmskb even with avx512. This will be better than truncating to vXi1 and
42305 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
42306 // vpcmpeqb/vpcmpgtb.
42307 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
42308 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
42309 Src.getOperand(0).getValueType() == MVT::v32i8 ||
42310 Src.getOperand(0).getValueType() == MVT::v64i8);
42312 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
42313 // directly with vpmovmskb/vmovmskps/vmovmskpd.
42314 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
42315 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
42316 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
42317 EVT CmpVT = Src.getOperand(0).getValueType();
42318 EVT EltVT = CmpVT.getVectorElementType();
42319 if (CmpVT.getSizeInBits() <= 256 &&
42320 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
42321 PreferMovMsk = true;
42324 // With AVX512 vxi1 types are legal and we prefer using k-regs.
42325 // MOVMSK is supported in SSE2 or later.
42326 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
42327 return SDValue();
42329 // If the upper ops of a concatenation are undef, then try to bitcast the
42330 // lower op and extend.
42331 SmallVector<SDValue, 4> SubSrcOps;
42332 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
42333 SubSrcOps.size() >= 2) {
42334 SDValue LowerOp = SubSrcOps[0];
42335 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
42336 if (LowerOp.getOpcode() == ISD::SETCC &&
42337 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
42338 EVT SubVT = VT.getIntegerVT(
42339 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
42340 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
42341 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
42342 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
42347 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
42348 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
42349 // v8i16 and v16i16.
42350 // For these two cases, we can shuffle the upper element bytes to a
42351 // consecutive sequence at the start of the vector and treat the results as
42352 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
42353 // for v16i16 this is not the case, because the shuffle is expensive, so we
42354 // avoid sign-extending to this type entirely.
42355 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
42356 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
42357 MVT SExtVT;
42358 bool PropagateSExt = false;
42359 switch (SrcVT.getSimpleVT().SimpleTy) {
42360 default:
42361 return SDValue();
42362 case MVT::v2i1:
42363 SExtVT = MVT::v2i64;
42364 break;
42365 case MVT::v4i1:
42366 SExtVT = MVT::v4i32;
42367 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
42368 // sign-extend to a 256-bit operation to avoid truncation.
42369 if (Subtarget.hasAVX() &&
42370 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
42371 SExtVT = MVT::v4i64;
42372 PropagateSExt = true;
42374 break;
42375 case MVT::v8i1:
42376 SExtVT = MVT::v8i16;
42377 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
42378 // sign-extend to a 256-bit operation to match the compare.
42379 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
42380 // 256-bit because the shuffle is cheaper than sign extending the result of
42381 // the compare.
42382 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
42383 checkBitcastSrcVectorSize(Src, 512, true))) {
42384 SExtVT = MVT::v8i32;
42385 PropagateSExt = true;
42387 break;
42388 case MVT::v16i1:
42389 SExtVT = MVT::v16i8;
42390 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
42391 // it is not profitable to sign-extend to 256-bit because this will
42392 // require an extra cross-lane shuffle which is more expensive than
42393 // truncating the result of the compare to 128-bits.
42394 break;
42395 case MVT::v32i1:
42396 SExtVT = MVT::v32i8;
42397 break;
42398 case MVT::v64i1:
42399 // If we have AVX512F but not AVX512BW, and the input is truncated from
42400 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
42401 if (Subtarget.hasAVX512()) {
42402 if (Subtarget.hasBWI())
42403 return SDValue();
42404 SExtVT = MVT::v64i8;
42405 break;
42407 // Split if this is a <64 x i8> comparison result.
42408 if (checkBitcastSrcVectorSize(Src, 512, false)) {
42409 SExtVT = MVT::v64i8;
42410 break;
42412 return SDValue();
42415 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
42416 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
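// vXi8 results can use PMOVMSKB directly; other types go through MOVMSK,
// with v8i16 first widened and truncated down to v16i8.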
42418 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
42419 V = getPMOVMSKB(DL, V, DAG, Subtarget);
42420 } else {
42421 if (SExtVT == MVT::v8i16) {
42422 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
42423 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
42425 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
42428 EVT IntVT =
42429 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
42430 V = DAG.getZExtOrTrunc(V, DL, IntVT);
42431 return DAG.getBitcast(VT, V);
42434 // Convert a vXi1 constant build vector to the same width scalar integer.
42435 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
42436 EVT SrcVT = Op.getValueType();
42437 assert(SrcVT.getVectorElementType() == MVT::i1 &&
42438 "Expected a vXi1 vector");
42439 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
42440 "Expected a constant build vector");
42442 APInt Imm(SrcVT.getVectorNumElements(), 0);
42443 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
42444 SDValue In = Op.getOperand(Idx);
42445 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
42446 Imm.setBit(Idx);
42448 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
42449 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
42452 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
42453 TargetLowering::DAGCombinerInfo &DCI,
42454 const X86Subtarget &Subtarget) {
42455 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
42457 if (!DCI.isBeforeLegalizeOps())
42458 return SDValue();
42460 // Only do this if we have k-registers.
42461 if (!Subtarget.hasAVX512())
42462 return SDValue();
42464 EVT DstVT = N->getValueType(0);
42465 SDValue Op = N->getOperand(0);
42466 EVT SrcVT = Op.getValueType();
42468 if (!Op.hasOneUse())
42469 return SDValue();
42471 // Look for logic ops.
42472 if (Op.getOpcode() != ISD::AND &&
42473 Op.getOpcode() != ISD::OR &&
42474 Op.getOpcode() != ISD::XOR)
42475 return SDValue();
42477 // Make sure we have a bitcast between mask registers and a scalar type.
42478 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42479 DstVT.isScalarInteger()) &&
42480 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
42481 SrcVT.isScalarInteger()))
42482 return SDValue();
42484 SDValue LHS = Op.getOperand(0);
42485 SDValue RHS = Op.getOperand(1);
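// If either operand is already a bitcast from the destination type, move the
// logic op across the bitcast and cast the other operand instead.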
42487 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
42488 LHS.getOperand(0).getValueType() == DstVT)
42489 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
42490 DAG.getBitcast(DstVT, RHS));
42492 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
42493 RHS.getOperand(0).getValueType() == DstVT)
42494 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
42495 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
42497 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
42498 // Most of these have to move a constant from the scalar domain anyway.
42499 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
42500 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42501 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
42502 DAG.getBitcast(DstVT, LHS), RHS);
42505 return SDValue();
42508 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
42509 const X86Subtarget &Subtarget) {
42510 SDLoc DL(BV);
42511 unsigned NumElts = BV->getNumOperands();
42512 SDValue Splat = BV->getSplatValue();
42514 // Build MMX element from integer GPR or SSE float values.
42515 auto CreateMMXElement = [&](SDValue V) {
42516 if (V.isUndef())
42517 return DAG.getUNDEF(MVT::x86mmx);
42518 if (V.getValueType().isFloatingPoint()) {
42519 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
42520 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
42521 V = DAG.getBitcast(MVT::v2i64, V);
42522 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
42524 V = DAG.getBitcast(MVT::i32, V);
42525 } else {
42526 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
42528 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
42531 // Convert build vector ops to MMX data in the bottom elements.
42532 SmallVector<SDValue, 8> Ops;
42534 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42536 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
42537 if (Splat) {
42538 if (Splat.isUndef())
42539 return DAG.getUNDEF(MVT::x86mmx);
42541 Splat = CreateMMXElement(Splat);
42543 if (Subtarget.hasSSE1()) {
42544 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
42545 if (NumElts == 8)
42546 Splat = DAG.getNode(
42547 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
42548 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
42549 TLI.getPointerTy(DAG.getDataLayout())),
42550 Splat, Splat);
42552 // Use PSHUFW to repeat 16-bit elements.
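// Shuffle immediate 0x00 splats 16-bit element 0 to all lanes; 0x44 repeats
// elements 0 and 1 (the two halves of a single 32-bit element).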
42553 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
42554 return DAG.getNode(
42555 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
42556 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
42557 TLI.getPointerTy(DAG.getDataLayout())),
42558 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
42560 Ops.append(NumElts, Splat);
42561 } else {
42562 for (unsigned i = 0; i != NumElts; ++i)
42563 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
42566 // Use tree of PUNPCKLs to build up general MMX vector.
42567 while (Ops.size() > 1) {
42568 unsigned NumOps = Ops.size();
42569 unsigned IntrinOp =
42570 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
42571 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
42572 : Intrinsic::x86_mmx_punpcklbw));
42573 SDValue Intrin = DAG.getTargetConstant(
42574 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
42575 for (unsigned i = 0; i != NumOps; i += 2)
42576 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
42577 Ops[i], Ops[i + 1]);
42578 Ops.resize(NumOps / 2);
42581 return Ops[0];
42584 // Recursive function that attempts to find if a bool vector node was originally
42585 // a vector/float/double that got truncated/extended/bitcast to/from a scalar
42586 // integer. If so, replace the scalar ops with bool vector equivalents back down
42587 // the chain.
42588 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
42589 SelectionDAG &DAG,
42590 const X86Subtarget &Subtarget) {
42591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42592 unsigned Opc = V.getOpcode();
42593 switch (Opc) {
42594 case ISD::BITCAST: {
42595 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
42596 SDValue Src = V.getOperand(0);
42597 EVT SrcVT = Src.getValueType();
42598 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
42599 return DAG.getBitcast(VT, Src);
42600 break;
42602 case ISD::TRUNCATE: {
42603 // If we find a suitable source, a truncated scalar becomes a subvector.
42604 SDValue Src = V.getOperand(0);
42605 EVT NewSrcVT =
42606 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
42607 if (TLI.isTypeLegal(NewSrcVT))
42608 if (SDValue N0 =
42609 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42610 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
42611 DAG.getIntPtrConstant(0, DL));
42612 break;
42614 case ISD::ANY_EXTEND:
42615 case ISD::ZERO_EXTEND: {
42616 // If we find a suitable source, an extended scalar becomes a subvector.
42617 SDValue Src = V.getOperand(0);
42618 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
42619 Src.getScalarValueSizeInBits());
42620 if (TLI.isTypeLegal(NewSrcVT))
42621 if (SDValue N0 =
42622 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42623 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42624 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
42625 : DAG.getConstant(0, DL, VT),
42626 N0, DAG.getIntPtrConstant(0, DL));
42627 break;
42629 case ISD::OR: {
42630 // If we find suitable sources, we can just move an OR to the vector domain.
42631 SDValue Src0 = V.getOperand(0);
42632 SDValue Src1 = V.getOperand(1);
42633 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42634 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
42635 return DAG.getNode(Opc, DL, VT, N0, N1);
42636 break;
42638 case ISD::SHL: {
42639 // If we find a suitable source, a SHL becomes a KSHIFTL.
42640 SDValue Src0 = V.getOperand(0);
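// KSHIFT of a v8i1 mask needs the byte form (AVX512DQ), and v32i1/v64i1
// masks need AVX512BW.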
42641 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
42642 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
42643 break;
42645 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
42646 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42647 return DAG.getNode(
42648 X86ISD::KSHIFTL, DL, VT, N0,
42649 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
42650 break;
42653 return SDValue();
42656 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
42657 TargetLowering::DAGCombinerInfo &DCI,
42658 const X86Subtarget &Subtarget) {
42659 SDValue N0 = N->getOperand(0);
42660 EVT VT = N->getValueType(0);
42661 EVT SrcVT = N0.getValueType();
42662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42664 // Try to match patterns such as
42665 // (i16 bitcast (v16i1 x))
42666 // ->
42667 // (i16 movmsk (v16i8 sext (v16i1 x)))
42668 // before the setcc result is scalarized on subtargets that don't have legal
42669 // vxi1 types.
42670 if (DCI.isBeforeLegalize()) {
42671 SDLoc dl(N);
42672 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
42673 return V;
42675 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42676 // type, widen both sides to avoid a trip through memory.
42677 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
42678 Subtarget.hasAVX512()) {
42679 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
42680 N0 = DAG.getBitcast(MVT::v8i1, N0);
42681 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
42682 DAG.getIntPtrConstant(0, dl));
42685 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42686 // type, widen both sides to avoid a trip through memory.
42687 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
42688 Subtarget.hasAVX512()) {
42689 // Use zeros for the widening if we already have some zeroes. This can
42690 // allow SimplifyDemandedBits to remove scalar ANDs that may be
42691 // downstream of this.
42692 // FIXME: It might make sense to detect a concat_vectors with a mix of
42693 // zeroes and undef and turn it into insert_subvector for i1 vectors as
42694 // a separate combine. What we can't do is canonicalize the operands of
42695 // such a concat or we'll get into a loop with SimplifyDemandedBits.
42696 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
42697 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
42698 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
42699 SrcVT = LastOp.getValueType();
42700 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42701 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
42702 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
42703 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42704 N0 = DAG.getBitcast(MVT::i8, N0);
42705 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42709 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42710 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
42711 Ops[0] = N0;
42712 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42713 N0 = DAG.getBitcast(MVT::i8, N0);
42714 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42716 } else {
42717 // If we're bitcasting from iX to vXi1, see if the integer originally
42718 // began as a vXi1 and whether we can remove the bitcast entirely.
42719 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
42720 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
42721 if (SDValue V =
42722 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
42723 return V;
42727 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
42728 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
42729 // due to insert_subvector legalization on KNL. By promoting the copy to i16
42730 // we can help with known bits propagation from the vXi1 domain to the
42731 // scalar domain.
42732 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
42733 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42734 N0.getOperand(0).getValueType() == MVT::v16i1 &&
42735 isNullConstant(N0.getOperand(1)))
42736 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
42737 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
42739 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
42740 // and the vbroadcast_load are both integer or both fp. In some cases this
42741 // will remove the bitcast entirely.
42742 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
42743 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
42744 auto *BCast = cast<MemIntrinsicSDNode>(N0);
42745 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
42746 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
42747 // Don't swap i8/i16 since we don't have fp types of that size.
42748 if (MemSize >= 32) {
42749 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
42750 : MVT::getIntegerVT(MemSize);
42751 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
42752 : MVT::getIntegerVT(SrcVTSize);
42753 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
42755 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42756 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
42757 SDValue ResNode =
42758 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
42759 MemVT, BCast->getMemOperand());
42760 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
42761 return DAG.getBitcast(VT, ResNode);
42765 // Since MMX types are special and don't usually play with other vector types,
42766 // it's better to handle them early to be sure we emit efficient code by
42767 // avoiding store-load conversions.
42768 if (VT == MVT::x86mmx) {
42769 // Detect MMX constant vectors.
42770 APInt UndefElts;
42771 SmallVector<APInt, 1> EltBits;
42772 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
42773 SDLoc DL(N0);
42774 // Handle zero-extension of i32 with MOVD.
42775 if (EltBits[0].countl_zero() >= 32)
42776 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
42777 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
42778 // Else, bitcast to a double.
42779 // TODO - investigate supporting sext 32-bit immediates on x86_64.
42780 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
42781 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
42784 // Detect bitcasts to x86mmx low word.
42785 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42786 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
42787 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
42788 bool LowUndef = true, AllUndefOrZero = true;
42789 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
42790 SDValue Op = N0.getOperand(i);
42791 LowUndef &= Op.isUndef() || (i >= e/2);
42792 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
42794 if (AllUndefOrZero) {
42795 SDValue N00 = N0.getOperand(0);
42796 SDLoc dl(N00);
42797 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
42798 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
42799 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
42803 // Detect bitcasts of 64-bit build vectors and convert to a
42804 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
42805 // lowest element.
42806 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42807 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
42808 SrcVT == MVT::v8i8))
42809 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
42811 // Detect bitcasts between element or subvector extraction to x86mmx.
42812 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
42813 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
42814 isNullConstant(N0.getOperand(1))) {
42815 SDValue N00 = N0.getOperand(0);
42816 if (N00.getValueType().is128BitVector())
42817 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
42818 DAG.getBitcast(MVT::v2i64, N00));
42821 // Detect bitcasts from FP_TO_SINT to x86mmx.
42822 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
42823 SDLoc DL(N0);
42824 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
42825 DAG.getUNDEF(MVT::v2i32));
42826 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
42827 DAG.getBitcast(MVT::v2i64, Res));
42831 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
42832 // most of these to scalar anyway.
42833 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
42834 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42835 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
42836 return combinevXi1ConstantToInteger(N0, DAG);
42839 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42840 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42841 isa<ConstantSDNode>(N0)) {
42842 auto *C = cast<ConstantSDNode>(N0);
42843 if (C->isAllOnes())
42844 return DAG.getConstant(1, SDLoc(N0), VT);
42845 if (C->isZero())
42846 return DAG.getConstant(0, SDLoc(N0), VT);
42849 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
42850 // Turn it into a sign bit compare that produces a k-register. This avoids
42851 // a trip through a GPR.
42852 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42853 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42854 isPowerOf2_32(VT.getVectorNumElements())) {
42855 unsigned NumElts = VT.getVectorNumElements();
42856 SDValue Src = N0;
42858 // Peek through truncate.
42859 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
42860 Src = N0.getOperand(0);
42862 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
42863 SDValue MovmskIn = Src.getOperand(0);
42864 MVT MovmskVT = MovmskIn.getSimpleValueType();
42865 unsigned MovMskElts = MovmskVT.getVectorNumElements();
42867 // We allow extra bits of the movmsk to be used since they are known zero.
42868 // We can't convert a VPMOVMSKB without avx512bw.
42869 if (MovMskElts <= NumElts &&
42870 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
42871 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
42872 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
42873 SDLoc dl(N);
42874 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
42875 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
42876 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
42877 if (EVT(CmpVT) == VT)
42878 return Cmp;
42880 // Pad with zeroes up to original VT to replace the zeroes that were
42881 // being used from the MOVMSK.
42882 unsigned NumConcats = NumElts / MovMskElts;
42883 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
42884 Ops[0] = Cmp;
42885 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
42890 // Try to remove bitcasts from input and output of mask arithmetic to
42891 // remove GPR<->K-register crossings.
42892 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
42893 return V;
42895 // Convert a bitcasted integer logic operation that has one bitcasted
42896 // floating-point operand into a floating-point logic operation. This may
42897 // create a load of a constant, but that is cheaper than materializing the
42898 // constant in an integer register and transferring it to an SSE register or
42899 // transferring the SSE operand to integer register and back.
42900 unsigned FPOpcode;
42901 switch (N0.getOpcode()) {
42902 case ISD::AND: FPOpcode = X86ISD::FAND; break;
42903 case ISD::OR: FPOpcode = X86ISD::FOR; break;
42904 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42905 default: return SDValue();
42908 // Check if we have a bitcast from another integer type as well.
42909 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42910 (Subtarget.hasSSE2() && VT == MVT::f64) ||
42911 (Subtarget.hasFP16() && VT == MVT::f16) ||
42912 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42913 TLI.isTypeLegal(VT))))
42914 return SDValue();
42916 SDValue LogicOp0 = N0.getOperand(0);
42917 SDValue LogicOp1 = N0.getOperand(1);
42918 SDLoc DL0(N0);
42920 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42921 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42922 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42923 LogicOp0.getOperand(0).getValueType() == VT &&
42924 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42925 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42926 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42927 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42929 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42930 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42931 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42932 LogicOp1.getOperand(0).getValueType() == VT &&
42933 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42934 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42935 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42936 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42939 return SDValue();
42942 // (mul (zext a), (sext b))
42943 static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42944 SDValue &Op1) {
42945 Op0 = Mul.getOperand(0);
42946 Op1 = Mul.getOperand(1);
42948 // Canonicalize so that Op1 is the sign-extended operand.
42949 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42950 std::swap(Op0, Op1);
42952 auto IsFreeTruncation = [](SDValue &Op) -> bool {
42953 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42954 Op.getOpcode() == ISD::SIGN_EXTEND) &&
42955 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42956 return true;
42958 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42959 return (BV && BV->isConstant());
42962 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
42963 // so we check that Op0 is zero-extended (at most 8 active bits); Op1 must be
42964 // a signed value, so we just check its significant (sign) bits.
42965 if ((IsFreeTruncation(Op0) &&
42966 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42967 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42968 return true;
42970 return false;
42973 // Given an ABS node, detect the following pattern:
42974 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42975 // This is useful as it is the input into a SAD pattern.
42976 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42977 SDValue AbsOp1 = Abs->getOperand(0);
42978 if (AbsOp1.getOpcode() != ISD::SUB)
42979 return false;
42981 Op0 = AbsOp1.getOperand(0);
42982 Op1 = AbsOp1.getOperand(1);
42984 // Check if the operands of the sub are zero-extended from vectors of i8.
42985 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42986 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42987 Op1.getOpcode() != ISD::ZERO_EXTEND ||
42988 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42989 return false;
42991 return true;
42994 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42995 unsigned &LogBias, const SDLoc &DL,
42996 const X86Subtarget &Subtarget) {
42997 // Extend or truncate to MVT::i8 first.
42998 MVT Vi8VT =
42999 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43000 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43001 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43003 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
43004 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
43005 // The src A, B element type is i8, but the dst C element type is i32.
43006 // When we calculate the reduction stages we use the src vector type vXi8,
43007 // so we need a log-bias of 2 to avoid 2 extra stages.
43008 LogBias = 2;
43010 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43011 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43012 RegSize = std::max(512u, RegSize);
43014 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43015 // fill in the missing vector elements with 0.
43016 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43017 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43018 Ops[0] = LHS;
43019 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43020 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43021 Ops[0] = RHS;
43022 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43024 // Actually build the DotProduct, split as 256/512 bits for
43025 // AVXVNNI/AVX512VNNI.
43026 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43027 ArrayRef<SDValue> Ops) {
43028 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43029 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43031 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43032 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43034 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43035 DpBuilder, false);
43038 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43039 // to these zexts.
43040 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43041 const SDValue &Zext1, const SDLoc &DL,
43042 const X86Subtarget &Subtarget) {
43043 // Find the appropriate width for the PSADBW.
43044 EVT InVT = Zext0.getOperand(0).getValueType();
43045 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43047 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43048 // fill in the missing vector elements with 0.
43049 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43050 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43051 Ops[0] = Zext0.getOperand(0);
43052 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43053 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43054 Ops[0] = Zext1.getOperand(0);
43055 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43057 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43058 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43059 ArrayRef<SDValue> Ops) {
43060 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43061 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43063 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43064 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43065 PSADBWBuilder);
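// Illustrative note on the PSADBW created above (not from the original
// source): PSADBW computes, per 64-bit lane, the sum of absolute differences
// of 8 unsigned byte pairs, e.g.
//   R[0] = |A[0]-B[0]| + |A[1]-B[1]| + ... + |A[7]-B[7]|
// which is why the add-reduction pyramid on top of it can skip
// log2(8) == 3 stages (see the 'Stages > 3' check in combineBasicSADPattern
// below).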
43068 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43069 // PHMINPOSUW.
43070 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43071 const X86Subtarget &Subtarget) {
43072 // Bail without SSE41.
43073 if (!Subtarget.hasSSE41())
43074 return SDValue();
43076 EVT ExtractVT = Extract->getValueType(0);
43077 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43078 return SDValue();
43080 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43081 ISD::NodeType BinOp;
43082 SDValue Src = DAG.matchBinOpReduction(
43083 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43084 if (!Src)
43085 return SDValue();
43087 EVT SrcVT = Src.getValueType();
43088 EVT SrcSVT = SrcVT.getScalarType();
43089 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43090 return SDValue();
43092 SDLoc DL(Extract);
43093 SDValue MinPos = Src;
43095 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43096 while (SrcVT.getSizeInBits() > 128) {
43097 SDValue Lo, Hi;
43098 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43099 SrcVT = Lo.getValueType();
43100 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43102 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43103 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43104 "Unexpected value type");
43106 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
43107 // to flip the value accordingly.
43108 SDValue Mask;
43109 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43110 if (BinOp == ISD::SMAX)
43111 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43112 else if (BinOp == ISD::SMIN)
43113 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43114 else if (BinOp == ISD::UMAX)
43115 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43117 if (Mask)
43118 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
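// Illustrative example (not from the original source): for a v8i16 UMAX
// reduction the all-ones mask turns each element x into ~x, and
// umin(~x, ~y) == ~umax(x, y), so PHMINPOSUW's unsigned-min search finds the
// maximum; the second XOR below undoes the flip. The SMAX/SMIN masks
// similarly remap signed order onto unsigned order before the UMIN.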
43120 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43121 // shuffling each upper element down and inserting zeros. This means that the
43122 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
43123 // ready for the PHMINPOS.
43124 if (ExtractVT == MVT::i8) {
43125 SDValue Upper = DAG.getVectorShuffle(
43126 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43127 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43128 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43131 // Perform the PHMINPOS on a v8i16 vector.
43132 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43133 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43134 MinPos = DAG.getBitcast(SrcVT, MinPos);
43136 if (Mask)
43137 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43139 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43140 DAG.getIntPtrConstant(0, DL));
43143 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
43144 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43145 const X86Subtarget &Subtarget) {
43146 // Bail without SSE2.
43147 if (!Subtarget.hasSSE2())
43148 return SDValue();
43150 EVT ExtractVT = Extract->getValueType(0);
43151 unsigned BitWidth = ExtractVT.getSizeInBits();
43152 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43153 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43154 return SDValue();
43156 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43157 ISD::NodeType BinOp;
43158 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43159 if (!Match && ExtractVT == MVT::i1)
43160 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43161 if (!Match)
43162 return SDValue();
43164 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element,
43165 // which we can't support here for now.
43166 if (Match.getScalarValueSizeInBits() != BitWidth)
43167 return SDValue();
43169 SDValue Movmsk;
43170 SDLoc DL(Extract);
43171 EVT MatchVT = Match.getValueType();
43172 unsigned NumElts = MatchVT.getVectorNumElements();
43173 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43174 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43175 LLVMContext &Ctx = *DAG.getContext();
43177 if (ExtractVT == MVT::i1) {
43178 // Special case for (pre-legalization) vXi1 reductions.
43179 if (NumElts > 64 || !isPowerOf2_32(NumElts))
43180 return SDValue();
43181 if (Match.getOpcode() == ISD::SETCC) {
43182 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
43183 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
43184 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
43185 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
43186 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
43187 X86::CondCode X86CC;
43188 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
43189 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
43190 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
43191 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
43192 DAG, X86CC))
43193 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
43194 getSETCC(X86CC, V, DL, DAG));
43197 if (TLI.isTypeLegal(MatchVT)) {
43198 // If this is a legal AVX512 predicate type then we can just bitcast.
43199 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43200 Movmsk = DAG.getBitcast(MovmskVT, Match);
43201 } else {
43202 // Use combineBitcastvxi1 to create the MOVMSK.
43203 while (NumElts > MaxElts) {
43204 SDValue Lo, Hi;
43205 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43206 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43207 NumElts /= 2;
43209 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43210 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
43212 if (!Movmsk)
43213 return SDValue();
43214 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
43215 } else {
43216 // FIXME: Better handling of k-registers or 512-bit vectors?
43217 unsigned MatchSizeInBits = Match.getValueSizeInBits();
43218 if (!(MatchSizeInBits == 128 ||
43219 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
43220 return SDValue();
43222 // Make sure this isn't a vector of 1 element. The perf win from using
43223 // MOVMSK diminishes with fewer elements in the reduction, but it is
43224 // generally better to get the comparison over to the GPRs as soon as
43225 // possible to reduce the number of vector ops.
43226 if (Match.getValueType().getVectorNumElements() < 2)
43227 return SDValue();
43229 // Check that we are extracting a reduction of all sign bits.
43230 if (DAG.ComputeNumSignBits(Match) != BitWidth)
43231 return SDValue();
43233 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
43234 SDValue Lo, Hi;
43235 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43236 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43237 MatchSizeInBits = Match.getValueSizeInBits();
43240 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
43241 MVT MaskSrcVT;
43242 if (64 == BitWidth || 32 == BitWidth)
43243 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
43244 MatchSizeInBits / BitWidth);
43245 else
43246 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
43248 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
43249 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
43250 NumElts = MaskSrcVT.getVectorNumElements();
43252 assert((NumElts <= 32 || NumElts == 64) &&
43253 "Not expecting more than 64 elements");
43255 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
43256 if (BinOp == ISD::XOR) {
43257 // parity -> (PARITY(MOVMSK X))
43258 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
43259 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
43262 SDValue CmpC;
43263 ISD::CondCode CondCode;
43264 if (BinOp == ISD::OR) {
43265 // any_of -> MOVMSK != 0
43266 CmpC = DAG.getConstant(0, DL, CmpVT);
43267 CondCode = ISD::CondCode::SETNE;
43268 } else {
43269 // all_of -> MOVMSK == ((1 << NumElts) - 1)
43270 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
43271 DL, CmpVT);
43272 CondCode = ISD::CondCode::SETEQ;
43275 // The setcc produces an i8 of 0/1, so extend that to the result width and
43276 // negate to get the final 0/-1 mask value.
43277 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
43278 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
43279 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
43280 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
43281 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
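// Worked example for the MOVMSK predicate reduction above (illustrative,
// hypothetical values): for a reduction over a v16i8 compare result, MOVMSK
// packs the 16 sign bits into a GPR, and then:
//   any_of -> setne (movmsk, 0),       e.g. 0x0040 != 0x0000
//   all_of -> seteq (movmsk, 0xFFFF),  i.e. (1 << 16) - 1
//   parity -> PARITY(movmsk)
// For any_of/all_of, the 0/1 setcc result is negated (0 - zext) just above
// to produce the final 0/-1 value.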
43284 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
43285 const X86Subtarget &Subtarget) {
43286 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
43287 return SDValue();
43289 EVT ExtractVT = Extract->getValueType(0);
43290 // Verify the type we're extracting is i32, as the output element type of
43291 // vpdpbusd is i32.
43292 if (ExtractVT != MVT::i32)
43293 return SDValue();
43295 EVT VT = Extract->getOperand(0).getValueType();
43296 if (!isPowerOf2_32(VT.getVectorNumElements()))
43297 return SDValue();
43299 // Match shuffle + add pyramid.
43300 ISD::NodeType BinOp;
43301 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43303 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
43304 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
43305 // before adding into the accumulator.
43306 // TODO:
43307 // We also need to verify that the multiply has at least 2x the number of bits
43308 // of the input. We shouldn't match
43309 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
43310 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
43311 // Root = Root.getOperand(0);
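// Illustrative note (not from the original source): VPDPBUSD multiplies
// unsigned bytes from one operand by signed bytes from the other, e.g.
// 200 (u8) * -3 (s8) = -600 as a signed i16 intermediate, and sums groups of
// four such products into each i32 accumulator lane.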
43313 // If there was a match, we want Root to be a mul.
43314 if (!Root || Root.getOpcode() != ISD::MUL)
43315 return SDValue();
43317 // Check whether we have an extend and mul pattern
43318 SDValue LHS, RHS;
43319 if (!detectExtMul(DAG, Root, LHS, RHS))
43320 return SDValue();
43322 // Create the dot product instruction.
43323 SDLoc DL(Extract);
43324 unsigned StageBias;
43325 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
43327 // If the original vector was wider than 4 elements, sum over the results
43328 // in the DP vector.
43329 unsigned Stages = Log2_32(VT.getVectorNumElements());
43330 EVT DpVT = DP.getValueType();
43332 if (Stages > StageBias) {
43333 unsigned DpElems = DpVT.getVectorNumElements();
43335 for (unsigned i = Stages - StageBias; i > 0; --i) {
43336 SmallVector<int, 16> Mask(DpElems, -1);
43337 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43338 Mask[j] = MaskEnd + j;
43340 SDValue Shuffle =
43341 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
43342 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
43346 // Return the lowest ExtractSizeInBits bits.
43347 EVT ResVT =
43348 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
43349 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
43350 DP = DAG.getBitcast(ResVT, DP);
43351 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
43352 Extract->getOperand(1));
43355 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
43356 const X86Subtarget &Subtarget) {
43357 // PSADBW is only supported on SSE2 and up.
43358 if (!Subtarget.hasSSE2())
43359 return SDValue();
43361 EVT ExtractVT = Extract->getValueType(0);
43362 // Verify the type we're extracting is either i32 or i64.
43363 // FIXME: Could support other types, but this is what we have coverage for.
43364 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
43365 return SDValue();
43367 EVT VT = Extract->getOperand(0).getValueType();
43368 if (!isPowerOf2_32(VT.getVectorNumElements()))
43369 return SDValue();
43371 // Match shuffle + add pyramid.
43372 ISD::NodeType BinOp;
43373 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43375 // The operand is expected to be zero extended from i8
43376 // (verified in detectZextAbsDiff).
43377 // In order to convert to i64 and above, an additional any/zero/sign
43378 // extend is expected.
43379 // The zero extend from 32 bits has no mathematical effect on the result.
43380 // Also, the sign extend is effectively a zero extend here
43381 // (it only extends the sign bit, which is zero).
43382 // So it is correct to skip the sign/zero extend instruction.
43383 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
43384 Root.getOpcode() == ISD::ZERO_EXTEND ||
43385 Root.getOpcode() == ISD::ANY_EXTEND))
43386 Root = Root.getOperand(0);
43388 // If there was a match, we want Root to be a select that is the root of an
43389 // abs-diff pattern.
43390 if (!Root || Root.getOpcode() != ISD::ABS)
43391 return SDValue();
43393 // Check whether we have an abs-diff pattern feeding into the select.
43394 SDValue Zext0, Zext1;
43395 if (!detectZextAbsDiff(Root, Zext0, Zext1))
43396 return SDValue();
43398 // Create the SAD instruction.
43399 SDLoc DL(Extract);
43400 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
43402 // If the original vector was wider than 8 elements, sum over the results
43403 // in the SAD vector.
43404 unsigned Stages = Log2_32(VT.getVectorNumElements());
43405 EVT SadVT = SAD.getValueType();
43406 if (Stages > 3) {
43407 unsigned SadElems = SadVT.getVectorNumElements();
43409 for(unsigned i = Stages - 3; i > 0; --i) {
43410 SmallVector<int, 16> Mask(SadElems, -1);
43411 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43412 Mask[j] = MaskEnd + j;
43414 SDValue Shuffle =
43415 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
43416 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
43420 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
43421 // Return the lowest ExtractSizeInBits bits.
43422 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
43423 SadVT.getSizeInBits() / ExtractSizeInBits);
43424 SAD = DAG.getBitcast(ResVT, SAD);
43425 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
43426 Extract->getOperand(1));
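// Worked example for the SAD reduction above (illustrative, hypothetical
// types): reducing 64 absolute byte differences gives Stages = log2(64) = 6,
// and createPSADBW already summed groups of 8 bytes into v8i64 lanes, so
// only 6 - 3 = 3 shuffle+add rounds over the SAD vector remain before
// extracting element 0.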
43429 // Attempt to peek through a target shuffle and extract the scalar from the
43430 // source.
43431 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
43432 TargetLowering::DAGCombinerInfo &DCI,
43433 const X86Subtarget &Subtarget) {
43434 if (DCI.isBeforeLegalizeOps())
43435 return SDValue();
43437 SDLoc dl(N);
43438 SDValue Src = N->getOperand(0);
43439 SDValue Idx = N->getOperand(1);
43441 EVT VT = N->getValueType(0);
43442 EVT SrcVT = Src.getValueType();
43443 EVT SrcSVT = SrcVT.getVectorElementType();
43444 unsigned SrcEltBits = SrcSVT.getSizeInBits();
43445 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43447 // Don't attempt this for boolean mask vectors or unknown extraction indices.
43448 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
43449 return SDValue();
43451 const APInt &IdxC = N->getConstantOperandAPInt(1);
43452 if (IdxC.uge(NumSrcElts))
43453 return SDValue();
43455 SDValue SrcBC = peekThroughBitcasts(Src);
43457 // Handle extract(bitcast(broadcast(scalar_value))).
43458 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
43459 SDValue SrcOp = SrcBC.getOperand(0);
43460 EVT SrcOpVT = SrcOp.getValueType();
43461 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
43462 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
43463 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
43464 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
43465 // TODO support non-zero offsets.
43466 if (Offset == 0) {
43467 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
43468 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
43469 return SrcOp;
43474 // If we're extracting a single element from a broadcast load and there are
43475 // no other users, just create a single load.
43476 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
43477 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
43478 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
43479 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
43480 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
43481 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
43482 MemIntr->getBasePtr(),
43483 MemIntr->getPointerInfo(),
43484 MemIntr->getOriginalAlign(),
43485 MemIntr->getMemOperand()->getFlags());
43486 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
43487 return Load;
43491 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
43492 // TODO: Move to DAGCombine?
43493 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
43494 SrcBC.getValueType().isInteger() &&
43495 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
43496 SrcBC.getScalarValueSizeInBits() ==
43497 SrcBC.getOperand(0).getValueSizeInBits()) {
43498 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
43499 if (IdxC.ult(Scale)) {
43500 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
43501 SDValue Scl = SrcBC.getOperand(0);
43502 EVT SclVT = Scl.getValueType();
43503 if (Offset) {
43504 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
43505 DAG.getShiftAmountConstant(Offset, SclVT, dl));
43507 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
43508 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
43509 return Scl;
43513 // Handle extract(truncate(x)) for 0'th index.
43514 // TODO: Treat this as a faux shuffle?
43515 // TODO: When can we use this for general indices?
43516 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
43517 (SrcVT.getSizeInBits() % 128) == 0) {
43518 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
43519 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
43520 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
43521 Idx);
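// Example (illustrative, hypothetical types): extracting element 0 of
// (trunc v8i32 %x to v8i16) becomes: take the low 128 bits of %x (v4i32),
// bitcast them to v8i16, and extract element 0 of that, so the full-vector
// truncate never has to be materialized.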
43524 // We can only legally extract other elements from 128-bit vectors and in
43525 // certain circumstances, depending on SSE-level.
43526 // TODO: Investigate float/double extraction if it will be just stored.
43527 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
43528 unsigned Idx) {
43529 EVT VecSVT = VecVT.getScalarType();
43530 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
43531 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
43532 VecSVT == MVT::i64)) {
43533 unsigned EltSizeInBits = VecSVT.getSizeInBits();
43534 unsigned NumEltsPerLane = 128 / EltSizeInBits;
43535 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
43536 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
43537 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
43538 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
43539 Idx &= (NumEltsPerLane - 1);
43541 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
43542 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
43543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
43544 DAG.getBitcast(VecVT, Vec),
43545 DAG.getIntPtrConstant(Idx, dl));
43547 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
43548 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
43549 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
43550 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
43551 DAG.getTargetConstant(Idx, dl, MVT::i8));
43553 return SDValue();
43556 // Resolve the target shuffle inputs and mask.
43557 SmallVector<int, 16> Mask;
43558 SmallVector<SDValue, 2> Ops;
43559 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
43560 return SDValue();
43562 // Shuffle inputs must be the same size as the result.
43563 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
43564 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
43566 return SDValue();
43568 // Attempt to narrow/widen the shuffle mask to the correct size.
43569 if (Mask.size() != NumSrcElts) {
43570 if ((NumSrcElts % Mask.size()) == 0) {
43571 SmallVector<int, 16> ScaledMask;
43572 int Scale = NumSrcElts / Mask.size();
43573 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
43574 Mask = std::move(ScaledMask);
43575 } else if ((Mask.size() % NumSrcElts) == 0) {
43576 // Simplify Mask based on demanded element.
43577 int ExtractIdx = (int)IdxC.getZExtValue();
43578 int Scale = Mask.size() / NumSrcElts;
43579 int Lo = Scale * ExtractIdx;
43580 int Hi = Scale * (ExtractIdx + 1);
43581 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
43582 if (i < Lo || Hi <= i)
43583 Mask[i] = SM_SentinelUndef;
43585 SmallVector<int, 16> WidenedMask;
43586 while (Mask.size() > NumSrcElts &&
43587 canWidenShuffleElements(Mask, WidenedMask))
43588 Mask = std::move(WidenedMask);
43592 // If narrowing/widening failed, see if we can extract+zero-extend.
43593 int ExtractIdx;
43594 EVT ExtractVT;
43595 if (Mask.size() == NumSrcElts) {
43596 ExtractIdx = Mask[IdxC.getZExtValue()];
43597 ExtractVT = SrcVT;
43598 } else {
43599 unsigned Scale = Mask.size() / NumSrcElts;
43600 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
43601 return SDValue();
43602 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
43603 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
43604 return SDValue();
43605 ExtractIdx = Mask[ScaledIdx];
43606 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
43607 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
43608 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
43609 "Failed to widen vector type");
43612 // If the shuffle source element is undef/zero then we can just accept it.
43613 if (ExtractIdx == SM_SentinelUndef)
43614 return DAG.getUNDEF(VT);
43616 if (ExtractIdx == SM_SentinelZero)
43617 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
43618 : DAG.getConstant(0, dl, VT);
43620 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
43621 ExtractIdx = ExtractIdx % Mask.size();
43622 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
43623 return DAG.getZExtOrTrunc(V, dl, VT);
43625 return SDValue();
43628 /// Extracting a scalar FP value from vector element 0 is free, so extract each
43629 /// operand first, then perform the math as a scalar op.
43630 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
43631 const X86Subtarget &Subtarget) {
43632 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
43633 SDValue Vec = ExtElt->getOperand(0);
43634 SDValue Index = ExtElt->getOperand(1);
43635 EVT VT = ExtElt->getValueType(0);
43636 EVT VecVT = Vec.getValueType();
43638 // TODO: If this is a unary/expensive/expand op, allow extraction from a
43639 // non-zero element because the shuffle+scalar op will be cheaper?
43640 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
43641 return SDValue();
43643 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
43644 // extract, the condition code), so deal with those as a special-case.
43645 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
43646 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
43647 if (OpVT != MVT::f32 && OpVT != MVT::f64)
43648 return SDValue();
43650 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
43651 SDLoc DL(ExtElt);
43652 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43653 Vec.getOperand(0), Index);
43654 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43655 Vec.getOperand(1), Index);
43656 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
43659 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
43660 VT != MVT::f64)
43661 return SDValue();
43663 // Vector FP selects don't fit the pattern of FP math ops (because the
43664 // condition has a different type and we have to change the opcode), so deal
43665 // with those here.
43666 // FIXME: This is restricted to pre type legalization by ensuring the setcc
43667 // has i1 elements. If we loosen this we need to convert vector bool to a
43668 // scalar bool.
43669 if (Vec.getOpcode() == ISD::VSELECT &&
43670 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
43671 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
43672 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
43673 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
43674 SDLoc DL(ExtElt);
43675 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
43676 Vec.getOperand(0).getValueType().getScalarType(),
43677 Vec.getOperand(0), Index);
43678 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43679 Vec.getOperand(1), Index);
43680 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43681 Vec.getOperand(2), Index);
43682 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
43685 // TODO: This switch could include FNEG and the x86-specific FP logic ops
43686 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
43687 // missed load folding and fma+fneg combining.
43688 switch (Vec.getOpcode()) {
43689 case ISD::FMA: // Begin 3 operands
43690 case ISD::FMAD:
43691 case ISD::FADD: // Begin 2 operands
43692 case ISD::FSUB:
43693 case ISD::FMUL:
43694 case ISD::FDIV:
43695 case ISD::FREM:
43696 case ISD::FCOPYSIGN:
43697 case ISD::FMINNUM:
43698 case ISD::FMAXNUM:
43699 case ISD::FMINNUM_IEEE:
43700 case ISD::FMAXNUM_IEEE:
43701 case ISD::FMAXIMUM:
43702 case ISD::FMINIMUM:
43703 case X86ISD::FMAX:
43704 case X86ISD::FMIN:
43705 case ISD::FABS: // Begin 1 operand
43706 case ISD::FSQRT:
43707 case ISD::FRINT:
43708 case ISD::FCEIL:
43709 case ISD::FTRUNC:
43710 case ISD::FNEARBYINT:
43711 case ISD::FROUND:
43712 case ISD::FFLOOR:
43713 case X86ISD::FRCP:
43714 case X86ISD::FRSQRT: {
43715 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
43716 SDLoc DL(ExtElt);
43717 SmallVector<SDValue, 4> ExtOps;
43718 for (SDValue Op : Vec->ops())
43719 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
43720 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
43722 default:
43723 return SDValue();
43725 llvm_unreachable("All opcodes should return within switch");
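// e.g. (illustrative) for the generic scalarization above:
//   extractelt (fadd v4f32 X, Y), 0
//     --> fadd f32 (extractelt X, 0), (extractelt Y, 0)
// so the scalar FADD can use the values already sitting in element 0 of the
// vector registers without a separate vector add.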
43728 /// Try to convert a vector reduction sequence composed of binops and shuffles
43729 /// into horizontal ops.
43730 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
43731 const X86Subtarget &Subtarget) {
43732 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
43734 // We need at least SSE2 to do anything here.
43735 if (!Subtarget.hasSSE2())
43736 return SDValue();
43738 ISD::NodeType Opc;
43739 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
43740 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
43741 if (!Rdx)
43742 return SDValue();
43744 SDValue Index = ExtElt->getOperand(1);
43745 assert(isNullConstant(Index) &&
43746 "Reduction doesn't end in an extract from index 0");
43748 EVT VT = ExtElt->getValueType(0);
43749 EVT VecVT = Rdx.getValueType();
43750 if (VecVT.getScalarType() != VT)
43751 return SDValue();
43753 SDLoc DL(ExtElt);
43754 unsigned NumElts = VecVT.getVectorNumElements();
43755 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
43757 // Widen a v4i8/v8i8 vector to v16i8, with undef (or zero) upper elements.
43758 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
43759 if (V.getValueType() == MVT::v4i8) {
43760 if (ZeroExtend && Subtarget.hasSSE41()) {
43761 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
43762 DAG.getConstant(0, DL, MVT::v4i32),
43763 DAG.getBitcast(MVT::i32, V),
43764 DAG.getIntPtrConstant(0, DL));
43765 return DAG.getBitcast(MVT::v16i8, V);
43767 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
43768 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
43769 : DAG.getUNDEF(MVT::v4i8));
43771 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
43772 DAG.getUNDEF(MVT::v8i8));
43775 // vXi8 mul reduction - promote to vXi16 mul reduction.
43776 if (Opc == ISD::MUL) {
43777 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
43778 return SDValue();
43779 if (VecVT.getSizeInBits() >= 128) {
43780 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
43781 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43782 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43783 Lo = DAG.getBitcast(WideVT, Lo);
43784 Hi = DAG.getBitcast(WideVT, Hi);
43785 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
43786 while (Rdx.getValueSizeInBits() > 128) {
43787 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43788 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
43790 } else {
43791 Rdx = WidenToV16I8(Rdx, false);
43792 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
43793 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
43795 if (NumElts >= 8)
43796 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43797 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43798 {4, 5, 6, 7, -1, -1, -1, -1}));
43799 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43800 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43801 {2, 3, -1, -1, -1, -1, -1, -1}));
43802 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43803 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43804 {1, -1, -1, -1, -1, -1, -1, -1}));
43805 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43809 // vXi8 add reduction - sub 128-bit vector.
43810 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
43811 Rdx = WidenToV16I8(Rdx, true);
43812 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43813 DAG.getConstant(0, DL, MVT::v16i8));
43814 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43815 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43818 // Must be a >=128-bit vector with pow2 elements.
43819 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
43820 return SDValue();
43822 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
43823 if (VT == MVT::i8) {
43824 while (Rdx.getValueSizeInBits() > 128) {
43825 SDValue Lo, Hi;
43826 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43827 VecVT = Lo.getValueType();
43828 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43830 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
43832 SDValue Hi = DAG.getVectorShuffle(
43833 MVT::v16i8, DL, Rdx, Rdx,
43834 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
43835 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
43836 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43837 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
43838 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43839 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
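// Illustrative note (not from the original source): PSADBW against an
// all-zero vector computes sum(|x[i] - 0|) per 64-bit lane, i.e. the plain
// sum of the unsigned bytes, so after the single hi/lo shuffle+add above a
// v16i8 add reduction collapses into one PSADBW.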
43842 // See if we can use vXi8 PSADBW add reduction for larger zext types.
43843 // If the source vector values are 0-255, then we can use PSADBW to
43844 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
43845 // TODO: See if it's worth avoiding vXi16/i32 truncations?
43846 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
43847 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
43848 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
43849 Subtarget.hasAVX512())) {
43850 if (Rdx.getValueType() == MVT::v8i16) {
43851 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
43852 DAG.getUNDEF(MVT::v8i16));
43853 } else {
43854 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
43855 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
43856 if (ByteVT.getSizeInBits() < 128)
43857 Rdx = WidenToV16I8(Rdx, true);
43860 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43861 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43862 ArrayRef<SDValue> Ops) {
43863 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43864 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
43865 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
43866 };
43867 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
43868 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
43870 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
43871 while (Rdx.getValueSizeInBits() > 128) {
43872 SDValue Lo, Hi;
43873 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43874 VecVT = Lo.getValueType();
43875 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43877 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
43879 if (NumElts > 8) {
43880 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
43881 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
43884 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
43885 Rdx = DAG.getBitcast(VecVT, Rdx);
43886 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43889 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
43890 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
43891 return SDValue();
43893 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
43895 // 256-bit horizontal instructions operate on 128-bit chunks rather than
43896 // across the whole vector, so we need an extract + hop preliminary stage.
43897 // This is the only step where the operands of the hop are not the same value.
43898 // TODO: We could extend this to handle 512-bit or even longer vectors.
43899 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
43900 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
43901 unsigned NumElts = VecVT.getVectorNumElements();
43902 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
43903 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
43904 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
43905 VecVT = Rdx.getValueType();
43907 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
43908 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
43909 return SDValue();
43911 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
43912 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
43913 for (unsigned i = 0; i != ReductionSteps; ++i)
43914 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
43916 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
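// Worked example for the horizontal-add reduction above (illustrative): a
// v4i32 add reduction takes ReductionSteps = log2(4) = 2 PHADDD steps:
//   step 1: [a0+a1, a2+a3, a0+a1, a2+a3]
//   step 2: [a0+a1+a2+a3, ...]
// and the scalar result is then read from element 0.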
43919 /// Detect vector gather/scatter index generation and convert it from being a
43920 /// bunch of shuffles and extracts into a somewhat faster sequence.
43921 /// For i686, the best sequence is apparently storing the value and loading
43922 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
43923 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
43924 TargetLowering::DAGCombinerInfo &DCI,
43925 const X86Subtarget &Subtarget) {
43926 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
43927 return NewOp;
43929 SDValue InputVector = N->getOperand(0);
43930 SDValue EltIdx = N->getOperand(1);
43931 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
43933 EVT SrcVT = InputVector.getValueType();
43934 EVT VT = N->getValueType(0);
43935 SDLoc dl(InputVector);
43936 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
43937 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43938 unsigned NumEltBits = VT.getScalarSizeInBits();
43939 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43941 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43942 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43944 // Integer Constant Folding.
43945 if (CIdx && VT.isInteger()) {
43946 APInt UndefVecElts;
43947 SmallVector<APInt, 16> EltBits;
43948 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43949 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43950 EltBits, true, false)) {
43951 uint64_t Idx = CIdx->getZExtValue();
43952 if (UndefVecElts[Idx])
43953 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43954 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
43957 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
43958 // Improves lowering of bool masks on Rust, which splits them into a byte array.
43959 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
43960 SDValue Src = peekThroughBitcasts(InputVector);
43961 if (Src.getValueType().getScalarType() == MVT::i1 &&
43962 TLI.isTypeLegal(Src.getValueType())) {
43963 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
43964 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
43965 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
43966 return DAG.getBitcast(VT, Sub);
43971 if (IsPextr) {
43972 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
43973 DCI))
43974 return SDValue(N, 0);
43976 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43977 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43978 InputVector.getOpcode() == X86ISD::PINSRW) &&
43979 InputVector.getOperand(2) == EltIdx) {
43980 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43981 "Vector type mismatch");
43982 SDValue Scl = InputVector.getOperand(1);
43983 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43984 return DAG.getZExtOrTrunc(Scl, dl, VT);
43987 // TODO - Remove this once we can handle the implicit zero-extension of
43988 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43989 // combineBasicSADPattern.
43990 return SDValue();
43993 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
43994 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
43995 InputVector.getOpcode() == ISD::BITCAST &&
43996 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
43997 isNullConstant(EltIdx) && InputVector.hasOneUse())
43998 return DAG.getBitcast(VT, InputVector);
44000 // Detect mmx to i32 conversion through a v2i32 elt extract.
44001 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44002 InputVector.getOpcode() == ISD::BITCAST &&
44003 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44004 isNullConstant(EltIdx) && InputVector.hasOneUse())
44005 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44006 InputVector.getOperand(0));
44008 // Check whether this extract is the root of a sum of absolute differences
44009 // pattern. This has to be done here because we really want it to happen
44010 // pre-legalization.
44011 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44012 return SAD;
44014 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44015 return VPDPBUSD;
44017 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44018 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44019 return Cmp;
44021 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44022 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44023 return MinMax;
44025 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
44026 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44027 return V;
44029 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44030 return V;
44032 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44033 // and then testing the relevant element.
44035 // Note that we only combine extracts on the *same* result number, i.e.
44036 // t0 = merge_values a0, a1, a2, a3
44037 // i1 = extract_vector_elt t0, Constant:i64<2>
44038 // i1 = extract_vector_elt t0, Constant:i64<3>
44039 // but not
44040 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44041 // since the latter would need its own MOVMSK.
44042 if (SrcVT.getScalarType() == MVT::i1) {
44043 bool IsVar = !CIdx;
44044 SmallVector<SDNode *, 16> BoolExtracts;
44045 unsigned ResNo = InputVector.getResNo();
44046 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44047 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44048 Use->getOperand(0).getResNo() == ResNo &&
44049 Use->getValueType(0) == MVT::i1) {
44050 BoolExtracts.push_back(Use);
44051 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44052 return true;
44054 return false;
44056 // TODO: Can we drop the oneuse check for constant extracts?
44057 if (all_of(InputVector->uses(), IsBoolExtract) &&
44058 (IsVar || BoolExtracts.size() > 1)) {
44059 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44060 if (SDValue BC =
44061 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44062 for (SDNode *Use : BoolExtracts) {
44063 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44064 // Mask = 1 << MaskIdx
44065 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44066 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44067 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44068 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44069 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44070 DCI.CombineTo(Use, Res);
44072 return SDValue(N, 0);
44077 // If this extract is from a loaded vector value and will be used as an
44078 // integer, that requires a potentially expensive XMM -> GPR transfer.
44079 // Additionally, if we can convert to a scalar integer load, that will likely
44080 // be folded into a subsequent integer op.
44081 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
44082 // to a single-use of the loaded vector. For the reasons above, we
44083 // expect this to be profitable even if it creates an extra load.
44084 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44085 return Use->getOpcode() == ISD::STORE ||
44086 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44087 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44089 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
44090 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44091 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
44092 !LikelyUsedAsVector && LoadVec->isSimple()) {
44093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44094 SDValue NewPtr =
44095 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
44096 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
44097 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44098 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44099 SDValue Load =
44100 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44101 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44102 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44103 return Load;
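// Example (illustrative, hypothetical IR): extractelt (load <4 x i32>, %p), 2
// becomes a plain i32 load from %p + 8 with a correspondingly reduced
// alignment, avoiding the XMM -> GPR transfer when the element only feeds
// integer code.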
44106 return SDValue();
44109 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44110 // This is more or less the reverse of combineBitcastvxi1.
44111 static SDValue combineToExtendBoolVectorInReg(
44112 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44113 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44114 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44115 Opcode != ISD::ANY_EXTEND)
44116 return SDValue();
44117 if (!DCI.isBeforeLegalizeOps())
44118 return SDValue();
44119 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44120 return SDValue();
44122 EVT SVT = VT.getScalarType();
44123 EVT InSVT = N0.getValueType().getScalarType();
44124 unsigned EltSizeInBits = SVT.getSizeInBits();
44126 // Input type must be extending a bool vector (bit-casted from a scalar
44127 // integer) to legal integer types.
44128 if (!VT.isVector())
44129 return SDValue();
44130 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44131 return SDValue();
44132 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44133 return SDValue();
44135 SDValue N00 = N0.getOperand(0);
44136 EVT SclVT = N00.getValueType();
44137 if (!SclVT.isScalarInteger())
44138 return SDValue();
44140 SDValue Vec;
44141 SmallVector<int> ShuffleMask;
44142 unsigned NumElts = VT.getVectorNumElements();
44143 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44145 // Broadcast the scalar integer to the vector elements.
44146 if (NumElts > EltSizeInBits) {
44147 // If the scalar integer is greater than the vector element size, then we
44148 // must split it down into sub-sections for broadcasting. For example:
44149 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44150 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44151 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44152 unsigned Scale = NumElts / EltSizeInBits;
44153 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44154 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44155 Vec = DAG.getBitcast(VT, Vec);
44157 for (unsigned i = 0; i != Scale; ++i)
44158 ShuffleMask.append(EltSizeInBits, i);
44159 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44160 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44161 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44162 // If we have register broadcast instructions, use the scalar size as the
44163 // element type for the shuffle. Then cast to the wider element type. The
44164 // widened bits won't be used, and this might allow the use of a broadcast
44165 // load.
44166 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44167 unsigned Scale = EltSizeInBits / NumElts;
44168 EVT BroadcastVT =
44169 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44170 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44171 ShuffleMask.append(NumElts * Scale, 0);
44172 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44173 Vec = DAG.getBitcast(VT, Vec);
44174 } else {
44175 // For a smaller scalar integer, we can simply any-extend it to the vector
44176 // element size (we don't care about the upper bits) and broadcast it to all
44177 // elements.
44178 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44179 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44180 ShuffleMask.append(NumElts, 0);
44181 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44184 // Now, mask the relevant bit in each element.
44185 SmallVector<SDValue, 32> Bits;
44186 for (unsigned i = 0; i != NumElts; ++i) {
44187 int BitIdx = (i % EltSizeInBits);
44188 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44189 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44191 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44192 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44194 // Compare against the bitmask and extend the result.
44195 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44196 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44197 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44199 // For SEXT, this is now done, otherwise shift the result down for
44200 // zero-extension.
44201 if (Opcode == ISD::SIGN_EXTEND)
44202 return Vec;
44203 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44204 DAG.getConstant(EltSizeInBits - 1, DL, VT));
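// Worked example for the bool-vector extend above (illustrative, hypothetical
// types): zext (v8i1 bitcast (i8 %b)) to v8i16 broadcasts %b to every lane,
// ANDs lane i with (1 << i), compares equal against that same bit mask to get
// 0/-1 per lane, and for ZERO_EXTEND shifts right by 15 so each lane ends up
// as 0/1.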
44207 /// If a vector select has an operand that is -1 or 0, try to simplify the
44208 /// select to a bitwise logic operation.
44209 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
44210 static SDValue
44211 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
44212 TargetLowering::DAGCombinerInfo &DCI,
44213 const X86Subtarget &Subtarget) {
44214 SDValue Cond = N->getOperand(0);
44215 SDValue LHS = N->getOperand(1);
44216 SDValue RHS = N->getOperand(2);
44217 EVT VT = LHS.getValueType();
44218 EVT CondVT = Cond.getValueType();
44219 SDLoc DL(N);
44220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44222 if (N->getOpcode() != ISD::VSELECT)
44223 return SDValue();
44225 assert(CondVT.isVector() && "Vector select expects a vector selector!");
44227 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
44228 // TODO: Can we assert that both operands are not zeros (because that should
44229 // get simplified at node creation time)?
44230 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
44231 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
44233 // If both inputs are 0/undef, create a complete zero vector.
44234 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
44235 if (TValIsAllZeros && FValIsAllZeros) {
44236 if (VT.isFloatingPoint())
44237 return DAG.getConstantFP(0.0, DL, VT);
44238 return DAG.getConstant(0, DL, VT);
44241 // To use the condition operand as a bitwise mask, it must have elements that
44242 // are the same size as the select elements. I.e., the condition operand must
44243 // have already been promoted from the IR select condition type <N x i1>.
44244 // Don't check if the types themselves are equal because that excludes
44245 // vector floating-point selects.
44246 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
44247 return SDValue();
44249 // Try to invert the condition if true value is not all 1s and false value is
44250 // not all 0s. Only do this if the condition has one use.
44251 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
44252 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
44253 // Check if the selector will be produced by CMPP*/PCMP*.
44254 Cond.getOpcode() == ISD::SETCC &&
44255 // Check if SETCC has already been promoted.
44256 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
44257 CondVT) {
44258 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
44260 if (TValIsAllZeros || FValIsAllOnes) {
44261 SDValue CC = Cond.getOperand(2);
44262 ISD::CondCode NewCC = ISD::getSetCCInverse(
44263 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
44264 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
44265 NewCC);
44266 std::swap(LHS, RHS);
44267 TValIsAllOnes = FValIsAllOnes;
44268 FValIsAllZeros = TValIsAllZeros;
44272 // Cond value must be 'sign splat' to be converted to a logical op.
44273 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
44274 return SDValue();
44276 // vselect Cond, 111..., 000... -> Cond
44277 if (TValIsAllOnes && FValIsAllZeros)
44278 return DAG.getBitcast(VT, Cond);
44280 if (!TLI.isTypeLegal(CondVT))
44281 return SDValue();
44283 // vselect Cond, 111..., X -> or Cond, X
44284 if (TValIsAllOnes) {
44285 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44286 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
44287 return DAG.getBitcast(VT, Or);
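// e.g. (illustrative) vselect (setcc v4i32 X, Y, eq), all-ones, Z
//        --> or (pcmpeqd X, Y), Z
// since the 0/-1 compare result already acts as a full bitwise mask.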
44290 // vselect Cond, X, 000... -> and Cond, X
44291 if (FValIsAllZeros) {
44292 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
44293 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
44294 return DAG.getBitcast(VT, And);
44297 // vselect Cond, 000..., X -> andn Cond, X
44298 if (TValIsAllZeros) {
44299 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44300 SDValue AndN;
44301 // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
44302 if (CondVT.getScalarType() == MVT::i1)
44303 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
44304 CastRHS);
44305 else
44306 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
44307 return DAG.getBitcast(VT, AndN);
44310 return SDValue();
44313 /// If both arms of a vector select are concatenated vectors, split the select,
44314 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
44315 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
44316 /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
44317 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
44318 const X86Subtarget &Subtarget) {
44319 unsigned Opcode = N->getOpcode();
44320 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
44321 return SDValue();
44323 // TODO: Split 512-bit vectors too?
44324 EVT VT = N->getValueType(0);
44325 if (!VT.is256BitVector())
44326 return SDValue();
44328 // TODO: Split as long as any 2 of the 3 operands are concatenated?
44329 SDValue Cond = N->getOperand(0);
44330 SDValue TVal = N->getOperand(1);
44331 SDValue FVal = N->getOperand(2);
44332 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
44333 !isFreeToSplitVector(TVal.getNode(), DAG) ||
44334 !isFreeToSplitVector(FVal.getNode(), DAG))
44335 return SDValue();
44337 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
44338 ArrayRef<SDValue> Ops) {
44339 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
44341 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
44342 makeBlend, /*CheckBWI*/ false);
44345 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
44346 SDValue Cond = N->getOperand(0);
44347 SDValue LHS = N->getOperand(1);
44348 SDValue RHS = N->getOperand(2);
44349 SDLoc DL(N);
44351 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
44352 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
44353 if (!TrueC || !FalseC)
44354 return SDValue();
44356 // Don't do this for crazy integer types.
44357 EVT VT = N->getValueType(0);
44358 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
44359 return SDValue();
44361 // We're going to use the condition bit in math or logic ops. We could allow
44362 // this with a wider condition value (post-legalization it becomes an i8),
44363 // but if nothing is creating selects that late, it doesn't matter.
44364 if (Cond.getValueType() != MVT::i1)
44365 return SDValue();
44367 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
44368 // 3, 5, or 9 with i32/i64, so those get transformed too.
44369 // TODO: For constants that overflow or do not differ by power-of-2 or small
44370 // multiplier, convert to 'and' + 'add'.
44371 const APInt &TrueVal = TrueC->getAPIntValue();
44372 const APInt &FalseVal = FalseC->getAPIntValue();
44374 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
44375 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
44376 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
44377 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44378 if (CC == ISD::SETEQ || CC == ISD::SETNE)
44379 return SDValue();
44382 bool OV;
44383 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
44384 if (OV)
44385 return SDValue();
44387 APInt AbsDiff = Diff.abs();
44388 if (AbsDiff.isPowerOf2() ||
44389 ((VT == MVT::i32 || VT == MVT::i64) &&
44390 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
44392 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
44393 // of the condition can usually be folded into a compare predicate, but even
44394 // without that, the sequence should be cheaper than a CMOV alternative.
44395 if (TrueVal.slt(FalseVal)) {
44396 Cond = DAG.getNOT(DL, Cond, MVT::i1);
44397 std::swap(TrueC, FalseC);
44400 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
44401 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
44403 // Multiply condition by the difference if non-one.
44404 if (!AbsDiff.isOne())
44405 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
44407 // Add the base if non-zero.
44408 if (!FalseC->isZero())
44409 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
44411 return R;
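// Worked example for the transform above (illustrative): select i1 %c, 7, 2
// has AbsDiff == 5, so it becomes (zext %c to i32) * 5 + 2, where the
// multiply by 5 and the add can both be folded into a single LEA.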
44414 return SDValue();
44417 /// If this is a *dynamic* select (non-constant condition) and we can match
44418 /// this node with one of the variable blend instructions, restructure the
44419 /// condition so that blends can use the high (sign) bit of each element.
44420 /// This function will also call SimplifyDemandedBits on already created
44421 /// BLENDV to perform additional simplifications.
44422 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
44423 TargetLowering::DAGCombinerInfo &DCI,
44424 const X86Subtarget &Subtarget) {
44425 SDValue Cond = N->getOperand(0);
44426 if ((N->getOpcode() != ISD::VSELECT &&
44427 N->getOpcode() != X86ISD::BLENDV) ||
44428 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
44429 return SDValue();
44431 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44432 unsigned BitWidth = Cond.getScalarValueSizeInBits();
44433 EVT VT = N->getValueType(0);
44435 // We can only handle the cases where VSELECT is directly legal on the
44436 // subtarget. We custom lower VSELECT nodes with constant conditions and
44437 // this makes it hard to see whether a dynamic VSELECT will correctly
44438 // lower, so we both check the operation's status and explicitly handle the
44439 // cases where a *dynamic* blend will fail even though a constant-condition
44440 // blend could be custom lowered.
44441 // FIXME: We should find a better way to handle this class of problems.
44442 // Potentially, we should combine constant-condition vselect nodes
44443 // pre-legalization into shuffles and not mark as many types as custom
44444 // lowered.
44445 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
44446 return SDValue();
44447 // FIXME: We don't support i16-element blends currently. We could and
44448 // should support them by making *all* the bits in the condition be set
44449 // rather than just the high bit and using an i8-element blend.
44450 if (VT.getVectorElementType() == MVT::i16)
44451 return SDValue();
44452 // Dynamic blending was only available from SSE4.1 onward.
44453 if (VT.is128BitVector() && !Subtarget.hasSSE41())
44454 return SDValue();
44455 // Byte blends are only available in AVX2.
44456 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
44457 return SDValue();
44458 // There are no 512-bit blend instructions that use sign bits.
44459 if (VT.is512BitVector())
44460 return SDValue();
44462 // Don't optimize before the condition has been transformed to a legal type
44463 // and don't ever optimize vector selects that map to AVX512 mask-registers.
44464 if (BitWidth < 8 || BitWidth > 64)
44465 return SDValue();
44467 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
44468 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
44469 UI != UE; ++UI)
44470 if ((UI->getOpcode() != ISD::VSELECT &&
44471 UI->getOpcode() != X86ISD::BLENDV) ||
44472 UI.getOperandNo() != 0)
44473 return false;
44475 return true;
44478 APInt DemandedBits(APInt::getSignMask(BitWidth));
44480 if (OnlyUsedAsSelectCond(Cond)) {
44481 KnownBits Known;
44482 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
44483 !DCI.isBeforeLegalizeOps());
44484 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
44485 return SDValue();
44487 // If we changed the computation somewhere in the DAG, this change will
44488 // affect all users of Cond. Update all the nodes so that we do not use
44489 // the generic VSELECT anymore. Otherwise, we may perform wrong
44490 // optimizations as we messed with the actual expectation for the vector
44491 // boolean values.
44492 for (SDNode *U : Cond->uses()) {
44493 if (U->getOpcode() == X86ISD::BLENDV)
44494 continue;
44496 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
44497 Cond, U->getOperand(1), U->getOperand(2));
44498 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
44499 DCI.AddToWorklist(U);
44501 DCI.CommitTargetLoweringOpt(TLO);
44502 return SDValue(N, 0);
44505 // Otherwise we can still at least try to simplify multiple use bits.
44506 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
44507 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
44508 N->getOperand(1), N->getOperand(2));
44510 return SDValue();
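// Illustrative sketch (standalone scalar model with an assumed name, not part
// of the combine above): variable blends such as (P)BLENDV select each result
// element purely from the sign bit of the corresponding condition element,
// which is why the combine only demands APInt::getSignMask(BitWidth) of Cond.
static int blendvElementSketch(int CondElt, int IfSet, int IfClear) {
  return CondElt < 0 ? IfSet : IfClear; // sign bit set -> first value operand
}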
44513 // Try to match:
44514 // (or (and (M, (sub 0, X)), (pandn M, X)))
44515 // which is a special case of:
44516 // (select M, (sub 0, X), X)
44517 // Per:
44518 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
44519 // We know that, if fNegate is 0 or 1:
44520 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
44522 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
44523 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
44524 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
44525 // This lets us transform our vselect to:
44526 // (add (xor X, M), (and M, 1))
44527 // And further to:
44528 // (sub (xor X, M), M)
44529 static SDValue combineLogicBlendIntoConditionalNegate(
44530 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
44531 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
44532 EVT MaskVT = Mask.getValueType();
44533 assert(MaskVT.isInteger() &&
44534 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
44535 "Mask must be zero/all-bits");
44537 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
44538 return SDValue();
44539 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
44540 return SDValue();
44542 auto IsNegV = [](SDNode *N, SDValue V) {
44543 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
44544 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
44547 SDValue V;
44548 if (IsNegV(Y.getNode(), X))
44549 V = X;
44550 else if (IsNegV(X.getNode(), Y))
44551 V = Y;
44552 else
44553 return SDValue();
44555 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
44556 SDValue SubOp2 = Mask;
44558 // If the negate was on the false side of the select, then
44559 // the operands of the SUB need to be swapped. PR 27251.
44560 // This is because the pattern being matched above is
44561 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
44562 // but if the pattern matched was
44563 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
44564 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
44565 // pattern also needs to be a negation of the replacement pattern above.
44566 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
44567 // sub accomplishes the negation of the replacement pattern.
44568 if (V == Y)
44569 std::swap(SubOp1, SubOp2);
44571 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
44572 return DAG.getBitcast(VT, Res);
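// Illustrative sketch (standalone, assumed name): the conditional-negate
// identity used by the combine above, per scalar lane, for a mask M that is
// either 0 or all-ones (-1), assuming two's complement (INT_MIN aside).
static int conditionalNegateSketch(int X, int M) {
  // M == 0  : (X ^ 0) - 0     == X
  // M == -1 : (X ^ -1) + 1    == ~X + 1 == -X
  return (X ^ M) - M; // matches the emitted sub(xor(X, M), M)
}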
44575 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
44576 const X86Subtarget &Subtarget) {
44577 if (!Subtarget.hasAVX512())
44578 return SDValue();
44579 if (N->getOpcode() != ISD::VSELECT)
44580 return SDValue();
44582 SDLoc DL(N);
44583 SDValue Cond = N->getOperand(0);
44584 SDValue LHS = N->getOperand(1);
44585 SDValue RHS = N->getOperand(2);
44587 if (canCombineAsMaskOperation(LHS, Subtarget))
44588 return SDValue();
44590 if (!canCombineAsMaskOperation(RHS, Subtarget))
44591 return SDValue();
44593 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
44594 return SDValue();
44596 // Commute LHS and RHS to create opportunity to select mask instruction.
44597 // (vselect M, L, R) -> (vselect ~M, R, L)
44598 ISD::CondCode NewCC =
44599 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
44600 Cond.getOperand(0).getValueType());
44601 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
44602 Cond.getOperand(1), NewCC);
44603 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
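// Illustrative sketch (standalone, assumed name): the commutation above is
// just the scalar identity select(C, L, R) == select(!C, R, L); inverting the
// SETCC condition code plays the role of !C so the mask-foldable operand ends
// up in the LHS slot.
static int commuteSelectSketch(bool C, int L, int R) {
  return !C ? R : L; // same value as (C ? L : R)
}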
44606 /// Do target-specific dag combines on SELECT and VSELECT nodes.
44607 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
44608 TargetLowering::DAGCombinerInfo &DCI,
44609 const X86Subtarget &Subtarget) {
44610 SDLoc DL(N);
44611 SDValue Cond = N->getOperand(0);
44612 SDValue LHS = N->getOperand(1);
44613 SDValue RHS = N->getOperand(2);
44615 // Try simplification again because we use this function to optimize
44616 // BLENDV nodes that are not handled by the generic combiner.
44617 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
44618 return V;
44620 // When AVX512 is available, the LHS operand of a select instruction can be
44621 // folded into a masked instruction, while the RHS operand can't. Commute the
44622 // LHS and RHS of the select instruction to create the opportunity for
44623 // folding.
44624 if (SDValue V = commuteSelect(N, DAG, Subtarget))
44625 return V;
44627 EVT VT = LHS.getValueType();
44628 EVT CondVT = Cond.getValueType();
44629 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44630 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
44632 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
44633 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
44634 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
44635 if (CondVT.isVector() && CondVT.isInteger() &&
44636 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
44637 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
44638 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
44639 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
44640 DL, DAG, Subtarget))
44641 return V;
44643 // Convert vselects with constant condition into shuffles.
44644 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
44645 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
44646 SmallVector<int, 64> Mask;
44647 if (createShuffleMaskFromVSELECT(Mask, Cond,
44648 N->getOpcode() == X86ISD::BLENDV))
44649 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
44652 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
44653 // by forcing the unselected elements to zero.
44654 // TODO: Can we handle more shuffles with this?
44655 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
44656 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
44657 LHS.hasOneUse() && RHS.hasOneUse()) {
44658 MVT SimpleVT = VT.getSimpleVT();
44659 SmallVector<SDValue, 1> LHSOps, RHSOps;
44660 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
44661 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
44662 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
44663 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
44664 int NumElts = VT.getVectorNumElements();
44665 for (int i = 0; i != NumElts; ++i) {
44666 // getConstVector sets negative shuffle mask values as undef, so ensure
44667 // we hardcode SM_SentinelZero values to zero (0x80).
44668 if (CondMask[i] < NumElts) {
44669 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
44670 RHSMask[i] = 0x80;
44671 } else {
44672 LHSMask[i] = 0x80;
44673 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
44676 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
44677 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
44678 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
44679 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
44680 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
44684 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
44685 // instructions match the semantics of the common C idiom x<y?x:y but not
44686 // x<=y?x:y, because of how they handle negative zero (which can be
44687 // ignored in unsafe-math mode).
44688 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
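  // Illustrative sketch (local model with an assumed name, not used by the
  // transform itself): SSE MINSS computes "a < b ? a : b", returning the second
  // operand on ties and NaNs, so "x <= y ? x : y" only matches it when signed
  // zeros don't matter, e.g. sseMinModel(+0.0, -0.0) == -0.0 while
  // (+0.0 <= -0.0 ? +0.0 : -0.0) == +0.0; the NoSignedZerosFPMath checks below
  // guard exactly that case.
  auto sseMinModel = [](double A, double B) { return A < B ? A : B; };
  (void)sseMinModel; // model only; the real transform emits X86ISD::FMIN/FMAX.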
44689 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
44690 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
44691 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
44692 (Subtarget.hasSSE2() ||
44693 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
44694 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44696 unsigned Opcode = 0;
44697 // Check for x CC y ? x : y.
44698 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
44699 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
44700 switch (CC) {
44701 default: break;
44702 case ISD::SETULT:
44703 // Converting this to a min would handle NaNs incorrectly, and swapping
44704 // the operands would cause it to handle comparisons between positive
44705 // and negative zero incorrectly.
44706 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44707 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44708 !(DAG.isKnownNeverZeroFloat(LHS) ||
44709 DAG.isKnownNeverZeroFloat(RHS)))
44710 break;
44711 std::swap(LHS, RHS);
44713 Opcode = X86ISD::FMIN;
44714 break;
44715 case ISD::SETOLE:
44716 // Converting this to a min would handle comparisons between positive
44717 // and negative zero incorrectly.
44718 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44719 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44720 break;
44721 Opcode = X86ISD::FMIN;
44722 break;
44723 case ISD::SETULE:
44724 // Converting this to a min would handle both negative zeros and NaNs
44725 // incorrectly, but we can swap the operands to fix both.
44726 std::swap(LHS, RHS);
44727 [[fallthrough]];
44728 case ISD::SETOLT:
44729 case ISD::SETLT:
44730 case ISD::SETLE:
44731 Opcode = X86ISD::FMIN;
44732 break;
44734 case ISD::SETOGE:
44735 // Converting this to a max would handle comparisons between positive
44736 // and negative zero incorrectly.
44737 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44738 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44739 break;
44740 Opcode = X86ISD::FMAX;
44741 break;
44742 case ISD::SETUGT:
44743 // Converting this to a max would handle NaNs incorrectly, and swapping
44744 // the operands would cause it to handle comparisons between positive
44745 // and negative zero incorrectly.
44746 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44747 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44748 !(DAG.isKnownNeverZeroFloat(LHS) ||
44749 DAG.isKnownNeverZeroFloat(RHS)))
44750 break;
44751 std::swap(LHS, RHS);
44753 Opcode = X86ISD::FMAX;
44754 break;
44755 case ISD::SETUGE:
44756 // Converting this to a max would handle both negative zeros and NaNs
44757 // incorrectly, but we can swap the operands to fix both.
44758 std::swap(LHS, RHS);
44759 [[fallthrough]];
44760 case ISD::SETOGT:
44761 case ISD::SETGT:
44762 case ISD::SETGE:
44763 Opcode = X86ISD::FMAX;
44764 break;
44766 // Check for x CC y ? y : x -- a min/max with reversed arms.
44767 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
44768 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
44769 switch (CC) {
44770 default: break;
44771 case ISD::SETOGE:
44772 // Converting this to a min would handle comparisons between positive
44773 // and negative zero incorrectly, and swapping the operands would
44774 // cause it to handle NaNs incorrectly.
44775 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44776 !(DAG.isKnownNeverZeroFloat(LHS) ||
44777 DAG.isKnownNeverZeroFloat(RHS))) {
44778 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44779 break;
44780 std::swap(LHS, RHS);
44782 Opcode = X86ISD::FMIN;
44783 break;
44784 case ISD::SETUGT:
44785 // Converting this to a min would handle NaNs incorrectly.
44786 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44787 break;
44788 Opcode = X86ISD::FMIN;
44789 break;
44790 case ISD::SETUGE:
44791 // Converting this to a min would handle both negative zeros and NaNs
44792 // incorrectly, but we can swap the operands to fix both.
44793 std::swap(LHS, RHS);
44794 [[fallthrough]];
44795 case ISD::SETOGT:
44796 case ISD::SETGT:
44797 case ISD::SETGE:
44798 Opcode = X86ISD::FMIN;
44799 break;
44801 case ISD::SETULT:
44802 // Converting this to a max would handle NaNs incorrectly.
44803 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44804 break;
44805 Opcode = X86ISD::FMAX;
44806 break;
44807 case ISD::SETOLE:
44808 // Converting this to a max would handle comparisons between positive
44809 // and negative zero incorrectly, and swapping the operands would
44810 // cause it to handle NaNs incorrectly.
44811 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44812 !DAG.isKnownNeverZeroFloat(LHS) &&
44813 !DAG.isKnownNeverZeroFloat(RHS)) {
44814 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44815 break;
44816 std::swap(LHS, RHS);
44818 Opcode = X86ISD::FMAX;
44819 break;
44820 case ISD::SETULE:
44821 // Converting this to a max would handle both negative zeros and NaNs
44822 // incorrectly, but we can swap the operands to fix both.
44823 std::swap(LHS, RHS);
44824 [[fallthrough]];
44825 case ISD::SETOLT:
44826 case ISD::SETLT:
44827 case ISD::SETLE:
44828 Opcode = X86ISD::FMAX;
44829 break;
44833 if (Opcode)
44834 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
44837 // Some mask scalar intrinsics rely on checking if only one bit is set
44838 // and implement it in C code like this:
44839 // A[0] = (U & 1) ? A[0] : W[0];
44840 // This creates some redundant instructions that break pattern matching.
44841 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
44842 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
44843 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
44844 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44845 SDValue AndNode = Cond.getOperand(0);
44846 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
44847 isNullConstant(Cond.getOperand(1)) &&
44848 isOneConstant(AndNode.getOperand(1))) {
44849 // LHS and RHS swapped due to
44850 // setcc outputting 1 when AND resulted in 0 and vice versa.
44851 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
44852 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
44856 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
44857 // lowering on KNL. In this case we convert it to
44858 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
44859 // The same applies to all vectors of i8 and i16 elements without BWI.
44860 // Make sure we extend these even before type legalization gets a chance to
44861 // split wide vectors.
44862 // Since SKX these selects have a proper lowering.
44863 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
44864 CondVT.getVectorElementType() == MVT::i1 &&
44865 (VT.getVectorElementType() == MVT::i8 ||
44866 VT.getVectorElementType() == MVT::i16)) {
44867 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
44868 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
44871 // AVX512 - Extend select with zero to merge with target shuffle.
44872 // select(mask, extract_subvector(shuffle(x)), zero) -->
44873 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
44874 // TODO - support non target shuffles as well.
44875 if (Subtarget.hasAVX512() && CondVT.isVector() &&
44876 CondVT.getVectorElementType() == MVT::i1) {
44877 auto SelectableOp = [&TLI](SDValue Op) {
44878 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44879 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
44880 isNullConstant(Op.getOperand(1)) &&
44881 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
44882 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
44885 bool SelectableLHS = SelectableOp(LHS);
44886 bool SelectableRHS = SelectableOp(RHS);
44887 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
44888 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
44890 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
44891 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
44892 : RHS.getOperand(0).getValueType();
44893 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
44894 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
44895 VT.getSizeInBits());
44896 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
44897 VT.getSizeInBits());
44898 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
44899 DAG.getUNDEF(SrcCondVT), Cond,
44900 DAG.getIntPtrConstant(0, DL));
44901 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
44902 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
44906 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
44907 return V;
44909 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
44910 Cond.hasOneUse()) {
44911 EVT CondVT = Cond.getValueType();
44912 SDValue Cond0 = Cond.getOperand(0);
44913 SDValue Cond1 = Cond.getOperand(1);
44914 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44916 // Canonicalize min/max:
44917 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
44918 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
44919 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
44920 // the need for an extra compare against zero. e.g.
44921 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
44922 // subl %esi, %edi
44923 // testl %edi, %edi
44924 // movl $0, %eax
44925 // cmovgl %edi, %eax
44926 // =>
44927 // xorl %eax, %eax
44928 // subl %esi, %edi
44929 // cmovsl %eax, %edi
44931 // We can also canonicalize
44932 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
44933 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
44934 // This allows the use of a test instruction for the compare.
44935 if (LHS == Cond0 && RHS == Cond1) {
44936 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
44937 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
44938 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
44939 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44940 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44942 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
44943 ISD::CondCode NewCC = ISD::SETUGE;
44944 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44945 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44949 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
44950 // fold eq + gt/lt nested selects into ge/le selects
44951 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
44952 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
44953 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
44954 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
44955 // .. etc ..
44956 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
44957 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
44958 SDValue InnerSetCC = RHS.getOperand(0);
44959 ISD::CondCode InnerCC =
44960 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
44961 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
44962 Cond0 == InnerSetCC.getOperand(0) &&
44963 Cond1 == InnerSetCC.getOperand(1)) {
44964 ISD::CondCode NewCC;
44965 switch (CC == ISD::SETEQ ? InnerCC : CC) {
44966 case ISD::SETGT: NewCC = ISD::SETGE; break;
44967 case ISD::SETLT: NewCC = ISD::SETLE; break;
44968 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
44969 case ISD::SETULT: NewCC = ISD::SETULE; break;
44970 default: NewCC = ISD::SETCC_INVALID; break;
44972 if (NewCC != ISD::SETCC_INVALID) {
44973 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
44974 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
44980 // Check if the first operand is all zeros and Cond type is vXi1.
44981 // If this is an AVX512 target, we can improve the use of zero masking by
44982 // swapping the operands and inverting the condition.
44983 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
44984 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
44985 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
44986 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
44987 // Invert the cond to not(cond) : xor(op,allones)=not(op)
44988 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
44989 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
44990 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
44993 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
44994 // get split by legalization.
44995 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
44996 CondVT.getVectorElementType() == MVT::i1 &&
44997 TLI.isTypeLegal(VT.getScalarType())) {
44998 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
44999 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45000 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45001 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45002 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45006 // Early exit check
45007 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45008 return SDValue();
45010 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45011 return V;
45013 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45014 return V;
45016 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45017 return V;
45019 // select(~Cond, X, Y) -> select(Cond, Y, X)
45020 if (CondVT.getScalarType() != MVT::i1) {
45021 if (SDValue CondNot = IsNOT(Cond, DAG))
45022 return DAG.getNode(N->getOpcode(), DL, VT,
45023 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45025 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45026 // signbit.
45027 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45028 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45029 Cond.hasOneUse()) {
45030 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45031 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45032 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45036 // Try to optimize vXi1 selects if both operands are either all constants or
45037 // bitcasts from scalar integer type. In that case we can convert the operands
45038 // to integer and use an integer select which will be converted to a CMOV.
45039 // We need to take a little bit of care to avoid creating an i64 type after
45040 // type legalization.
45041 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45042 VT.getVectorElementType() == MVT::i1 &&
45043 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45044 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45045 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45046 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45047 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45049 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45050 LHS.getOperand(0).getValueType() == IntVT)) &&
45051 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45052 RHS.getOperand(0).getValueType() == IntVT))) {
45053 if (LHSIsConst)
45054 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45055 else
45056 LHS = LHS.getOperand(0);
45058 if (RHSIsConst)
45059 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45060 else
45061 RHS = RHS.getOperand(0);
45063 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45064 return DAG.getBitcast(VT, Select);
45069 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45070 // single bits, then invert the predicate and swap the select operands.
45071 // This can be lowered using a vector shift bit-hack rather than a mask and compare.
45072 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45073 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45074 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45075 Cond.getOperand(0).getOpcode() == ISD::AND &&
45076 isNullOrNullSplat(Cond.getOperand(1)) &&
45077 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45078 Cond.getOperand(0).getValueType() == VT) {
45079 // The 'and' mask must be composed of power-of-2 constants.
45080 SDValue And = Cond.getOperand(0);
45081 auto *C = isConstOrConstSplat(And.getOperand(1));
45082 if (C && C->getAPIntValue().isPowerOf2()) {
45083 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45084 SDValue NotCond =
45085 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45086 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45089 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45090 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45091 // 16-bit lacks a proper blendv.
45092 unsigned EltBitWidth = VT.getScalarSizeInBits();
45093 bool CanShiftBlend =
45094 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45095 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45096 (Subtarget.hasXOP()));
45097 if (CanShiftBlend &&
45098 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45099 return C->getAPIntValue().isPowerOf2();
45100 })) {
45101 // Create a left-shift constant to get the mask bits over to the sign-bit.
45102 SDValue Mask = And.getOperand(1);
45103 SmallVector<int, 32> ShlVals;
45104 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45105 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45106 ShlVals.push_back(EltBitWidth - 1 -
45107 MaskVal->getAPIntValue().exactLogBase2());
45109 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45110 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45111 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45112 SDValue NewCond =
45113 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45114 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45118 return SDValue();
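// Illustrative sketch (standalone, assumed name): the vector shift bit-hack
// used at the end of combineSelect above, shown per scalar lane. With C a
// single power-of-2 bit, "(X & C) == 0 ? L : R" can be rewritten by shifting
// that bit into the sign position and selecting on "< 0" (32-bit lanes and
// two's complement assumed).
static int maskBitSelectSketch(unsigned X, unsigned C, int L, int R) {
  unsigned Log2C = 0;
  while ((C >> Log2C) != 1u) // C is assumed to be a power of two.
    ++Log2C;
  int Shifted = (int)(X << (31u - Log2C)); // mask bit becomes the sign bit
  return Shifted < 0 ? R : L;              // bit set -> original "!= 0" arm
}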
45121 /// Combine:
45122 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45123 /// to:
45124 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45125 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45126 /// Note that this is only legal for some op/cc combinations.
45127 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
45128 SelectionDAG &DAG,
45129 const X86Subtarget &Subtarget) {
45130 // This combine only operates on CMP-like nodes.
45131 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45132 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45133 return SDValue();
45135 // Can't replace the cmp if it has more uses than the one we're looking at.
45136 // FIXME: We would like to be able to handle this, but would need to make sure
45137 // all uses were updated.
45138 if (!Cmp.hasOneUse())
45139 return SDValue();
45141 // This only applies to variations of the common case:
45142 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
45143 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
45144 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
45145 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
45146 // Using the proper condcodes (see below), overflow is checked for.
45148 // FIXME: We can generalize both constraints:
45149 // - XOR/OR/AND (if they were made to survive AtomicExpand)
45150 // - LHS != 1
45151 // if the result is compared.
45153 SDValue CmpLHS = Cmp.getOperand(0);
45154 SDValue CmpRHS = Cmp.getOperand(1);
45155 EVT CmpVT = CmpLHS.getValueType();
45157 if (!CmpLHS.hasOneUse())
45158 return SDValue();
45160 unsigned Opc = CmpLHS.getOpcode();
45161 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
45162 return SDValue();
45164 SDValue OpRHS = CmpLHS.getOperand(2);
45165 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
45166 if (!OpRHSC)
45167 return SDValue();
45169 APInt Addend = OpRHSC->getAPIntValue();
45170 if (Opc == ISD::ATOMIC_LOAD_SUB)
45171 Addend = -Addend;
45173 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
45174 if (!CmpRHSC)
45175 return SDValue();
45177 APInt Comparison = CmpRHSC->getAPIntValue();
45178 APInt NegAddend = -Addend;
45180 // See if we can adjust the CC to make the comparison match the negated
45181 // addend.
45182 if (Comparison != NegAddend) {
45183 APInt IncComparison = Comparison + 1;
45184 if (IncComparison == NegAddend) {
45185 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
45186 Comparison = IncComparison;
45187 CC = X86::COND_AE;
45188 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
45189 Comparison = IncComparison;
45190 CC = X86::COND_L;
45193 APInt DecComparison = Comparison - 1;
45194 if (DecComparison == NegAddend) {
45195 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
45196 Comparison = DecComparison;
45197 CC = X86::COND_A;
45198 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
45199 Comparison = DecComparison;
45200 CC = X86::COND_LE;
45205 // If the addend is the negation of the comparison value, then we can do
45206 // a full comparison by emitting the atomic arithmetic as a locked sub.
45207 if (Comparison == NegAddend) {
45208 // The CC is fine, but we need to rewrite the LHS of the comparison as an
45209 // atomic sub.
45210 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
45211 auto AtomicSub = DAG.getAtomic(
45212 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
45213 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
45214 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
45215 AN->getMemOperand());
45216 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
45217 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45218 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45219 return LockOp;
45222 // We can handle comparisons with zero in a number of cases by manipulating
45223 // the CC used.
45224 if (!Comparison.isZero())
45225 return SDValue();
45227 if (CC == X86::COND_S && Addend == 1)
45228 CC = X86::COND_LE;
45229 else if (CC == X86::COND_NS && Addend == 1)
45230 CC = X86::COND_G;
45231 else if (CC == X86::COND_G && Addend == -1)
45232 CC = X86::COND_GE;
45233 else if (CC == X86::COND_LE && Addend == -1)
45234 CC = X86::COND_L;
45235 else
45236 return SDValue();
45238 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
45239 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45240 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45241 return LockOp;
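// Illustrative sketch (standalone, assumed name): the Addend == 1,
// compare-with-zero equivalence reused above. For any X where X + 1 does not
// overflow (the real code relies on the x86 flag definitions to cover the
// boundary), "x < 0" holds exactly when "x + 1 <= 0" holds, so COND_S on the
// CMP can become COND_LE on the LOCKed add itself.
static bool atomicAddFlagSketch(long long X) {
  return (X < 0) == (X + 1 <= 0);
}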
45244 // Check whether a boolean test is testing a boolean value generated by
45245 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
45246 // code.
45248 // Simplify the following patterns:
45249 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
45250 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
45251 // to (Op EFLAGS Cond)
45253 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
45254 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
45255 // to (Op EFLAGS !Cond)
45257 // where Op could be BRCOND or CMOV.
45259 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
45260 // This combine only operates on CMP-like nodes.
45261 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45262 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45263 return SDValue();
45265 // Quit if not used as a boolean value.
45266 if (CC != X86::COND_E && CC != X86::COND_NE)
45267 return SDValue();
45269 // Check CMP operands. One of them should be 0 or 1 and the other should be
45270 // a SetCC or a value extended from it.
45271 SDValue Op1 = Cmp.getOperand(0);
45272 SDValue Op2 = Cmp.getOperand(1);
45274 SDValue SetCC;
45275 const ConstantSDNode* C = nullptr;
45276 bool needOppositeCond = (CC == X86::COND_E);
45277 bool checkAgainstTrue = false; // Is it a comparison against 1?
45279 if ((C = dyn_cast<ConstantSDNode>(Op1)))
45280 SetCC = Op2;
45281 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
45282 SetCC = Op1;
45283 else // Quit if neither operand is a constant.
45284 return SDValue();
45286 if (C->getZExtValue() == 1) {
45287 needOppositeCond = !needOppositeCond;
45288 checkAgainstTrue = true;
45289 } else if (C->getZExtValue() != 0)
45290 // Quit if the constant is neither 0 nor 1.
45291 return SDValue();
45293 bool truncatedToBoolWithAnd = false;
45294 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
45295 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
45296 SetCC.getOpcode() == ISD::TRUNCATE ||
45297 SetCC.getOpcode() == ISD::AND) {
45298 if (SetCC.getOpcode() == ISD::AND) {
45299 int OpIdx = -1;
45300 if (isOneConstant(SetCC.getOperand(0)))
45301 OpIdx = 1;
45302 if (isOneConstant(SetCC.getOperand(1)))
45303 OpIdx = 0;
45304 if (OpIdx < 0)
45305 break;
45306 SetCC = SetCC.getOperand(OpIdx);
45307 truncatedToBoolWithAnd = true;
45308 } else
45309 SetCC = SetCC.getOperand(0);
45312 switch (SetCC.getOpcode()) {
45313 case X86ISD::SETCC_CARRY:
45314 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
45315 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
45316 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
45317 // truncated to i1 using 'and'.
45318 if (checkAgainstTrue && !truncatedToBoolWithAnd)
45319 break;
45320 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
45321 "Invalid use of SETCC_CARRY!");
45322 [[fallthrough]];
45323 case X86ISD::SETCC:
45324 // Set the condition code or opposite one if necessary.
45325 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
45326 if (needOppositeCond)
45327 CC = X86::GetOppositeBranchCondition(CC);
45328 return SetCC.getOperand(1);
45329 case X86ISD::CMOV: {
45330 // Check whether the false/true values are canonical, i.e. 0 or 1.
45331 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
45332 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
45333 // Quit if true value is not a constant.
45334 if (!TVal)
45335 return SDValue();
45336 // Quit if false value is not a constant.
45337 if (!FVal) {
45338 SDValue Op = SetCC.getOperand(0);
45339 // Skip 'zext' or 'trunc' node.
45340 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
45341 Op.getOpcode() == ISD::TRUNCATE)
45342 Op = Op.getOperand(0);
45343 // A special case for rdrand/rdseed, where 0 is set if false cond is
45344 // found.
45345 if ((Op.getOpcode() != X86ISD::RDRAND &&
45346 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
45347 return SDValue();
45349 // Quit if false value is not the constant 0 or 1.
45350 bool FValIsFalse = true;
45351 if (FVal && FVal->getZExtValue() != 0) {
45352 if (FVal->getZExtValue() != 1)
45353 return SDValue();
45354 // If FVal is 1, opposite cond is needed.
45355 needOppositeCond = !needOppositeCond;
45356 FValIsFalse = false;
45358 // Quit if TVal is not the constant opposite of FVal.
45359 if (FValIsFalse && TVal->getZExtValue() != 1)
45360 return SDValue();
45361 if (!FValIsFalse && TVal->getZExtValue() != 0)
45362 return SDValue();
45363 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
45364 if (needOppositeCond)
45365 CC = X86::GetOppositeBranchCondition(CC);
45366 return SetCC.getOperand(3);
45370 return SDValue();
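// Illustrative sketch (standalone, assumed name): the 0/1 boolean identities
// behind the fold above. For a SETCC result B known to be 0 or 1,
// (CMP B, 1) with COND_E is just B and (CMP B, 0) with COND_E is !B, so the
// outer BRCOND/CMOV/SETCC can consume the original EFLAGS with the same or
// the opposite condition code.
static bool boolTestSketch(bool B) {
  bool CmpOneEq = ((B ? 1 : 0) == 1);  // equivalent to B
  bool CmpZeroEq = ((B ? 1 : 0) == 0); // equivalent to !B
  return CmpOneEq == B && CmpZeroEq == !B;
}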
45373 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
45374 /// Match:
45375 /// (X86or (X86setcc) (X86setcc))
45376 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
45377 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
45378 X86::CondCode &CC1, SDValue &Flags,
45379 bool &isAnd) {
45380 if (Cond->getOpcode() == X86ISD::CMP) {
45381 if (!isNullConstant(Cond->getOperand(1)))
45382 return false;
45384 Cond = Cond->getOperand(0);
45387 isAnd = false;
45389 SDValue SetCC0, SetCC1;
45390 switch (Cond->getOpcode()) {
45391 default: return false;
45392 case ISD::AND:
45393 case X86ISD::AND:
45394 isAnd = true;
45395 [[fallthrough]];
45396 case ISD::OR:
45397 case X86ISD::OR:
45398 SetCC0 = Cond->getOperand(0);
45399 SetCC1 = Cond->getOperand(1);
45400 break;
45403 // Make sure we have SETCC nodes, using the same flags value.
45404 if (SetCC0.getOpcode() != X86ISD::SETCC ||
45405 SetCC1.getOpcode() != X86ISD::SETCC ||
45406 SetCC0->getOperand(1) != SetCC1->getOperand(1))
45407 return false;
45409 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
45410 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
45411 Flags = SetCC0->getOperand(1);
45412 return true;
45415 // When legalizing carry, we create carries via add X, -1.
45416 // If that comes from an actual carry, via setcc, we use the
45417 // carry directly.
45418 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
45419 if (EFLAGS.getOpcode() == X86ISD::ADD) {
45420 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
45421 bool FoundAndLSB = false;
45422 SDValue Carry = EFLAGS.getOperand(0);
45423 while (Carry.getOpcode() == ISD::TRUNCATE ||
45424 Carry.getOpcode() == ISD::ZERO_EXTEND ||
45425 (Carry.getOpcode() == ISD::AND &&
45426 isOneConstant(Carry.getOperand(1)))) {
45427 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
45428 Carry = Carry.getOperand(0);
45430 if (Carry.getOpcode() == X86ISD::SETCC ||
45431 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
45432 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
45433 uint64_t CarryCC = Carry.getConstantOperandVal(0);
45434 SDValue CarryOp1 = Carry.getOperand(1);
45435 if (CarryCC == X86::COND_B)
45436 return CarryOp1;
45437 if (CarryCC == X86::COND_A) {
45438 // Try to convert COND_A into COND_B in an attempt to facilitate
45439 // materializing "setb reg".
45441 // Do not flip "e > c", where "c" is a constant, because the Cmp
45442 // instruction cannot take an immediate as its first operand.
45444 if (CarryOp1.getOpcode() == X86ISD::SUB &&
45445 CarryOp1.getNode()->hasOneUse() &&
45446 CarryOp1.getValueType().isInteger() &&
45447 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
45448 SDValue SubCommute =
45449 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
45450 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
45451 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
45454 // If this is a check of the z flag of an add with 1, switch to the
45455 // C flag.
45456 if (CarryCC == X86::COND_E &&
45457 CarryOp1.getOpcode() == X86ISD::ADD &&
45458 isOneConstant(CarryOp1.getOperand(1)))
45459 return CarryOp1;
45460 } else if (FoundAndLSB) {
45461 SDLoc DL(Carry);
45462 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
45463 if (Carry.getOpcode() == ISD::SRL) {
45464 BitNo = Carry.getOperand(1);
45465 Carry = Carry.getOperand(0);
45467 return getBT(Carry, BitNo, DL, DAG);
45472 return SDValue();
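// Illustrative sketch (standalone, assumed name): why "add X, -1" reproduces
// a carry that originally came from a setcc. For a materialized boolean X in
// {0, 1}, the unsigned addition X + 0xFFFFFFFF wraps (sets CF) exactly when
// X == 1, so the setcc's flag can be consumed directly instead of being
// rebuilt from the add.
static bool carryThroughAddSketch(unsigned X) {
  bool CarryOut = (X + 0xFFFFFFFFu) < X; // unsigned wraparound == carry flag
  return CarryOut == (X == 1u);
}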
45475 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
45476 /// to avoid the inversion.
45477 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
45478 SelectionDAG &DAG,
45479 const X86Subtarget &Subtarget) {
45480 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
45481 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
45482 EFLAGS.getOpcode() != X86ISD::TESTP)
45483 return SDValue();
45485 // PTEST/TESTP sets EFLAGS as:
45486 // TESTZ: ZF = (Op0 & Op1) == 0
45487 // TESTC: CF = (~Op0 & Op1) == 0
45488 // TESTNZC: ZF == 0 && CF == 0
45489 MVT VT = EFLAGS.getSimpleValueType();
45490 SDValue Op0 = EFLAGS.getOperand(0);
45491 SDValue Op1 = EFLAGS.getOperand(1);
45492 MVT OpVT = Op0.getSimpleValueType();
45494 // TEST*(~X,Y) == TEST*(X,Y)
45495 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
45496 X86::CondCode InvCC;
45497 switch (CC) {
45498 case X86::COND_B:
45499 // testc -> testz.
45500 InvCC = X86::COND_E;
45501 break;
45502 case X86::COND_AE:
45503 // !testc -> !testz.
45504 InvCC = X86::COND_NE;
45505 break;
45506 case X86::COND_E:
45507 // testz -> testc.
45508 InvCC = X86::COND_B;
45509 break;
45510 case X86::COND_NE:
45511 // !testz -> !testc.
45512 InvCC = X86::COND_AE;
45513 break;
45514 case X86::COND_A:
45515 case X86::COND_BE:
45516 // testnzc -> testnzc (no change).
45517 InvCC = CC;
45518 break;
45519 default:
45520 InvCC = X86::COND_INVALID;
45521 break;
45524 if (InvCC != X86::COND_INVALID) {
45525 CC = InvCC;
45526 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45527 DAG.getBitcast(OpVT, NotOp0), Op1);
45531 if (CC == X86::COND_B || CC == X86::COND_AE) {
45532 // TESTC(X,~X) == TESTC(X,-1)
45533 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
45534 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
45535 SDLoc DL(EFLAGS);
45536 return DAG.getNode(
45537 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
45538 DAG.getBitcast(OpVT,
45539 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
45544 if (CC == X86::COND_E || CC == X86::COND_NE) {
45545 // TESTZ(X,~Y) == TESTC(Y,X)
45546 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
45547 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45548 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45549 DAG.getBitcast(OpVT, NotOp1), Op0);
45552 if (Op0 == Op1) {
45553 SDValue BC = peekThroughBitcasts(Op0);
45554 EVT BCVT = BC.getValueType();
45556 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
45557 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
45558 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45559 DAG.getBitcast(OpVT, BC.getOperand(0)),
45560 DAG.getBitcast(OpVT, BC.getOperand(1)));
45563 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
45564 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
45565 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45566 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45567 DAG.getBitcast(OpVT, BC.getOperand(0)),
45568 DAG.getBitcast(OpVT, BC.getOperand(1)));
45571 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
45572 // to more efficiently extract the sign bits and compare that.
45573 // TODO: Handle TESTC with comparison inversion.
45574 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
45575 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
45576 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
45577 unsigned EltBits = BCVT.getScalarSizeInBits();
45578 if (DAG.ComputeNumSignBits(BC) == EltBits) {
45579 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
45580 APInt SignMask = APInt::getSignMask(EltBits);
45581 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45582 if (SDValue Res =
45583 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
45584 // For vXi16 cases we need to use pmovmskb and extract every other
45585 // sign bit.
45586 SDLoc DL(EFLAGS);
45587 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
45588 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
45589 MVT FloatVT =
45590 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
45591 Res = DAG.getBitcast(FloatVT, Res);
45592 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
45593 } else if (EltBits == 16) {
45594 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
45595 Res = DAG.getBitcast(MovmskVT, Res);
45596 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
45597 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
45598 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45599 } else {
45600 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
45602 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
45603 DAG.getConstant(0, DL, MVT::i32));
45609 // TESTZ(-1,X) == TESTZ(X,X)
45610 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
45611 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
45613 // TESTZ(X,-1) == TESTZ(X,X)
45614 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
45615 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
45617 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
45618 // TODO: Add COND_NE handling?
45619 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
45620 SDValue Src0 = peekThroughBitcasts(Op0);
45621 SDValue Src1 = peekThroughBitcasts(Op1);
45622 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
45623 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
45624 peekThroughBitcasts(Src0.getOperand(1)), true);
45625 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
45626 peekThroughBitcasts(Src1.getOperand(1)), true);
45627 if (Src0 && Src1) {
45628 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
45629 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45630 DAG.getBitcast(OpVT2, Src0),
45631 DAG.getBitcast(OpVT2, Src1));
45637 return SDValue();
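// Illustrative sketch (standalone scalar model, assumed name): PTEST/TESTP
// compute ZF = ((Op0 & Op1) == 0) and CF = ((~Op0 & Op1) == 0). Substituting
// ~Op0 for Op0 therefore swaps the roles of the two flags, which is why the
// combine above can absorb a NOT of Op0 by exchanging COND_E/COND_NE with
// COND_B/COND_AE.
static bool ptestInvertSketch(unsigned Op0, unsigned Op1) {
  bool ZF = (Op0 & Op1) == 0;            // "testz" condition on (Op0, Op1)
  bool CF = (~Op0 & Op1) == 0;           // "testc" condition on (Op0, Op1)
  bool ZFOfNot = ((~Op0) & Op1) == 0;    // "testz" condition on (~Op0, Op1)
  bool CFOfNot = ((~(~Op0)) & Op1) == 0; // "testc" condition on (~Op0, Op1)
  return ZFOfNot == CF && CFOfNot == ZF;
}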
45640 // Attempt to simplify the MOVMSK input based on the comparison type.
45641 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
45642 SelectionDAG &DAG,
45643 const X86Subtarget &Subtarget) {
45644 // Handle eq/ne against zero (any_of).
45645 // Handle eq/ne against -1 (all_of).
45646 if (!(CC == X86::COND_E || CC == X86::COND_NE))
45647 return SDValue();
45648 if (EFLAGS.getValueType() != MVT::i32)
45649 return SDValue();
45650 unsigned CmpOpcode = EFLAGS.getOpcode();
45651 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
45652 return SDValue();
45653 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
45654 if (!CmpConstant)
45655 return SDValue();
45656 const APInt &CmpVal = CmpConstant->getAPIntValue();
45658 SDValue CmpOp = EFLAGS.getOperand(0);
45659 unsigned CmpBits = CmpOp.getValueSizeInBits();
45660 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
45662 // Peek through any truncate.
45663 if (CmpOp.getOpcode() == ISD::TRUNCATE)
45664 CmpOp = CmpOp.getOperand(0);
45666 // Bail if we don't find a MOVMSK.
45667 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
45668 return SDValue();
45670 SDValue Vec = CmpOp.getOperand(0);
45671 MVT VecVT = Vec.getSimpleValueType();
45672 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
45673 "Unexpected MOVMSK operand");
45674 unsigned NumElts = VecVT.getVectorNumElements();
45675 unsigned NumEltBits = VecVT.getScalarSizeInBits();
45677 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
45678 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
45679 NumElts <= CmpBits && CmpVal.isMask(NumElts);
45680 if (!IsAnyOf && !IsAllOf)
45681 return SDValue();
45683 // TODO: Check more combining cases.
45684 // Here we check the number of uses of the CMP operand to decide whether to
45685 // combine. Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
45686 // combines are exercised by tests, and both fit this one-use constraint.
45687 bool IsOneUse = CmpOp.getNode()->hasOneUse();
45689 // See if we can peek through to a vector with a wider element type, if the
45690 // signbits extend down to all the sub-elements as well.
45691 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
45692 // potential SimplifyDemandedBits/Elts cases.
45693 // If we looked through a truncate that discarded bits, we can't do this
45694 // transform.
45695 // FIXME: We could do this transform for truncates that discarded bits by
45696 // inserting an AND mask between the new MOVMSK and the CMP.
45697 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
45698 SDValue BC = peekThroughBitcasts(Vec);
45699 MVT BCVT = BC.getSimpleValueType();
45700 unsigned BCNumElts = BCVT.getVectorNumElements();
45701 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
45702 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
45703 BCNumEltBits > NumEltBits &&
45704 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
45705 SDLoc DL(EFLAGS);
45706 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
45707 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45708 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
45709 DAG.getConstant(CmpMask, DL, MVT::i32));
45713 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
45714 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
45715 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
45716 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
45717 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
45718 SmallVector<SDValue> Ops;
45719 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
45720 Ops.size() == 2) {
45721 SDLoc DL(EFLAGS);
45722 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
45723 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
45724 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
45725 DAG.getBitcast(SubVT, Ops[0]),
45726 DAG.getBitcast(SubVT, Ops[1]));
45727 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
45728 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45729 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
45730 DAG.getConstant(CmpMask, DL, MVT::i32));
45734 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
45735 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
45736 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
45737 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
45738 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
45739 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
45740 SDValue BC = peekThroughBitcasts(Vec);
45741 // Ensure MOVMSK was testing every signbit of BC.
45742 if (BC.getValueType().getVectorNumElements() <= NumElts) {
45743 if (BC.getOpcode() == X86ISD::PCMPEQ) {
45744 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
45745 BC.getOperand(0), BC.getOperand(1));
45746 V = DAG.getBitcast(TestVT, V);
45747 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45749 // Check for 256-bit split vector cases.
45750 if (BC.getOpcode() == ISD::AND &&
45751 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
45752 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
45753 SDValue LHS = BC.getOperand(0);
45754 SDValue RHS = BC.getOperand(1);
45755 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
45756 LHS.getOperand(0), LHS.getOperand(1));
45757 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
45758 RHS.getOperand(0), RHS.getOperand(1));
45759 LHS = DAG.getBitcast(TestVT, LHS);
45760 RHS = DAG.getBitcast(TestVT, RHS);
45761 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
45762 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45767 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
45768 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
45769 // sign bits prior to the comparison with zero unless we know that
45770 // the vXi16 splats the sign bit down to the lower i8 half.
45771 // TODO: Handle all_of patterns.
45772 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
45773 SDValue VecOp0 = Vec.getOperand(0);
45774 SDValue VecOp1 = Vec.getOperand(1);
45775 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
45776 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
45777 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
45778 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
45779 SDLoc DL(EFLAGS);
45780 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
45781 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45782 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
45783 if (!SignExt0) {
45784 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
45785 DAG.getConstant(0xAAAA, DL, MVT::i16));
45787 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45788 DAG.getConstant(0, DL, MVT::i16));
45790 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
45791 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
45792 if (CmpBits >= 16 && Subtarget.hasInt256() &&
45793 (IsAnyOf || (SignExt0 && SignExt1))) {
45794 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
45795 SDLoc DL(EFLAGS);
45796 SDValue Result = peekThroughBitcasts(Src);
45797 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
45798 Result.getValueType().getVectorNumElements() <= NumElts) {
45799 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
45800 Result.getOperand(0), Result.getOperand(1));
45801 V = DAG.getBitcast(MVT::v4i64, V);
45802 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45804 Result = DAG.getBitcast(MVT::v32i8, Result);
45805 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45806 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
45807 if (!SignExt0 || !SignExt1) {
45808 assert(IsAnyOf &&
45809 "Only perform v16i16 signmasks for any_of patterns");
45810 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
45811 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45813 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45814 DAG.getConstant(CmpMask, DL, MVT::i32));
45819 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
45820 // Since we peek through a bitcast, we need to be careful if the base vector
45821 // type has smaller elements than the MOVMSK type. In that case, even if
45822 // all the elements are demanded by the shuffle mask, only the "high"
45823 // elements which have highbits that align with highbits in the MOVMSK vec
45824 // elements are actually demanded. A simplification of spurious operations
45825 // on the "low" elements take place during other simplifications.
45827 // For example:
45828 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
45829 // demanded, the result can change because we are swapping elements around.
45831 // To address this, we check that we can scale the shuffle mask to MOVMSK
45832 // element width (this will ensure "high" elements match). It's slightly overly
45833 // conservative, but fine for an edge case fold.
45834 SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
45835 SmallVector<SDValue, 2> ShuffleInputs;
45836 if (NumElts <= CmpBits &&
45837 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
45838 ShuffleMask, DAG) &&
45839 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
45840 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
45841 scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
45842 unsigned NumShuffleElts = ShuffleMask.size();
45843 APInt DemandedElts = APInt::getZero(NumShuffleElts);
45844 for (int M : ShuffleMask) {
45845 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
45846 DemandedElts.setBit(M);
45848 if (DemandedElts.isAllOnes()) {
45849 SDLoc DL(EFLAGS);
45850 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
45851 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45852 Result =
45853 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
45854 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45855 EFLAGS.getOperand(1));
45859 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
45860 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
45861 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
45862 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
45863 // iff every element is referenced.
45864 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
45865 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
45866 (NumEltBits == 32 || NumEltBits == 64)) {
45867 SDLoc DL(EFLAGS);
45868 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
45869 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
45870 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
45871 SDValue LHS = Vec;
45872 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
45873 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45874 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
45875 DAG.getBitcast(FloatVT, LHS),
45876 DAG.getBitcast(FloatVT, RHS));
45879 return SDValue();
45882 /// Optimize an EFLAGS definition used according to the condition code \p CC
45883 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
45884 /// uses of chain values.
45885 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
45886 SelectionDAG &DAG,
45887 const X86Subtarget &Subtarget) {
45888 if (CC == X86::COND_B)
45889 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
45890 return Flags;
45892 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
45893 return R;
45895 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
45896 return R;
45898 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
45899 return R;
45901 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
45904 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
45905 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
45906 TargetLowering::DAGCombinerInfo &DCI,
45907 const X86Subtarget &Subtarget) {
45908 SDLoc DL(N);
45910 SDValue FalseOp = N->getOperand(0);
45911 SDValue TrueOp = N->getOperand(1);
45912 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
45913 SDValue Cond = N->getOperand(3);
45915 // cmov X, X, ?, ? --> X
45916 if (TrueOp == FalseOp)
45917 return TrueOp;
45919 // Try to simplify the EFLAGS and condition code operands.
45920 // We can't always do this as FCMOV only supports a subset of X86 conditions.
45921 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
45922 if (!(FalseOp.getValueType() == MVT::f80 ||
45923 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
45924 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
45925 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
45926 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
45927 Flags};
45928 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45932 // If this is a select between two integer constants, try to do some
45933 // optimizations. Note that the operands are ordered the opposite of SELECT
45934 // operands.
45935 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
45936 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
45937 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
45938 // larger than FalseC (the false value).
45939 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
45940 CC = X86::GetOppositeBranchCondition(CC);
45941 std::swap(TrueC, FalseC);
45942 std::swap(TrueOp, FalseOp);
45945 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
45946 // This is efficient for any integer data type (including i8/i16) and
45947 // shift amount.
45948 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
45949 Cond = getSETCC(CC, Cond, DL, DAG);
45951 // Zero extend the condition if needed.
45952 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
45954 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
45955 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
45956 DAG.getConstant(ShAmt, DL, MVT::i8));
45957 return Cond;
45960 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
45961 // for any integer data type, including i8/i16.
45962 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
45963 Cond = getSETCC(CC, Cond, DL, DAG);
45965 // Zero extend the condition if needed.
45966 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
45967 FalseC->getValueType(0), Cond);
45968 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45969 SDValue(FalseC, 0));
45970 return Cond;
45973 // Optimize cases that will turn into an LEA instruction. This requires
45974 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
45975 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
45976 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
45977 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
45978 "Implicit constant truncation");
45980 bool isFastMultiplier = false;
45981 if (Diff.ult(10)) {
45982 switch (Diff.getZExtValue()) {
45983 default: break;
45984 case 1: // result = add base, cond
45985 case 2: // result = lea base( , cond*2)
45986 case 3: // result = lea base(cond, cond*2)
45987 case 4: // result = lea base( , cond*4)
45988 case 5: // result = lea base(cond, cond*4)
45989 case 8: // result = lea base( , cond*8)
45990 case 9: // result = lea base(cond, cond*8)
45991 isFastMultiplier = true;
45992 break;
45996 if (isFastMultiplier) {
45997 Cond = getSETCC(CC, Cond, DL, DAG);
45998 // Zero extend the condition if needed.
45999 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46000 Cond);
46001 // Scale the condition by the difference.
46002 if (Diff != 1)
46003 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46004 DAG.getConstant(Diff, DL, Cond.getValueType()));
46006 // Add the base if non-zero.
46007 if (FalseC->getAPIntValue() != 0)
46008 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46009 SDValue(FalseC, 0));
46010 return Cond;
46016 // Handle these cases:
46017 // (select (x != c), e, c) -> (select (x != c), e, x),
46018 // (select (x == c), c, e) -> (select (x == c), x, e)
46019 // where c is an integer constant, and the "select" is the combination
46020 // of CMOV and CMP.
46022 // The rationale for this change is that the conditional-move from a constant
46023 // needs two instructions, whereas a conditional-move from a register needs
46024 // only one instruction.
46026 // CAVEAT: Replacing a constant with a symbolic value may obscure some
46027 // instruction-combining opportunities, so this optimization needs to be
46028 // postponed as late as possible.
46030 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46031 // the DCI.xxxx conditions are provided to postpone the optimization as
46032 // late as possible.
46034 ConstantSDNode *CmpAgainst = nullptr;
46035 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46036 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46037 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46039 if (CC == X86::COND_NE &&
46040 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46041 CC = X86::GetOppositeBranchCondition(CC);
46042 std::swap(TrueOp, FalseOp);
46045 if (CC == X86::COND_E &&
46046 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46047 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46048 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46049 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46054 // Transform:
46056 // (cmov 1 T (uge T 2))
46058 // to:
46060 // (adc T 0 (sub T 1))
46061 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46062 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46063 SDValue Cond0 = Cond.getOperand(0);
46064 if (Cond0.getOpcode() == ISD::TRUNCATE)
46065 Cond0 = Cond0.getOperand(0);
46066 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46067 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46068 EVT CondVT = Cond->getValueType(0);
46069 EVT OuterVT = N->getValueType(0);
46070 // Subtract 1 and generate a carry.
46071 SDValue NewSub =
46072 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46073 DAG.getConstant(1, DL, CondVT));
46074 SDValue EFLAGS(NewSub.getNode(), 1);
46075 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46076 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46080 // Fold and/or of setcc's to double CMOV:
46081 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46082 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46084 // This combine lets us generate:
46085 // cmovcc1 (jcc1 if we don't have CMOV)
46086 // cmovcc2 (same)
46087 // instead of:
46088 // setcc1
46089 // setcc2
46090 // and/or
46091 // cmovne (jne if we don't have CMOV)
46092 // When we can't use the CMOV instruction, it might increase branch
46093 // mispredicts.
46094 // When we can use CMOV, or when there is no mispredict, this improves
46095 // throughput and reduces register pressure.
46097 if (CC == X86::COND_NE) {
46098 SDValue Flags;
46099 X86::CondCode CC0, CC1;
46100 bool isAndSetCC;
46101 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46102 if (isAndSetCC) {
46103 std::swap(FalseOp, TrueOp);
46104 CC0 = X86::GetOppositeBranchCondition(CC0);
46105 CC1 = X86::GetOppositeBranchCondition(CC1);
46108 SDValue LOps[] = {FalseOp, TrueOp,
46109 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46110 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46111 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46112 Flags};
46113 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46114 return CMOV;
46118 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46119 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46120 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46121 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
46122 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46123 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46124 SDValue Add = TrueOp;
46125 SDValue Const = FalseOp;
46126 // Canonicalize the condition code for easier matching and output.
46127 if (CC == X86::COND_E)
46128 std::swap(Add, Const);
46130 // We might have replaced the constant in the cmov with the LHS of the
46131 // compare. If so change it to the RHS of the compare.
46132 if (Const == Cond.getOperand(0))
46133 Const = Cond.getOperand(1);
46135 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46136 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46137 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46138 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46139 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46140 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46141 EVT VT = N->getValueType(0);
46142 // This should constant fold.
46143 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46144 SDValue CMov =
46145 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46146 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46147 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46151 return SDValue();
46154 /// Different mul shrinking modes.
46155 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
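/// Return true if the operands of the vXi32 multiply \p N are known, via their
/// computed sign bits and positivity, to fit in 8 or 16 bits, and report the
/// matching ShrinkMode in \p Mode.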
46157 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
46158 EVT VT = N->getOperand(0).getValueType();
46159 if (VT.getScalarSizeInBits() != 32)
46160 return false;
46162 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
46163 unsigned SignBits[2] = {1, 1};
46164 bool IsPositive[2] = {false, false};
46165 for (unsigned i = 0; i < 2; i++) {
46166 SDValue Opd = N->getOperand(i);
46168 SignBits[i] = DAG.ComputeNumSignBits(Opd);
46169 IsPositive[i] = DAG.SignBitIsZero(Opd);
46172 bool AllPositive = IsPositive[0] && IsPositive[1];
46173 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
46174 // When ranges are from -128 ~ 127, use MULS8 mode.
46175 if (MinSignBits >= 25)
46176 Mode = ShrinkMode::MULS8;
46177 // When ranges are from 0 ~ 255, use MULU8 mode.
46178 else if (AllPositive && MinSignBits >= 24)
46179 Mode = ShrinkMode::MULU8;
46180 // When ranges are from -32768 ~ 32767, use MULS16 mode.
46181 else if (MinSignBits >= 17)
46182 Mode = ShrinkMode::MULS16;
46183 // When ranges are from 0 ~ 65535, use MULU16 mode.
46184 else if (AllPositive && MinSignBits >= 16)
46185 Mode = ShrinkMode::MULU16;
46186 else
46187 return false;
46188 return true;
46191 /// When the operands of vector mul are extended from smaller size values,
46192 /// like i8 and i16, the type of mul may be shrunk to generate more
46193 /// efficient code. Two typical patterns are handled:
46194 /// Pattern1:
46195 /// %2 = sext/zext <N x i8> %1 to <N x i32>
46196 /// %4 = sext/zext <N x i8> %3 to <N x i32>
46197 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46198 /// %5 = mul <N x i32> %2, %4
46200 /// Pattern2:
46201 /// %2 = zext/sext <N x i16> %1 to <N x i32>
46202 /// %4 = zext/sext <N x i16> %3 to <N x i32>
46203 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46204 /// %5 = mul <N x i32> %2, %4
46206 /// There are four mul shrinking modes:
46207 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
46208 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
46209 /// generate pmullw+sext32 for it (MULS8 mode).
46210 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
46211 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
46212 /// generate pmullw+zext32 for it (MULU8 mode).
46213 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
46214 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
46215 /// generate pmullw+pmulhw for it (MULS16 mode).
46216 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
46217 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
46218 /// generate pmullw+pmulhuw for it (MULU16 mode).
46219 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
46220 const X86Subtarget &Subtarget) {
46221 // Check for legality
46222 // pmullw/pmulhw are not available before SSE2.
46223 if (!Subtarget.hasSSE2())
46224 return SDValue();
46226 // Check for profitability
46227 // pmulld is supported since SSE41. It is better to use pmulld
46228 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
46229 // the expansion.
46230 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
46231 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
46232 return SDValue();
46234 ShrinkMode Mode;
46235 if (!canReduceVMulWidth(N, DAG, Mode))
46236 return SDValue();
46238 SDLoc DL(N);
46239 SDValue N0 = N->getOperand(0);
46240 SDValue N1 = N->getOperand(1);
46241 EVT VT = N->getOperand(0).getValueType();
46242 unsigned NumElts = VT.getVectorNumElements();
46243 if ((NumElts % 2) != 0)
46244 return SDValue();
46246 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46248 // Shrink the operands of mul.
46249 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
46250 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
46252 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
46253 // lower part is needed.
46254 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
46255 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
46256 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
46257 : ISD::SIGN_EXTEND,
46258 DL, VT, MulLo);
46260 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
46261 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
46262 // the higher part is also needed.
46263 SDValue MulHi =
46264 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
46265 ReducedVT, NewN0, NewN1);
46267 // Repack the lower part and higher part result of mul into a wider
46268 // result.
46269 // Generate shuffle functioning as punpcklwd.
46270 SmallVector<int, 16> ShuffleMask(NumElts);
46271 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46272 ShuffleMask[2 * i] = i;
46273 ShuffleMask[2 * i + 1] = i + NumElts;
46275 SDValue ResLo =
46276 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46277 ResLo = DAG.getBitcast(ResVT, ResLo);
46278 // Generate shuffle functioning as punpckhwd.
46279 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46280 ShuffleMask[2 * i] = i + NumElts / 2;
46281 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
46283 SDValue ResHi =
46284 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46285 ResHi = DAG.getBitcast(ResVT, ResHi);
46286 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
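/// Lower a multiply by selected constant amounts (11, 13, 19, 21, 22, 23, 26,
/// 28, 29, 37, 41, 73, and powers of 2 plus 2/4/8) into short sequences of
/// MUL_IMM by 3/5/9, shifts and adds/subs that map well onto LEA and SHL.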
46289 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
46290 EVT VT, const SDLoc &DL) {
46292 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
46293 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46294 DAG.getConstant(Mult, DL, VT));
46295 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
46296 DAG.getConstant(Shift, DL, MVT::i8));
46297 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46298 N->getOperand(0));
46299 return Result;
46302 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
46303 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46304 DAG.getConstant(Mul1, DL, VT));
46305 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
46306 DAG.getConstant(Mul2, DL, VT));
46307 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46308 N->getOperand(0));
46309 return Result;
46312 switch (MulAmt) {
46313 default:
46314 break;
46315 case 11:
46316 // mul x, 11 => add ((shl (mul x, 5), 1), x)
46317 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
46318 case 21:
46319 // mul x, 21 => add ((shl (mul x, 5), 2), x)
46320 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
46321 case 41:
46322 // mul x, 41 => add ((shl (mul x, 5), 3), x)
46323 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
46324 case 22:
46325 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
46326 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
46327 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
46328 case 19:
46329 // mul x, 19 => add ((shl (mul x, 9), 1), x)
46330 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
46331 case 37:
46332 // mul x, 37 => add ((shl (mul x, 9), 2), x)
46333 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
46334 case 73:
46335 // mul x, 73 => add ((shl (mul x, 9), 3), x)
46336 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
46337 case 13:
46338 // mul x, 13 => add ((shl (mul x, 3), 2), x)
46339 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
46340 case 23:
46341 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
46342 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
46343 case 26:
46344 // mul x, 26 => add ((mul (mul x, 5), 5), x)
46345 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
46346 case 28:
46347 // mul x, 28 => add ((mul (mul x, 9), 3), x)
46348 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
46349 case 29:
46350 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
46351 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
46352 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
46355 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
46356 // followed by a single LEA.
46357 // First check if this is a sum of two powers of 2 because that's easy. Then
46358 // count the trailing zeros up to the first set bit.
46359 // TODO: We can do this even without LEA at a cost of two shifts and an add.
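// For example, MulAmt = 20 = 16 + 4 becomes (X << 4) + (X << 2), and the
// second shift can be folded into an LEA scale.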
46360 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
46361 unsigned ScaleShift = llvm::countr_zero(MulAmt);
46362 if (ScaleShift >= 1 && ScaleShift < 4) {
46363 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
46364 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46365 DAG.getConstant(ShiftAmt, DL, MVT::i8));
46366 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46367 DAG.getConstant(ScaleShift, DL, MVT::i8));
46368 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
46372 return SDValue();
46375 // If both operands have at most 16 significant bits and the upper 17 bits of
46376 // either operand are zero (or can safely be made zero), then we can use
46377 // PMADDWD, which is always at least as quick as PMULLD, except on KNL.
46378 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
46379 const X86Subtarget &Subtarget) {
46380 if (!Subtarget.hasSSE2())
46381 return SDValue();
46383 if (Subtarget.isPMADDWDSlow())
46384 return SDValue();
46386 EVT VT = N->getValueType(0);
46388 // Only support vXi32 vectors.
46389 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
46390 return SDValue();
46392 // Make sure the type is legal or can split/widen to a legal type.
46393 // With AVX512 but without BWI, we would need to split v32i16.
46394 unsigned NumElts = VT.getVectorNumElements();
46395 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46396 return SDValue();
46398 // With AVX512 but without BWI, we would need to split v32i16.
46399 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
46400 return SDValue();
46402 SDValue N0 = N->getOperand(0);
46403 SDValue N1 = N->getOperand(1);
46405 // If we are zero/sign extending two steps without SSE4.1, it's better to
46406 // reduce the vmul width instead.
46407 if (!Subtarget.hasSSE41() &&
46408 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
46409 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
46410 (N1.getOpcode() == ISD::ZERO_EXTEND &&
46411 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
46412 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
46413 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
46414 (N1.getOpcode() == ISD::SIGN_EXTEND &&
46415 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
46416 return SDValue();
46418 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
46419 // the vmul width instead.
46420 if (!Subtarget.hasSSE41() &&
46421 (N0.getOpcode() == ISD::SIGN_EXTEND &&
46422 N0.getOperand(0).getValueSizeInBits() > 128) &&
46423 (N1.getOpcode() == ISD::SIGN_EXTEND &&
46424 N1.getOperand(0).getValueSizeInBits() > 128))
46425 return SDValue();
46427 // Sign bits must extend down to the lowest i16.
46428 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
46429 DAG.ComputeMaxSignificantBits(N0) > 16)
46430 return SDValue();
46432 // At least one of the elements must be zero in the upper 17 bits, or can be
46433 // safely made zero without altering the final result.
46434 auto GetZeroableOp = [&](SDValue Op) {
46435 APInt Mask17 = APInt::getHighBitsSet(32, 17);
46436 if (DAG.MaskedValueIsZero(Op, Mask17))
46437 return Op;
46438 // Mask off upper 16-bits of sign-extended constants.
46439 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
46440 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
46441 DAG.getConstant(0xFFFF, SDLoc(N), VT));
46442 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
46443 SDValue Src = Op.getOperand(0);
46444 // Convert sext(vXi16) to zext(vXi16).
46445 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
46446 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
46447 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
46448 // which will expand the extension.
46449 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
46450 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
46451 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
46452 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
46455 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
46456 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
46457 N->isOnlyUserOf(Op.getNode())) {
46458 SDValue Src = Op.getOperand(0);
46459 if (Src.getScalarValueSizeInBits() == 16)
46460 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
46462 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
46463 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
46464 N->isOnlyUserOf(Op.getNode())) {
46465 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
46466 Op.getOperand(1));
46468 return SDValue();
46470 SDValue ZeroN0 = GetZeroableOp(N0);
46471 SDValue ZeroN1 = GetZeroableOp(N1);
46472 if (!ZeroN0 && !ZeroN1)
46473 return SDValue();
46474 N0 = ZeroN0 ? ZeroN0 : N0;
46475 N1 = ZeroN1 ? ZeroN1 : N1;
46477 // Use SplitOpsAndApply to handle AVX splitting.
46478 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46479 ArrayRef<SDValue> Ops) {
46480 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46481 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
46482 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
46483 DAG.getBitcast(OpVT, Ops[0]),
46484 DAG.getBitcast(OpVT, Ops[1]));
46486 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
46487 PMADDWDBuilder);
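/// Attempt to replace a vXi64 multiply with PMULDQ (when the upper 32 bits of
/// both operands are known sign bits, on SSE4.1) or PMULUDQ (when the upper
/// 32 bits of both operands are known zero).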
46490 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
46491 const X86Subtarget &Subtarget) {
46492 if (!Subtarget.hasSSE2())
46493 return SDValue();
46495 EVT VT = N->getValueType(0);
46497 // Only support vXi64 vectors.
46498 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
46499 VT.getVectorNumElements() < 2 ||
46500 !isPowerOf2_32(VT.getVectorNumElements()))
46501 return SDValue();
46503 SDValue N0 = N->getOperand(0);
46504 SDValue N1 = N->getOperand(1);
46506 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
46507 // 32 bits. We can lower with this if the sign bits stretch that far.
46508 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
46509 DAG.ComputeNumSignBits(N1) > 32) {
46510 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46511 ArrayRef<SDValue> Ops) {
46512 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
46514 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
46515 PMULDQBuilder, /*CheckBWI*/false);
46518 // If the upper bits are zero we can use a single pmuludq.
46519 APInt Mask = APInt::getHighBitsSet(64, 32);
46520 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
46521 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46522 ArrayRef<SDValue> Ops) {
46523 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
46525 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
46526 PMULUDQBuilder, /*CheckBWI*/false);
46529 return SDValue();
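/// Combine ISD::MUL: first try the PMADDWD/PMULDQ/width-reduction folds for
/// vectors, then (guarded by MulConstantOptimization) decompose multiplies by
/// constants into cheaper shift/LEA-style sequences.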
46532 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
46533 TargetLowering::DAGCombinerInfo &DCI,
46534 const X86Subtarget &Subtarget) {
46535 EVT VT = N->getValueType(0);
46537 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
46538 return V;
46540 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
46541 return V;
46543 if (DCI.isBeforeLegalize() && VT.isVector())
46544 return reduceVMULWidth(N, DAG, Subtarget);
46546 // Optimize a single multiply with constant into two operations in order to
46547 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
46548 if (!MulConstantOptimization)
46549 return SDValue();
46551 // An imul is usually smaller than the alternative sequence.
46552 if (DAG.getMachineFunction().getFunction().hasMinSize())
46553 return SDValue();
46555 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
46556 return SDValue();
46558 if (VT != MVT::i64 && VT != MVT::i32 &&
46559 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
46560 return SDValue();
46562 ConstantSDNode *CNode = isConstOrConstSplat(
46563 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
46564 const APInt *C = nullptr;
46565 if (!CNode) {
46566 if (VT.isVector())
46567 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
46568 if (auto *SplatC = RawC->getSplatValue())
46569 C = &(SplatC->getUniqueInteger());
46571 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
46572 return SDValue();
46573 } else {
46574 C = &(CNode->getAPIntValue());
46577 if (isPowerOf2_64(C->getZExtValue()))
46578 return SDValue();
46580 int64_t SignMulAmt = C->getSExtValue();
46581 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
46582 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
46584 SDLoc DL(N);
46585 SDValue NewMul = SDValue();
46586 if (VT == MVT::i64 || VT == MVT::i32) {
46587 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
46588 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46589 DAG.getConstant(AbsMulAmt, DL, VT));
46590 if (SignMulAmt < 0)
46591 NewMul =
46592 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46594 return NewMul;
46597 uint64_t MulAmt1 = 0;
46598 uint64_t MulAmt2 = 0;
46599 if ((AbsMulAmt % 9) == 0) {
46600 MulAmt1 = 9;
46601 MulAmt2 = AbsMulAmt / 9;
46602 } else if ((AbsMulAmt % 5) == 0) {
46603 MulAmt1 = 5;
46604 MulAmt2 = AbsMulAmt / 5;
46605 } else if ((AbsMulAmt % 3) == 0) {
46606 MulAmt1 = 3;
46607 MulAmt2 = AbsMulAmt / 3;
46610 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
46611 if (MulAmt2 &&
46612 (isPowerOf2_64(MulAmt2) ||
46613 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
46615 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
46616 N->use_begin()->getOpcode() == ISD::ADD))
46617 // If the second multiplier is a power of 2, issue it first. We want the multiply
46618 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
46619 // use is an add. Only do this for positive multiply amounts since the
46620 // negate would prevent it from being used as an address mode anyway.
46621 std::swap(MulAmt1, MulAmt2);
46623 if (isPowerOf2_64(MulAmt1))
46624 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46625 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
46626 else
46627 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46628 DAG.getConstant(MulAmt1, DL, VT));
46630 if (isPowerOf2_64(MulAmt2))
46631 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
46632 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
46633 else
46634 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
46635 DAG.getConstant(MulAmt2, DL, VT));
46637 // Negate the result.
46638 if (SignMulAmt < 0)
46639 NewMul =
46640 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46641 } else if (!Subtarget.slowLEA())
46642 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
46644 if (!NewMul) {
46645 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
46646 assert(C->getZExtValue() != 0 &&
46647 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
46648 "Both cases that could cause potential overflows should have "
46649 "already been handled.");
46650 if (isPowerOf2_64(AbsMulAmt - 1)) {
46651 // (mul x, 2^N + 1) => (add (shl x, N), x)
46652 NewMul = DAG.getNode(
46653 ISD::ADD, DL, VT, N->getOperand(0),
46654 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46655 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
46656 // To negate, subtract the number from zero
46657 if (SignMulAmt < 0)
46658 NewMul =
46659 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46660 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
46661 // (mul x, 2^N - 1) => (sub (shl x, N), x)
46662 NewMul =
46663 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46664 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
46665 // To negate, reverse the operands of the subtract.
46666 if (SignMulAmt < 0)
46667 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
46668 else
46669 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
46670 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
46671 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
46672 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
46673 NewMul =
46674 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46675 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
46676 NewMul = DAG.getNode(
46677 ISD::ADD, DL, VT, NewMul,
46678 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
46679 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
46680 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
46681 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
46682 NewMul =
46683 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46684 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
46685 NewMul = DAG.getNode(
46686 ISD::SUB, DL, VT, NewMul,
46687 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
46688 } else if (SignMulAmt >= 0 && VT.isVector() &&
46689 Subtarget.fastImmVectorShift()) {
46690 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
46691 uint64_t ShiftAmt1;
46692 std::optional<unsigned> Opc;
46693 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
46694 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
46695 Opc = ISD::ADD;
46696 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
46697 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
46698 Opc = ISD::SUB;
46701 if (Opc) {
46702 SDValue Shift1 =
46703 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46704 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
46705 SDValue Shift2 =
46706 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46707 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
46708 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
46713 return NewMul;
46716 // Try to form a MULHU or MULHS node by looking for
46717 // (srl (mul ext, ext), 16)
46718 // TODO: This is X86 specific because we want to be able to handle wide types
46719 // before type legalization. But we can only do it if the vector will be
46720 // legalized via widening/splitting. Type legalization can't handle promotion
46721 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46722 // combiner.
46723 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
46724 const X86Subtarget &Subtarget) {
46725 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
46726 "SRL or SRA node is required here!");
46727 SDLoc DL(N);
46729 if (!Subtarget.hasSSE2())
46730 return SDValue();
46732 // The operation feeding into the shift must be a multiply.
46733 SDValue ShiftOperand = N->getOperand(0);
46734 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
46735 return SDValue();
46737 // Input type should be at least vXi32.
46738 EVT VT = N->getValueType(0);
46739 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
46740 return SDValue();
46742 // Need a shift by 16.
46743 APInt ShiftAmt;
46744 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
46745 ShiftAmt != 16)
46746 return SDValue();
46748 SDValue LHS = ShiftOperand.getOperand(0);
46749 SDValue RHS = ShiftOperand.getOperand(1);
46751 unsigned ExtOpc = LHS.getOpcode();
46752 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46753 RHS.getOpcode() != ExtOpc)
46754 return SDValue();
46756 // Peek through the extends.
46757 LHS = LHS.getOperand(0);
46758 RHS = RHS.getOperand(0);
46760 // Ensure the input types match.
46761 EVT MulVT = LHS.getValueType();
46762 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
46763 return SDValue();
46765 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46766 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
46768 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
46769 return DAG.getNode(ExtOpc, DL, VT, Mulh);
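/// Combine ISD::SHL: fold (shl (and (setcc_c), c1), c2) into
/// (and setcc_c, (c1 << c2)) when the shifted mask provably preserves the
/// semantics of the underlying SETCC_CARRY pattern.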
46772 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
46773 SDValue N0 = N->getOperand(0);
46774 SDValue N1 = N->getOperand(1);
46775 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
46776 EVT VT = N0.getValueType();
46778 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
46779 // since the result of setcc_c is all zeros or all ones.
46780 if (VT.isInteger() && !VT.isVector() &&
46781 N1C && N0.getOpcode() == ISD::AND &&
46782 N0.getOperand(1).getOpcode() == ISD::Constant) {
46783 SDValue N00 = N0.getOperand(0);
46784 APInt Mask = N0.getConstantOperandAPInt(1);
46785 Mask <<= N1C->getAPIntValue();
46786 bool MaskOK = false;
46787 // We can handle cases concerning bit-widening nodes containing setcc_c if
46788 // we carefully interrogate the mask to make sure the transform is
46789 // semantics-preserving.
46790 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
46791 // of the underlying setcc_c operation if the setcc_c was zero extended.
46792 // Consider the following example:
46793 // zext(setcc_c) -> i32 0x0000FFFF
46794 // c1 -> i32 0x0000FFFF
46795 // c2 -> i32 0x00000001
46796 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
46797 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
46798 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
46799 MaskOK = true;
46800 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
46801 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46802 MaskOK = true;
46803 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
46804 N00.getOpcode() == ISD::ANY_EXTEND) &&
46805 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46806 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
46808 if (MaskOK && Mask != 0) {
46809 SDLoc DL(N);
46810 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
46814 return SDValue();
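/// Combine ISD::SRA: try to form MULHS/MULHU via combineShiftToPMULH, then
/// rewrite (sra (shl X, C1), C2) as a SIGN_EXTEND_INREG of a narrower type
/// followed by an adjusted shift when C1 is one of [56,48,32,24,16].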
46817 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
46818 const X86Subtarget &Subtarget) {
46819 SDValue N0 = N->getOperand(0);
46820 SDValue N1 = N->getOperand(1);
46821 EVT VT = N0.getValueType();
46822 unsigned Size = VT.getSizeInBits();
46824 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46825 return V;
46827 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
46828 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
46829 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
46830 // depending on sign of (SarConst - [56,48,32,24,16])
46832 // sexts on X86 are MOVs. The MOVs have the same code size
46833 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
46834 // However, the MOVs have two advantages over a SHIFT:
46835 // 1. MOVs can write to a register that differs from the source.
46836 // 2. MOVs accept memory operands.
46838 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
46839 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
46840 N0.getOperand(1).getOpcode() != ISD::Constant)
46841 return SDValue();
46843 SDValue N00 = N0.getOperand(0);
46844 SDValue N01 = N0.getOperand(1);
46845 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
46846 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
46847 EVT CVT = N1.getValueType();
46849 if (SarConst.isNegative())
46850 return SDValue();
46852 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
46853 unsigned ShiftSize = SVT.getSizeInBits();
46854 // Skip types without a corresponding sext/zext and ShlConst values
46855 // that are not one of [56,48,32,24,16].
46856 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
46857 continue;
46858 SDLoc DL(N);
46859 SDValue NN =
46860 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
46861 SarConst = SarConst - (Size - ShiftSize);
46862 if (SarConst == 0)
46863 return NN;
46864 if (SarConst.isNegative())
46865 return DAG.getNode(ISD::SHL, DL, VT, NN,
46866 DAG.getConstant(-SarConst, DL, CVT));
46867 return DAG.getNode(ISD::SRA, DL, VT, NN,
46868 DAG.getConstant(SarConst, DL, CVT));
46870 return SDValue();
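/// Combine ISD::SRL: try to form MULHS/MULHU via combineShiftToPMULH, then,
/// on the last DAG combine only, commute srl (and X, C1), C2 into
/// and (srl X, C2), (C1 >> C2) when that shrinks the mask constant.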
46873 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
46874 TargetLowering::DAGCombinerInfo &DCI,
46875 const X86Subtarget &Subtarget) {
46876 SDValue N0 = N->getOperand(0);
46877 SDValue N1 = N->getOperand(1);
46878 EVT VT = N0.getValueType();
46880 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46881 return V;
46883 // Only do this on the last DAG combine as it can interfere with other
46884 // combines.
46885 if (!DCI.isAfterLegalizeDAG())
46886 return SDValue();
46888 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
46889 // TODO: This is a generic DAG combine that became an x86-only combine to
46890 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
46891 // and-not ('andn').
46892 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
46893 return SDValue();
46895 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
46896 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
46897 if (!ShiftC || !AndC)
46898 return SDValue();
46900 // If we can shrink the constant mask below 8 bits or 32 bits, then this
46901 // transform should reduce code size. It may also enable secondary transforms
46902 // from improved known-bits analysis or instruction selection.
46903 APInt MaskVal = AndC->getAPIntValue();
46905 // If this can be matched by a zero extend, don't optimize.
46906 if (MaskVal.isMask()) {
46907 unsigned TO = MaskVal.countr_one();
46908 if (TO >= 8 && isPowerOf2_32(TO))
46909 return SDValue();
46912 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
46913 unsigned OldMaskSize = MaskVal.getSignificantBits();
46914 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
46915 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
46916 (OldMaskSize > 32 && NewMaskSize <= 32)) {
46917 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
46918 SDLoc DL(N);
46919 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
46920 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
46921 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
46923 return SDValue();
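/// Attempt to push shuffles through horizontal add/sub and pack nodes,
/// folding HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) style patterns into
/// SHUFFLE(HOP(...)) to reduce lane-crossing shuffles.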
46926 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
46927 const X86Subtarget &Subtarget) {
46928 unsigned Opcode = N->getOpcode();
46929 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
46931 SDLoc DL(N);
46932 EVT VT = N->getValueType(0);
46933 SDValue N0 = N->getOperand(0);
46934 SDValue N1 = N->getOperand(1);
46935 EVT SrcVT = N0.getValueType();
46937 SDValue BC0 =
46938 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
46939 SDValue BC1 =
46940 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
46942 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
46943 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
46944 // truncation trees that help us avoid lane crossing shuffles.
46945 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
46946 // TODO: We don't handle vXf64 shuffles yet.
46947 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46948 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
46949 SmallVector<SDValue> ShuffleOps;
46950 SmallVector<int> ShuffleMask, ScaledMask;
46951 SDValue Vec = peekThroughBitcasts(BCSrc);
46952 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
46953 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
46954 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
46955 // shuffle to a v4X64 width - we can probably relax this in the future.
46956 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
46957 ShuffleOps[0].getValueType().is256BitVector() &&
46958 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
46959 SDValue Lo, Hi;
46960 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46961 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
46962 Lo = DAG.getBitcast(SrcVT, Lo);
46963 Hi = DAG.getBitcast(SrcVT, Hi);
46964 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
46965 Res = DAG.getBitcast(ShufVT, Res);
46966 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
46967 return DAG.getBitcast(VT, Res);
46973 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
46974 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46975 // If either/both ops are a shuffle that can scale to v2x64,
46976 // then see if we can perform this as a v4x32 post shuffle.
46977 SmallVector<SDValue> Ops0, Ops1;
46978 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
46979 bool IsShuf0 =
46980 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46981 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46982 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46983 bool IsShuf1 =
46984 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46985 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
46986 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46987 if (IsShuf0 || IsShuf1) {
46988 if (!IsShuf0) {
46989 Ops0.assign({BC0});
46990 ScaledMask0.assign({0, 1});
46992 if (!IsShuf1) {
46993 Ops1.assign({BC1});
46994 ScaledMask1.assign({0, 1});
46997 SDValue LHS, RHS;
46998 int PostShuffle[4] = {-1, -1, -1, -1};
46999 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47000 if (M < 0)
47001 return true;
47002 Idx = M % 2;
47003 SDValue Src = Ops[M / 2];
47004 if (!LHS || LHS == Src) {
47005 LHS = Src;
47006 return true;
47008 if (!RHS || RHS == Src) {
47009 Idx += 2;
47010 RHS = Src;
47011 return true;
47013 return false;
47015 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47016 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47017 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47018 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47019 LHS = DAG.getBitcast(SrcVT, LHS);
47020 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47021 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47022 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47023 Res = DAG.getBitcast(ShufVT, Res);
47024 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47025 return DAG.getBitcast(VT, Res);
47030 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47031 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47032 SmallVector<int> Mask0, Mask1;
47033 SmallVector<SDValue> Ops0, Ops1;
47034 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47035 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47036 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47037 !Ops0.empty() && !Ops1.empty() &&
47038 all_of(Ops0,
47039 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47040 all_of(Ops1,
47041 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47042 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47043 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47044 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47045 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47046 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47047 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47048 if ((Op00 == Op11) && (Op01 == Op10)) {
47049 std::swap(Op10, Op11);
47050 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47052 if ((Op00 == Op10) && (Op01 == Op11)) {
47053 const int Map[4] = {0, 2, 1, 3};
47054 SmallVector<int, 4> ShuffleMask(
47055 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47056 Map[ScaledMask1[1]]});
47057 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47058 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47059 DAG.getBitcast(SrcVT, Op01));
47060 Res = DAG.getBitcast(ShufVT, Res);
47061 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47062 return DAG.getBitcast(VT, Res);
47067 return SDValue();
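/// Combine X86ISD::PACKSS/PACKUS: constant fold the pack, reuse the
/// shuffle-through-HOP fold, fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)),
/// merge with truncates on AVX512, and fold PACK(EXTEND,EXTEND) to CONCAT.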
47070 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
47071 TargetLowering::DAGCombinerInfo &DCI,
47072 const X86Subtarget &Subtarget) {
47073 unsigned Opcode = N->getOpcode();
47074 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47075 "Unexpected pack opcode");
47077 EVT VT = N->getValueType(0);
47078 SDValue N0 = N->getOperand(0);
47079 SDValue N1 = N->getOperand(1);
47080 unsigned NumDstElts = VT.getVectorNumElements();
47081 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47082 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47083 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47084 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47085 "Unexpected PACKSS/PACKUS input type");
47087 bool IsSigned = (X86ISD::PACKSS == Opcode);
47089 // Constant Folding.
47090 APInt UndefElts0, UndefElts1;
47091 SmallVector<APInt, 32> EltBits0, EltBits1;
47092 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47093 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47094 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
47095 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
47096 unsigned NumLanes = VT.getSizeInBits() / 128;
47097 unsigned NumSrcElts = NumDstElts / 2;
47098 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47099 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47101 APInt Undefs(NumDstElts, 0);
47102 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47103 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47104 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47105 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47106 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47107 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47109 if (UndefElts[SrcIdx]) {
47110 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47111 continue;
47114 APInt &Val = EltBits[SrcIdx];
47115 if (IsSigned) {
47116 // PACKSS: Truncate signed value with signed saturation.
47117 // Source values less than dst minint are saturated to minint.
47118 // Source values greater than dst maxint are saturated to maxint.
47119 if (Val.isSignedIntN(DstBitsPerElt))
47120 Val = Val.trunc(DstBitsPerElt);
47121 else if (Val.isNegative())
47122 Val = APInt::getSignedMinValue(DstBitsPerElt);
47123 else
47124 Val = APInt::getSignedMaxValue(DstBitsPerElt);
47125 } else {
47126 // PACKUS: Truncate signed value with unsigned saturation.
47127 // Source values less than zero are saturated to zero.
47128 // Source values greater than dst maxuint are saturated to maxuint.
47129 if (Val.isIntN(DstBitsPerElt))
47130 Val = Val.trunc(DstBitsPerElt);
47131 else if (Val.isNegative())
47132 Val = APInt::getZero(DstBitsPerElt);
47133 else
47134 Val = APInt::getAllOnes(DstBitsPerElt);
47136 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47140 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47143 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
47144 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47145 return V;
47147 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
47148 // Currently limit this to allsignbits cases only.
47149 if (IsSigned &&
47150 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
47151 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
47152 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
47153 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
47154 if (Not0 && Not1) {
47155 SDLoc DL(N);
47156 MVT SrcVT = N0.getSimpleValueType();
47157 SDValue Pack =
47158 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
47159 DAG.getBitcast(SrcVT, Not1));
47160 return DAG.getNOT(DL, Pack, VT);
47164 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
47165 // truncate to create a larger truncate.
47166 if (Subtarget.hasAVX512() &&
47167 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
47168 N0.getOperand(0).getValueType() == MVT::v8i32) {
47169 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
47170 (!IsSigned &&
47171 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
47172 if (Subtarget.hasVLX())
47173 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
47175 // Widen input to v16i32 so we can truncate that.
47176 SDLoc dl(N);
47177 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
47178 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
47179 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
47183 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
47184 if (VT.is128BitVector()) {
47185 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47186 SDValue Src0, Src1;
47187 if (N0.getOpcode() == ExtOpc &&
47188 N0.getOperand(0).getValueType().is64BitVector() &&
47189 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47190 Src0 = N0.getOperand(0);
47192 if (N1.getOpcode() == ExtOpc &&
47193 N1.getOperand(0).getValueType().is64BitVector() &&
47194 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47195 Src1 = N1.getOperand(0);
47197 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
47198 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
47199 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
47200 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
47201 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
47204 // Try again with pack(*_extend_vector_inreg, undef).
47205 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
47206 : ISD::ZERO_EXTEND_VECTOR_INREG;
47207 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
47208 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
47209 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
47210 DAG);
47213 // Attempt to combine as shuffle.
47214 SDValue Op(N, 0);
47215 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47216 return Res;
47218 return SDValue();
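/// Combine X86ISD::HADD/HSUB/FHADD/FHSUB: when horizontal ops are not
/// preferred on this target, merge HOP(HOP'(X,X),HOP'(Y,Y)) into a single
/// HOP' plus PSHUFD permutes, then try the shuffle-through-HOP fold.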
47221 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
47222 TargetLowering::DAGCombinerInfo &DCI,
47223 const X86Subtarget &Subtarget) {
47224 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
47225 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
47226 "Unexpected horizontal add/sub opcode");
47228 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
47229 MVT VT = N->getSimpleValueType(0);
47230 SDValue LHS = N->getOperand(0);
47231 SDValue RHS = N->getOperand(1);
47233 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
47234 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
47235 LHS.getOpcode() == RHS.getOpcode() &&
47236 LHS.getValueType() == RHS.getValueType() &&
47237 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
47238 SDValue LHS0 = LHS.getOperand(0);
47239 SDValue LHS1 = LHS.getOperand(1);
47240 SDValue RHS0 = RHS.getOperand(0);
47241 SDValue RHS1 = RHS.getOperand(1);
47242 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
47243 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
47244 SDLoc DL(N);
47245 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
47246 LHS0.isUndef() ? LHS1 : LHS0,
47247 RHS0.isUndef() ? RHS1 : RHS0);
47248 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
47249 Res = DAG.getBitcast(ShufVT, Res);
47250 SDValue NewLHS =
47251 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47252 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
47253 SDValue NewRHS =
47254 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47255 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
47256 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
47257 DAG.getBitcast(VT, NewRHS));
47262 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
47263 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47264 return V;
47266 return SDValue();
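/// Combine X86ISD::VSHL/VSRL/VSRA (shift amount taken from a vector operand):
/// fold shifts of zero, lower constant shift amounts to the immediate shift
/// opcodes, and simplify demanded vector elements.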
47269 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
47270 TargetLowering::DAGCombinerInfo &DCI,
47271 const X86Subtarget &Subtarget) {
47272 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
47273 X86ISD::VSRL == N->getOpcode()) &&
47274 "Unexpected shift opcode");
47275 EVT VT = N->getValueType(0);
47276 SDValue N0 = N->getOperand(0);
47277 SDValue N1 = N->getOperand(1);
47279 // Shift zero -> zero.
47280 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47281 return DAG.getConstant(0, SDLoc(N), VT);
47283 // Detect constant shift amounts.
47284 APInt UndefElts;
47285 SmallVector<APInt, 32> EltBits;
47286 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
47287 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
47288 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
47289 EltBits[0].getZExtValue(), DAG);
47292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47293 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
47294 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
47295 return SDValue(N, 0);
47297 return SDValue();
47300 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
47301 TargetLowering::DAGCombinerInfo &DCI,
47302 const X86Subtarget &Subtarget) {
47303 unsigned Opcode = N->getOpcode();
47304 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
47305 X86ISD::VSRLI == Opcode) &&
47306 "Unexpected shift opcode");
47307 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
47308 EVT VT = N->getValueType(0);
47309 SDValue N0 = N->getOperand(0);
47310 SDValue N1 = N->getOperand(1);
47311 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
47312 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
47313 "Unexpected value type");
47314 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
47316 // (shift undef, X) -> 0
47317 if (N0.isUndef())
47318 return DAG.getConstant(0, SDLoc(N), VT);
47320 // Out of range logical bit shifts are guaranteed to be zero.
47321 // Out of range arithmetic bit shifts splat the sign bit.
47322 unsigned ShiftVal = N->getConstantOperandVal(1);
47323 if (ShiftVal >= NumBitsPerElt) {
47324 if (LogicalShift)
47325 return DAG.getConstant(0, SDLoc(N), VT);
47326 ShiftVal = NumBitsPerElt - 1;
47329 // (shift X, 0) -> X
47330 if (!ShiftVal)
47331 return N0;
47333 // (shift 0, C) -> 0
47334 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47335 // N0 is all zeros or undef. We guarantee that the bits shifted into the
47336 // result are all zeros, not undef.
47337 return DAG.getConstant(0, SDLoc(N), VT);
47339 // (VSRAI -1, C) -> -1
47340 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
47341 // N0 is all ones or undef. We guarantee that the bits shifted into the
47342 // result are all ones, not undef.
47343 return DAG.getConstant(-1, SDLoc(N), VT);
47345 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
47346 unsigned NewShiftVal = Amt0 + Amt1;
47347 if (NewShiftVal >= NumBitsPerElt) {
47348 // Out of range logical bit shifts are guaranteed to be zero.
47349 // Out of range arithmetic bit shifts splat the sign bit.
47350 if (LogicalShift)
47351 return DAG.getConstant(0, SDLoc(N), VT);
47352 NewShiftVal = NumBitsPerElt - 1;
47354 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
47355 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
47358 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
47359 if (Opcode == N0.getOpcode())
47360 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
47362 // (shl (add X, X), C) -> (shl X, (C + 1))
47363 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
47364 N0.getOperand(0) == N0.getOperand(1))
47365 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
47367 // We can decode 'whole byte' logical bit shifts as shuffles.
47368 if (LogicalShift && (ShiftVal % 8) == 0) {
47369 SDValue Op(N, 0);
47370 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47371 return Res;
47374 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
47375 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
47376 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
47377 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
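// i.e. splat bit 0 of each 64-bit element using only 32-bit ops: duplicate the
// low dword of each element, then shl/sra that dword copy by 31.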
47378 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
47379 N0.getOpcode() == X86ISD::PSHUFD &&
47380 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
47381 N0->hasOneUse()) {
47382 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
47383 if (BC.getOpcode() == X86ISD::VSHLI &&
47384 BC.getScalarValueSizeInBits() == 64 &&
47385 BC.getConstantOperandVal(1) == 63) {
47386 SDLoc DL(N);
47387 SDValue Src = BC.getOperand(0);
47388 Src = DAG.getBitcast(VT, Src);
47389 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
47390 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
47391 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
47392 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
47393 return Src;
47397 auto TryConstantFold = [&](SDValue V) {
47398 APInt UndefElts;
47399 SmallVector<APInt, 32> EltBits;
47400 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
47401 return SDValue();
47402 assert(EltBits.size() == VT.getVectorNumElements() &&
47403 "Unexpected shift value type");
47404 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
47405 // created an undef input due to no input bits being demanded, but the user
47406 // still expects 0 in the other bits.
47407 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
47408 APInt &Elt = EltBits[i];
47409 if (UndefElts[i])
47410 Elt = 0;
47411 else if (X86ISD::VSHLI == Opcode)
47412 Elt <<= ShiftVal;
47413 else if (X86ISD::VSRAI == Opcode)
47414 Elt.ashrInPlace(ShiftVal);
47415 else
47416 Elt.lshrInPlace(ShiftVal);
47418 // Reset undef elements since they were zeroed above.
47419 UndefElts = 0;
47420 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
47423 // Constant Folding.
47424 if (N->isOnlyUserOf(N0.getNode())) {
47425 if (SDValue C = TryConstantFold(N0))
47426 return C;
47428 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
47429 // Don't break NOT patterns.
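// (a vector NOT is xor(X, all-ones); shifting the all-ones constant here would
// stop the xor from being recognised as a NOT later on).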
47430 SDValue BC = peekThroughOneUseBitcasts(N0);
47431 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
47432 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
47433 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
47434 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
47435 SDLoc DL(N);
47436 SDValue LHS = DAG.getNode(Opcode, DL, VT,
47437 DAG.getBitcast(VT, BC.getOperand(0)), N1);
47438 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
47443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47444 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
47445 DCI))
47446 return SDValue(N, 0);
47448 return SDValue();
47451 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
47452 TargetLowering::DAGCombinerInfo &DCI,
47453 const X86Subtarget &Subtarget) {
47454 EVT VT = N->getValueType(0);
47455 unsigned Opcode = N->getOpcode();
47456 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
47457 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
47458 Opcode == ISD::INSERT_VECTOR_ELT) &&
47459 "Unexpected vector insertion");
47461 SDValue Vec = N->getOperand(0);
47462 SDValue Scl = N->getOperand(1);
47463 SDValue Idx = N->getOperand(2);
47465 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
47466 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
47467 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
47469 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
47470 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
47471 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47472 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
47473 APInt::getAllOnes(NumBitsPerElt), DCI))
47474 return SDValue(N, 0);
47477 // Attempt to combine insertion patterns to a shuffle.
47478 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
47479 SDValue Op(N, 0);
47480 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47481 return Res;
47484 return SDValue();
47487 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
47488 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
47489 /// OR -> CMPNEQSS.
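/// e.g. a scalar 'oeq' compare is otherwise lowered to UCOMIS* followed by
/// SETE, SETNP and an AND; CMPEQSS computes the same ordered-equal predicate
/// in one instruction.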
47490 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
47491 TargetLowering::DAGCombinerInfo &DCI,
47492 const X86Subtarget &Subtarget) {
47493 unsigned opcode;
47495 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
47496 // we're requiring SSE2 for both.
47497 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
47498 SDValue N0 = N->getOperand(0);
47499 SDValue N1 = N->getOperand(1);
47500 SDValue CMP0 = N0.getOperand(1);
47501 SDValue CMP1 = N1.getOperand(1);
47502 SDLoc DL(N);
47504 // The SETCCs should both refer to the same CMP.
47505 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
47506 return SDValue();
47508 SDValue CMP00 = CMP0->getOperand(0);
47509 SDValue CMP01 = CMP0->getOperand(1);
47510 EVT VT = CMP00.getValueType();
47512 if (VT == MVT::f32 || VT == MVT::f64 ||
47513 (VT == MVT::f16 && Subtarget.hasFP16())) {
47514 bool ExpectingFlags = false;
47515 // Check for any users that want flags:
47516 for (const SDNode *U : N->uses()) {
47517 if (ExpectingFlags)
47518 break;
47520 switch (U->getOpcode()) {
47521 default:
47522 case ISD::BR_CC:
47523 case ISD::BRCOND:
47524 case ISD::SELECT:
47525 ExpectingFlags = true;
47526 break;
47527 case ISD::CopyToReg:
47528 case ISD::SIGN_EXTEND:
47529 case ISD::ZERO_EXTEND:
47530 case ISD::ANY_EXTEND:
47531 break;
47535 if (!ExpectingFlags) {
47536 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
47537 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
47539 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
47540 X86::CondCode tmp = cc0;
47541 cc0 = cc1;
47542 cc1 = tmp;
47545 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
47546 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
47547 // FIXME: need symbolic constants for these magic numbers.
47548 // See X86ATTInstPrinter.cpp:printSSECC().
47549 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
47550 if (Subtarget.hasAVX512()) {
47551 SDValue FSetCC =
47552 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
47553 DAG.getTargetConstant(x86cc, DL, MVT::i8));
47554 // Need to fill with zeros to ensure the bitcast will produce zeroes
47555 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
47556 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
47557 DAG.getConstant(0, DL, MVT::v16i1),
47558 FSetCC, DAG.getIntPtrConstant(0, DL));
47559 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
47560 N->getSimpleValueType(0));
47562 SDValue OnesOrZeroesF =
47563 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
47564 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
47566 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
47567 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
47569 if (is64BitFP && !Subtarget.is64Bit()) {
47570 // On a 32-bit target, we cannot bitcast the 64-bit float to a
47571 // 64-bit integer, since that's not a legal type. Since
47572 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
47573 // bits, but can do this little dance to extract the lowest 32 bits
47574 // and work with those going forward.
47575 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
47576 OnesOrZeroesF);
47577 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
47578 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
47579 Vector32, DAG.getIntPtrConstant(0, DL));
47580 IntVT = MVT::i32;
47583 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
47584 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
47585 DAG.getConstant(1, DL, IntVT));
47586 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
47587 ANDed);
47588 return OneBitOfTruth;
47593 return SDValue();
47596 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
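/// ANDNP computes ~X & Y directly, so folding the explicit NOT away avoids
/// materializing the all-ones constant for the XOR.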
47597 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
47598 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
47600 MVT VT = N->getSimpleValueType(0);
47601 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
47602 return SDValue();
47604 SDValue X, Y;
47605 SDValue N0 = N->getOperand(0);
47606 SDValue N1 = N->getOperand(1);
47608 if (SDValue Not = IsNOT(N0, DAG)) {
47609 X = Not;
47610 Y = N1;
47611 } else if (SDValue Not = IsNOT(N1, DAG)) {
47612 X = Not;
47613 Y = N0;
47614 } else
47615 return SDValue();
47617 X = DAG.getBitcast(VT, X);
47618 Y = DAG.getBitcast(VT, Y);
47619 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
47622 /// Try to fold:
47623 /// and (vector_shuffle<Z,...,Z>
47624 /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
47625 /// ->
47626 /// andnp (vector_shuffle<Z,...,Z>
47627 /// (insert_vector_elt undef, X, Z), undef), Y
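/// i.e. drop the explicit NOT of the inserted scalar and let ANDNP's built-in
/// inversion of its first operand provide it after the splat.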
47628 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
47629 const X86Subtarget &Subtarget) {
47630 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
47632 EVT VT = N->getValueType(0);
47633 // Do not split 256- and 512-bit vectors with SSE2, as they overwrite the
47634 // original value and require extra moves.
47635 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47636 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
47637 return SDValue();
47639 auto GetNot = [&DAG](SDValue V) {
47640 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
47641 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
47642 // end-users are ISD::AND, including cases such as
47643 // (and(extract_vector_element(SVN), Y)).
47644 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
47645 !SVN->getOperand(1).isUndef()) {
47646 return SDValue();
47648 SDValue IVEN = SVN->getOperand(0);
47649 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
47650 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
47651 return SDValue();
47652 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
47653 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
47654 return SDValue();
47655 SDValue Src = IVEN.getOperand(1);
47656 if (SDValue Not = IsNOT(Src, DAG)) {
47657 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
47658 SDValue NotIVEN =
47659 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
47660 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
47661 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
47662 SVN->getOperand(1), SVN->getMask());
47664 return SDValue();
47667 SDValue X, Y;
47668 SDValue N0 = N->getOperand(0);
47669 SDValue N1 = N->getOperand(1);
47671 if (SDValue Not = GetNot(N0)) {
47672 X = Not;
47673 Y = N1;
47674 } else if (SDValue Not = GetNot(N1)) {
47675 X = Not;
47676 Y = N0;
47677 } else
47678 return SDValue();
47680 X = DAG.getBitcast(VT, X);
47681 Y = DAG.getBitcast(VT, Y);
47682 SDLoc DL(N);
47683 // We do not split for SSE at all, but we need to split vectors for AVX1 and
47684 // AVX2.
47685 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
47686 SDValue LoX, HiX;
47687 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
47688 SDValue LoY, HiY;
47689 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
47690 EVT SplitVT = LoX.getValueType();
47691 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
47692 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
47693 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
47695 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
47698 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
47699 // logical operations, like in the example below.
47700 // or (and (truncate x), (truncate y)),
47701 // (xor (truncate z), build_vector (constants))
47702 // Given a target type \p VT, we generate
47703 // or (and x, y), (xor z, zext(build_vector (constants)))
47704 // where x, y and z are of type \p VT. We can do so if each operand is either
47705 // a truncate from VT or can itself be recursively promoted; the second
47706 // operand may also be a vector of constants.
47707 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
47708 unsigned Depth) {
47709 // Limit recursion to avoid excessive compile times.
47710 if (Depth >= SelectionDAG::MaxRecursionDepth)
47711 return SDValue();
47713 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
47714 N->getOpcode() != ISD::OR)
47715 return SDValue();
47717 SDValue N0 = N->getOperand(0);
47718 SDValue N1 = N->getOperand(1);
47719 SDLoc DL(N);
47721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47722 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
47723 return SDValue();
47725 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
47726 N0 = NN0;
47727 else {
47728 // The Left side has to be a trunc.
47729 if (N0.getOpcode() != ISD::TRUNCATE)
47730 return SDValue();
47732 // The type of the truncated inputs.
47733 if (N0.getOperand(0).getValueType() != VT)
47734 return SDValue();
47736 N0 = N0.getOperand(0);
47739 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
47740 N1 = NN1;
47741 else {
47742 // The right side has to be a 'trunc' or a constant vector.
47743 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
47744 N1.getOperand(0).getValueType() == VT;
47745 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
47746 return SDValue();
47748 if (RHSTrunc)
47749 N1 = N1.getOperand(0);
47750 else
47751 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
47754 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
47757 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
47758 // register. In most cases we actually compare or select YMM-sized registers
47759 // and mixing the two types creates horrible code. This method optimizes
47760 // some of the transition sequences.
47761 // Even with AVX-512 this is still useful for removing casts around logical
47762 // operations on vXi1 mask types.
47763 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
47764 const X86Subtarget &Subtarget) {
47765 EVT VT = N->getValueType(0);
47766 assert(VT.isVector() && "Expected vector type");
47768 SDLoc DL(N);
47769 assert((N->getOpcode() == ISD::ANY_EXTEND ||
47770 N->getOpcode() == ISD::ZERO_EXTEND ||
47771 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
47773 SDValue Narrow = N->getOperand(0);
47774 EVT NarrowVT = Narrow.getValueType();
47776 // Generate the wide operation.
47777 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
47778 if (!Op)
47779 return SDValue();
47780 switch (N->getOpcode()) {
47781 default: llvm_unreachable("Unexpected opcode");
47782 case ISD::ANY_EXTEND:
47783 return Op;
47784 case ISD::ZERO_EXTEND:
47785 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
47786 case ISD::SIGN_EXTEND:
47787 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
47788 Op, DAG.getValueType(NarrowVT));
47792 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
47793 unsigned FPOpcode;
47794 switch (Opcode) {
47795 default: llvm_unreachable("Unexpected input node for FP logic conversion");
47796 case ISD::AND: FPOpcode = X86ISD::FAND; break;
47797 case ISD::OR: FPOpcode = X86ISD::FOR; break;
47798 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
47800 return FPOpcode;
47803 /// If both input operands of a logic op are being cast from floating-point
47804 /// types or FP compares, try to convert this into a floating-point logic node
47805 /// to avoid unnecessary moves from SSE to integer registers.
47806 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
47807 TargetLowering::DAGCombinerInfo &DCI,
47808 const X86Subtarget &Subtarget) {
47809 EVT VT = N->getValueType(0);
47810 SDValue N0 = N->getOperand(0);
47811 SDValue N1 = N->getOperand(1);
47812 SDLoc DL(N);
47814 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
47815 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
47816 return SDValue();
47818 SDValue N00 = N0.getOperand(0);
47819 SDValue N10 = N1.getOperand(0);
47820 EVT N00Type = N00.getValueType();
47821 EVT N10Type = N10.getValueType();
47823 // Ensure that both types are the same and are legal scalar fp types.
47824 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
47825 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
47826 (Subtarget.hasFP16() && N00Type == MVT::f16)))
47827 return SDValue();
47829 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
47830 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
47831 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
47832 return DAG.getBitcast(VT, FPLogic);
47835 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
47836 !N1.hasOneUse())
47837 return SDValue();
47839 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
47840 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
47842 // The vector ISA for FP predicates is incomplete before AVX, so converting
47843 // COMIS* to CMPS* may not be a win before AVX.
47844 if (!Subtarget.hasAVX() &&
47845 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
47846 return SDValue();
47848 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
47849 // and vector logic:
47850 // logic (setcc N00, N01), (setcc N10, N11) -->
47851 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
47852 unsigned NumElts = 128 / N00Type.getSizeInBits();
47853 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
47854 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47855 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
47856 SDValue N01 = N0.getOperand(1);
47857 SDValue N11 = N1.getOperand(1);
47858 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
47859 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
47860 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
47861 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
47862 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
47863 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
47864 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
47865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
47868 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
47869 // to reduce XMM->GPR traffic.
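// e.g. (or (movmsk X), (movmsk Y)) -> (movmsk (or X, Y)): one MOVMSK instead
// of two, with the bitwise op staying in the vector domain.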
47870 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
47871 unsigned Opc = N->getOpcode();
47872 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47873 "Unexpected bit opcode");
47875 SDValue N0 = N->getOperand(0);
47876 SDValue N1 = N->getOperand(1);
47878 // Both operands must be single use MOVMSK.
47879 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
47880 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
47881 return SDValue();
47883 SDValue Vec0 = N0.getOperand(0);
47884 SDValue Vec1 = N1.getOperand(0);
47885 EVT VecVT0 = Vec0.getValueType();
47886 EVT VecVT1 = Vec1.getValueType();
47888 // Both MOVMSK operands must be from vectors of the same size and same element
47889 // size, but it's OK for them to differ between fp and int.
47890 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
47891 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
47892 return SDValue();
47894 SDLoc DL(N);
47895 unsigned VecOpc =
47896 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
47897 SDValue Result =
47898 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
47899 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47902 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
47903 // NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
47904 // handles in InstCombine.
47905 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
47906 unsigned Opc = N->getOpcode();
47907 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47908 "Unexpected bit opcode");
47910 SDValue N0 = N->getOperand(0);
47911 SDValue N1 = N->getOperand(1);
47912 EVT VT = N->getValueType(0);
47914 // Both operands must be single use.
47915 if (!N0.hasOneUse() || !N1.hasOneUse())
47916 return SDValue();
47918 // Search for matching shifts.
47919 SDValue BC0 = peekThroughOneUseBitcasts(N0);
47920 SDValue BC1 = peekThroughOneUseBitcasts(N1);
47922 unsigned BCOpc = BC0.getOpcode();
47923 EVT BCVT = BC0.getValueType();
47924 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
47925 return SDValue();
47927 switch (BCOpc) {
47928 case X86ISD::VSHLI:
47929 case X86ISD::VSRLI:
47930 case X86ISD::VSRAI: {
47931 if (BC0.getOperand(1) != BC1.getOperand(1))
47932 return SDValue();
47934 SDLoc DL(N);
47935 SDValue BitOp =
47936 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
47937 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
47938 return DAG.getBitcast(VT, Shift);
47942 return SDValue();
47945 // Attempt to fold:
47946 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
47947 // TODO: Handle PACKUS.
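// This is only attempted when every source element is known to be all-sign-bits
// (see the ComputeNumSignBits checks below): PACKSS then acts as a plain
// truncation, so the bitwise op commutes with the pack.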
47948 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
47949 unsigned Opc = N->getOpcode();
47950 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47951 "Unexpected bit opcode");
47953 SDValue N0 = N->getOperand(0);
47954 SDValue N1 = N->getOperand(1);
47955 EVT VT = N->getValueType(0);
47957 // Both operands must be single use.
47958 if (!N0.hasOneUse() || !N1.hasOneUse())
47959 return SDValue();
47961 // Search for matching packs.
47962 N0 = peekThroughOneUseBitcasts(N0);
47963 N1 = peekThroughOneUseBitcasts(N1);
47965 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
47966 return SDValue();
47968 MVT DstVT = N0.getSimpleValueType();
47969 if (DstVT != N1.getSimpleValueType())
47970 return SDValue();
47972 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
47973 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
47975 // Limit to allsignbits packing.
47976 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
47977 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
47978 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
47979 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
47980 return SDValue();
47982 SDLoc DL(N);
47983 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
47984 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
47985 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
47988 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
47989 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
47990 /// with a shift-right to eliminate loading the vector constant mask value.
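/// e.g. (and (sext (setcc ...)), 1) becomes (srl (sext (setcc ...)), EltBits-1):
/// the sign bit is moved down instead of loading a <1,1,...,1> mask constant.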
47991 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
47992 const X86Subtarget &Subtarget) {
47993 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
47994 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
47995 EVT VT = Op0.getValueType();
47996 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
47997 return SDValue();
47999 // Try to convert an "is positive" signbit masking operation into arithmetic
48000 // shift and "andn". This saves a materialization of a -1 vector constant.
48001 // The "is negative" variant should be handled more generally because it only
48002 // requires "and" rather than "andn":
48003 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48005 // This is limited to the original type to avoid producing even more bitcasts.
48006 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48007 // will be profitable.
48008 if (N->getValueType(0) == VT &&
48009 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48010 SDValue X, Y;
48011 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48012 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48013 X = Op1.getOperand(0);
48014 Y = Op0;
48015 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48016 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48017 X = Op0.getOperand(0);
48018 Y = Op1;
48020 if (X && Y) {
48021 SDLoc DL(N);
48022 SDValue Sra =
48023 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48024 VT.getScalarSizeInBits() - 1, DAG);
48025 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48029 APInt SplatVal;
48030 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48031 return SDValue();
48033 // Don't prevent creation of ANDN.
48034 if (isBitwiseNot(Op0))
48035 return SDValue();
48037 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48038 return SDValue();
48040 unsigned EltBitWidth = VT.getScalarSizeInBits();
48041 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48042 return SDValue();
48044 SDLoc DL(N);
48045 unsigned ShiftVal = SplatVal.countr_one();
48046 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48047 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48048 return DAG.getBitcast(N->getValueType(0), Shift);
48051 // Get the index node from the lowered DAG of a GEP IR instruction with one
48052 // indexing dimension.
48053 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48054 if (Ld->isIndexed())
48055 return SDValue();
48057 SDValue Base = Ld->getBasePtr();
48059 if (Base.getOpcode() != ISD::ADD)
48060 return SDValue();
48062 SDValue ShiftedIndex = Base.getOperand(0);
48064 if (ShiftedIndex.getOpcode() != ISD::SHL)
48065 return SDValue();
48067 return ShiftedIndex.getOperand(0);
48071 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48072 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
48073 switch (VT.getSizeInBits()) {
48074 default: return false;
48075 case 64: return Subtarget.is64Bit();
48076 case 32: return true;
48079 return false;
48082 // This function recognizes cases where the X86 BZHI instruction can replace
48083 // an 'and-load' sequence.
48084 // In case of loading integer value from an array of constants which is defined
48085 // as follows:
48087 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48089 // then applying a bitwise AND between the loaded value and another input
48090 // is equivalent to performing BZHI (zero high bits) on that other input,
48091 // using the same index as the load.
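// i.e. x & array[idx] == x & ((1 << idx) - 1) == bzhi(x, idx).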
48092 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48093 const X86Subtarget &Subtarget) {
48094 MVT VT = Node->getSimpleValueType(0);
48095 SDLoc dl(Node);
48097 // Check if subtarget has BZHI instruction for the node's type
48098 if (!hasBZHI(Subtarget, VT))
48099 return SDValue();
48101 // Try matching the pattern for both operands.
48102 for (unsigned i = 0; i < 2; i++) {
48103 SDValue N = Node->getOperand(i);
48104 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48106 // Bail out if the operand is not a load instruction.
48107 if (!Ld)
48108 return SDValue();
48110 const Value *MemOp = Ld->getMemOperand()->getValue();
48112 if (!MemOp)
48113 return SDValue();
48115 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48116 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48117 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48119 Constant *Init = GV->getInitializer();
48120 Type *Ty = Init->getType();
48121 if (!isa<ConstantDataArray>(Init) ||
48122 !Ty->getArrayElementType()->isIntegerTy() ||
48123 Ty->getArrayElementType()->getScalarSizeInBits() !=
48124 VT.getSizeInBits() ||
48125 Ty->getArrayNumElements() >
48126 Ty->getArrayElementType()->getScalarSizeInBits())
48127 continue;
48129 // Check if the array's constant elements are suitable to our case.
48130 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48131 bool ConstantsMatch = true;
48132 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48133 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48134 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48135 ConstantsMatch = false;
48136 break;
48139 if (!ConstantsMatch)
48140 continue;
48142 // Do the transformation (for a 32-bit type):
48143 // (and (load arr[idx]), inp)
48144 // -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
48145 // which will then be selected as a single BZHI instruction.
48146 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48147 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48149 // Get the Node which indexes into the array.
48150 SDValue Index = getIndexFromUnindexedLoad(Ld);
48151 if (!Index)
48152 return SDValue();
48153 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48155 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48156 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48158 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48159 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48161 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48166 return SDValue();
48169 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
48170 // where C is a mask containing the same number of bits as the setcc and
48171 // where the setcc freely zeroes the upper bits of the k-register. We can
48172 // replace the undefs in the concat with 0s and remove the AND. This mainly
48173 // helps with v2i1/v4i1 setccs being cast to scalar.
48174 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
48175 const X86Subtarget &Subtarget) {
48176 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
48178 EVT VT = N->getValueType(0);
48180 // Make sure this is an AND with constant. We will check the value of the
48181 // constant later.
48182 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
48183 if (!C1)
48184 return SDValue();
48186 // This is implied by the ConstantSDNode.
48187 assert(!VT.isVector() && "Expected scalar VT!");
48189 SDValue Src = N->getOperand(0);
48190 if (!Src.hasOneUse())
48191 return SDValue();
48193 // (Optionally) peek through any_extend().
48194 if (Src.getOpcode() == ISD::ANY_EXTEND) {
48195 if (!Src.getOperand(0).hasOneUse())
48196 return SDValue();
48197 Src = Src.getOperand(0);
48200 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
48201 return SDValue();
48203 Src = Src.getOperand(0);
48204 EVT SrcVT = Src.getValueType();
48206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48207 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
48208 !TLI.isTypeLegal(SrcVT))
48209 return SDValue();
48211 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
48212 return SDValue();
48214 // We only care about the first subvector of the concat; we expect the
48215 // other subvectors to be masked away by the AND if we make the change.
48216 SDValue SubVec = Src.getOperand(0);
48217 EVT SubVecVT = SubVec.getValueType();
48219 // The RHS of the AND should be a mask with as many bits as SubVec.
48220 if (!TLI.isTypeLegal(SubVecVT) ||
48221 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
48222 return SDValue();
48224 // The first subvector should be a setcc with a legal result type or an
48225 // AND containing at least one setcc with a legal result type.
48226 auto IsLegalSetCC = [&](SDValue V) {
48227 if (V.getOpcode() != ISD::SETCC)
48228 return false;
48229 EVT SetccVT = V.getOperand(0).getValueType();
48230 if (!TLI.isTypeLegal(SetccVT) ||
48231 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
48232 return false;
48233 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
48234 return false;
48235 return true;
48237 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
48238 (IsLegalSetCC(SubVec.getOperand(0)) ||
48239 IsLegalSetCC(SubVec.getOperand(1))))))
48240 return SDValue();
48242 // We passed all the checks. Rebuild the concat_vectors with zeroes
48243 // and cast it back to VT.
48244 SDLoc dl(N);
48245 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
48246 DAG.getConstant(0, dl, SubVecVT));
48247 Ops[0] = SubVec;
48248 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
48249 Ops);
48250 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
48251 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
48254 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
48255 SDValue OpMustEq, SDValue Op, unsigned Depth) {
48256 // We don't want to go crazy with the recursion here. This isn't a super
48257 // important optimization.
48258 static constexpr unsigned kMaxDepth = 2;
48260 // Only do this re-ordering if op has one use.
48261 if (!Op.hasOneUse())
48262 return SDValue();
48264 SDLoc DL(Op);
48265 // If we hit another associative op, recurse further.
48266 if (Op.getOpcode() == Opc) {
48267 // Done recursing.
48268 if (Depth++ >= kMaxDepth)
48269 return SDValue();
48271 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48272 if (SDValue R =
48273 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
48274 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
48275 Op.getOperand(1 - OpIdx));
48277 } else if (Op.getOpcode() == ISD::SUB) {
48278 if (Opc == ISD::AND) {
48279 // BLSI: (and x, (sub 0, x))
48280 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
48281 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48283 // Opc must be ISD::AND or ISD::XOR
48284 // BLSR: (and x, (sub x, 1))
48285 // BLSMSK: (xor x, (sub x, 1))
48286 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48287 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48289 } else if (Op.getOpcode() == ISD::ADD) {
48290 // Opc must be ISD::AND or ISD::XOR
48291 // BLSR: (and x, (add x, -1))
48292 // BLSMSK: (xor x, (add x, -1))
48293 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48294 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48296 return SDValue();
48299 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
48300 const X86Subtarget &Subtarget) {
48301 EVT VT = N->getValueType(0);
48302 // Make sure this node is a candidate for BMI instructions.
48303 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
48304 (VT != MVT::i32 && VT != MVT::i64))
48305 return SDValue();
48307 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
48309 // Try and match LHS and RHS.
48310 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48311 if (SDValue OpMatch =
48312 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
48313 N->getOperand(1 - OpIdx), 0))
48314 return OpMatch;
48315 return SDValue();
48318 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
48319 TargetLowering::DAGCombinerInfo &DCI,
48320 const X86Subtarget &Subtarget) {
48321 SDValue N0 = N->getOperand(0);
48322 SDValue N1 = N->getOperand(1);
48323 EVT VT = N->getValueType(0);
48324 SDLoc dl(N);
48325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48327 // If this is SSE1 only convert to FAND to avoid scalarization.
48328 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
48329 return DAG.getBitcast(MVT::v4i32,
48330 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
48331 DAG.getBitcast(MVT::v4f32, N0),
48332 DAG.getBitcast(MVT::v4f32, N1)));
48335 // Use a 32-bit and+zext if upper bits known zero.
48336 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
48337 APInt HiMask = APInt::getHighBitsSet(64, 32);
48338 if (DAG.MaskedValueIsZero(N1, HiMask) ||
48339 DAG.MaskedValueIsZero(N0, HiMask)) {
48340 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
48341 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
48342 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
48343 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
48347 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
48348 // TODO: Support multiple SrcOps.
48349 if (VT == MVT::i1) {
48350 SmallVector<SDValue, 2> SrcOps;
48351 SmallVector<APInt, 2> SrcPartials;
48352 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
48353 SrcOps.size() == 1) {
48354 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
48355 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
48356 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
48357 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
48358 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
48359 if (Mask) {
48360 assert(SrcPartials[0].getBitWidth() == NumElts &&
48361 "Unexpected partial reduction mask");
48362 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
48363 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
48364 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
48369 // InstCombine converts:
48370 // `(-x << C0) & C1`
48371 // to
48372 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
48373 // This saves an IR instruction, but on x86 the neg/shift version is
48374 // preferable, so undo the transform.
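// e.g. with C0 = 5 and C1 = 0xE0, InstCombine produces (x * 0xE0) & 0xE0; this
// combine turns it back into ((0 - x) << 5) & 0xE0.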
48376 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
48377 // TODO: We don't actually need a splat for this, we just need the checks to
48378 // hold for each element.
48379 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
48380 /*AllowTruncation*/ false);
48381 ConstantSDNode *N01C =
48382 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
48383 /*AllowTruncation*/ false);
48384 if (N1C && N01C) {
48385 const APInt &MulC = N01C->getAPIntValue();
48386 const APInt &AndC = N1C->getAPIntValue();
48387 APInt MulCLowBit = MulC & (-MulC);
48388 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
48389 (MulCLowBit + MulC).isPowerOf2()) {
48390 SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
48391 N0.getOperand(0));
48392 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
48393 assert(MulCLowBitLog != -1 &&
48394 "Isolated lowbit is somehow not a power of 2!");
48395 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
48396 DAG.getConstant(MulCLowBitLog, dl, VT));
48397 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
48402 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
48403 return V;
48405 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
48406 return R;
48408 if (SDValue R = combineBitOpWithShift(N, DAG))
48409 return R;
48411 if (SDValue R = combineBitOpWithPACK(N, DAG))
48412 return R;
48414 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
48415 return FPLogic;
48417 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
48418 return R;
48420 if (DCI.isBeforeLegalizeOps())
48421 return SDValue();
48423 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
48424 return R;
48426 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
48427 return R;
48429 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
48430 return ShiftRight;
48432 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
48433 return R;
48435 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
48436 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
48437 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
48438 if (VT.isVector() && getTargetConstantFromNode(N1)) {
48439 unsigned Opc0 = N0.getOpcode();
48440 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
48441 getTargetConstantFromNode(N0.getOperand(1)) &&
48442 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
48443 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
48444 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
48445 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
48449 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant,
48450 // to avoid a slow variable shift (moving the shift amount to ECX etc.).
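// BT places bit Y of X in the carry flag, so neither the variable shift nor the
// AND mask is needed; COND_B/COND_AE then test that (possibly inverted) bit.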
48451 if (isOneConstant(N1) && N0->hasOneUse()) {
48452 SDValue Src = N0;
48453 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
48454 Src.getOpcode() == ISD::TRUNCATE) &&
48455 Src.getOperand(0)->hasOneUse())
48456 Src = Src.getOperand(0);
48457 bool ContainsNOT = false;
48458 X86::CondCode X86CC = X86::COND_B;
48459 // Peek through AND(NOT(SRL(X,Y)),1).
48460 if (isBitwiseNot(Src)) {
48461 Src = Src.getOperand(0);
48462 X86CC = X86::COND_AE;
48463 ContainsNOT = true;
48465 if (Src.getOpcode() == ISD::SRL &&
48466 !isa<ConstantSDNode>(Src.getOperand(1))) {
48467 SDValue BitNo = Src.getOperand(1);
48468 Src = Src.getOperand(0);
48469 // Peek through AND(SRL(NOT(X),Y),1).
48470 if (isBitwiseNot(Src)) {
48471 Src = Src.getOperand(0);
48472 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
48473 ContainsNOT = true;
48475 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
48476 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
48477 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
48478 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
48482 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
48483 // Attempt to recursively combine a bitmask AND with shuffles.
48484 SDValue Op(N, 0);
48485 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48486 return Res;
48488 // If either operand is a constant mask, then only the elements that aren't
48489 // zero are actually demanded by the other operand.
48490 auto GetDemandedMasks = [&](SDValue Op) {
48491 APInt UndefElts;
48492 SmallVector<APInt> EltBits;
48493 int NumElts = VT.getVectorNumElements();
48494 int EltSizeInBits = VT.getScalarSizeInBits();
48495 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
48496 APInt DemandedElts = APInt::getAllOnes(NumElts);
48497 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
48498 EltBits)) {
48499 DemandedBits.clearAllBits();
48500 DemandedElts.clearAllBits();
48501 for (int I = 0; I != NumElts; ++I) {
48502 if (UndefElts[I]) {
48503 // We can't assume an undef src element gives an undef dst - the
48504 // other src might be zero.
48505 DemandedBits.setAllBits();
48506 DemandedElts.setBit(I);
48507 } else if (!EltBits[I].isZero()) {
48508 DemandedBits |= EltBits[I];
48509 DemandedElts.setBit(I);
48513 return std::make_pair(DemandedBits, DemandedElts);
48515 APInt Bits0, Elts0;
48516 APInt Bits1, Elts1;
48517 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
48518 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
48520 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
48521 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
48522 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
48523 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
48524 if (N->getOpcode() != ISD::DELETED_NODE)
48525 DCI.AddToWorklist(N);
48526 return SDValue(N, 0);
48529 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
48530 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
48531 if (NewN0 || NewN1)
48532 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
48533 NewN1 ? NewN1 : N1);
48536 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
48537 if ((VT.getScalarSizeInBits() % 8) == 0 &&
48538 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48539 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
48540 SDValue BitMask = N1;
48541 SDValue SrcVec = N0.getOperand(0);
48542 EVT SrcVecVT = SrcVec.getValueType();
48544 // Check that the constant bitmask masks whole bytes.
48545 APInt UndefElts;
48546 SmallVector<APInt, 64> EltBits;
48547 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
48548 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
48549 llvm::all_of(EltBits, [](const APInt &M) {
48550 return M.isZero() || M.isAllOnes();
48551 })) {
48552 unsigned NumElts = SrcVecVT.getVectorNumElements();
48553 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
48554 unsigned Idx = N0.getConstantOperandVal(1);
48556 // Create a root shuffle mask from the byte mask and the extracted index.
48557 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
48558 for (unsigned i = 0; i != Scale; ++i) {
48559 if (UndefElts[i])
48560 continue;
48561 int VecIdx = Scale * Idx + i;
48562 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
48565 if (SDValue Shuffle = combineX86ShufflesRecursively(
48566 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
48567 X86::MaxShuffleCombineDepth,
48568 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
48569 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
48570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
48571 N0.getOperand(1));
48575 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
48576 return R;
48578 return SDValue();
48581 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
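// Reusing C through ANDNP's built-in inversion means only one mask constant
// needs to be materialized, instead of both C and ~C.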
48582 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
48583 const X86Subtarget &Subtarget) {
48584 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
48586 MVT VT = N->getSimpleValueType(0);
48587 unsigned EltSizeInBits = VT.getScalarSizeInBits();
48588 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
48589 return SDValue();
48591 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
48592 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
48593 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
48594 return SDValue();
48596 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
48597 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
48598 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
48599 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
48600 return SDValue();
48602 // Attempt to extract constant byte masks.
48603 APInt UndefElts0, UndefElts1;
48604 SmallVector<APInt, 32> EltBits0, EltBits1;
48605 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
48606 false, false))
48607 return SDValue();
48608 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
48609 false, false))
48610 return SDValue();
48612 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
48613 // TODO - add UNDEF elts support.
48614 if (UndefElts0[i] || UndefElts1[i])
48615 return SDValue();
48616 if (EltBits0[i] != ~EltBits1[i])
48617 return SDValue();
48620 SDLoc DL(N);
48622 if (useVPTERNLOG(Subtarget, VT)) {
48623 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
48624 // VPTERNLOG is only available for vXi32/vXi64 types.
48625 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
48626 MVT OpVT =
48627 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
48628 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
48629 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
48630 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
48631 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
48632 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
48633 DAG, Subtarget);
48634 return DAG.getBitcast(VT, Res);
48637 SDValue X = N->getOperand(0);
48638 SDValue Y =
48639 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
48640 DAG.getBitcast(VT, N1.getOperand(0)));
48641 return DAG.getNode(ISD::OR, DL, VT, X, Y);
48644 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
48645 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
48646 if (N->getOpcode() != ISD::OR)
48647 return false;
48649 SDValue N0 = N->getOperand(0);
48650 SDValue N1 = N->getOperand(1);
48652 // Canonicalize AND to LHS.
48653 if (N1.getOpcode() == ISD::AND)
48654 std::swap(N0, N1);
48656 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
48657 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
48658 return false;
48660 Mask = N1.getOperand(0);
48661 X = N1.getOperand(1);
48663 // Check to see if the mask appeared in both the AND and ANDNP.
48664 if (N0.getOperand(0) == Mask)
48665 Y = N0.getOperand(1);
48666 else if (N0.getOperand(1) == Mask)
48667 Y = N0.getOperand(0);
48668 else
48669 return false;
48671 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
48672 // ANDNP combine allows other combines to happen that prevent matching.
48673 return true;
48676 // Try to fold:
48677 // (or (and (m, y), (pandn m, x)))
48678 // into:
48679 // (vselect m, x, y)
48680 // As a special case, try to fold:
48681 // (or (and (m, (sub 0, x)), (pandn m, x)))
48682 // into:
48683 // (sub (xor X, M), M)
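/// The special case is a conditional negate: where M is all-ones, (xor X, M)
/// is ~X and subtracting M (== -1) gives ~X + 1 == -X; where M is zero the
/// result is just X.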
48684 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
48685 const X86Subtarget &Subtarget) {
48686 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
48688 EVT VT = N->getValueType(0);
48689 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48690 (VT.is256BitVector() && Subtarget.hasInt256())))
48691 return SDValue();
48693 SDValue X, Y, Mask;
48694 if (!matchLogicBlend(N, X, Y, Mask))
48695 return SDValue();
48697 // Validate that X, Y, and Mask are bitcasts, and see through them.
48698 Mask = peekThroughBitcasts(Mask);
48699 X = peekThroughBitcasts(X);
48700 Y = peekThroughBitcasts(Y);
48702 EVT MaskVT = Mask.getValueType();
48703 unsigned EltBits = MaskVT.getScalarSizeInBits();
48705 // TODO: Attempt to handle floating point cases as well?
48706 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
48707 return SDValue();
48709 SDLoc DL(N);
48711 // Attempt to combine to conditional negate: (sub (xor X, M), M)
48712 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
48713 DAG, Subtarget))
48714 return Res;
48716 // PBLENDVB is only available on SSE 4.1.
48717 if (!Subtarget.hasSSE41())
48718 return SDValue();
48720 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
48721 if (Subtarget.hasVLX())
48722 return SDValue();
48724 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
48726 X = DAG.getBitcast(BlendVT, X);
48727 Y = DAG.getBitcast(BlendVT, Y);
48728 Mask = DAG.getBitcast(BlendVT, Mask);
48729 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
48730 return DAG.getBitcast(VT, Mask);
48733 // Helper function for combineOrCmpEqZeroToCtlzSrl
48734 // Transforms:
48735 // seteq(cmp x, 0)
48736 // into:
48737 // srl(ctlz x), log2(bitsize(x))
48738 // Input pattern is checked by caller.
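// With LZCNT semantics ctlz(0) == bitsize(x), so shifting right by
// log2(bitsize(x)) yields 1 exactly when x == 0 and 0 otherwise.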
48739 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
48740 SDValue Cmp = Op.getOperand(1);
48741 EVT VT = Cmp.getOperand(0).getValueType();
48742 unsigned Log2b = Log2_32(VT.getSizeInBits());
48743 SDLoc dl(Op);
48744 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
48745 // The result of the shift is true or false, and on X86, the 32-bit
48746 // encoding of shr and lzcnt is more desirable.
48747 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
48748 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
48749 DAG.getConstant(Log2b, dl, MVT::i8));
48750 return Scc;
48753 // Try to transform:
48754 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
48755 // into:
48756 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
48757 // Will also attempt to match more generic cases, eg:
48758 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
48759 // Only applies if the target supports the FastLZCNT feature.
48760 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
48761 TargetLowering::DAGCombinerInfo &DCI,
48762 const X86Subtarget &Subtarget) {
48763 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
48764 return SDValue();
48766 auto isORCandidate = [](SDValue N) {
48767 return (N->getOpcode() == ISD::OR && N->hasOneUse());
48770 // Check that the zero extend is to 32 bits or more. The code generated by
48771 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
48772 // instructions to clear the upper bits.
48773 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
48774 !isORCandidate(N->getOperand(0)))
48775 return SDValue();
48777 // Check the node matches: setcc(eq, cmp 0)
48778 auto isSetCCCandidate = [](SDValue N) {
48779 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
48780 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
48781 N->getOperand(1).getOpcode() == X86ISD::CMP &&
48782 isNullConstant(N->getOperand(1).getOperand(1)) &&
48783 N->getOperand(1).getValueType().bitsGE(MVT::i32);
48786 SDNode *OR = N->getOperand(0).getNode();
48787 SDValue LHS = OR->getOperand(0);
48788 SDValue RHS = OR->getOperand(1);
48790 // Save nodes matching or(or, setcc(eq, cmp 0)).
48791 SmallVector<SDNode *, 2> ORNodes;
48792 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
48793 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
48794 ORNodes.push_back(OR);
48795 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
48796 LHS = OR->getOperand(0);
48797 RHS = OR->getOperand(1);
48800 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
48801 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
48802 !isORCandidate(SDValue(OR, 0)))
48803 return SDValue();
48805 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
48806 // to
48807 // or(srl(ctlz),srl(ctlz)).
48808 // The dag combiner can then fold it into:
48809 // srl(or(ctlz, ctlz)).
48810 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
48811 SDValue Ret, NewRHS;
48812 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
48813 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
48815 if (!Ret)
48816 return SDValue();
48818 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
48819 while (!ORNodes.empty()) {
48820 OR = ORNodes.pop_back_val();
48821 LHS = OR->getOperand(0);
48822 RHS = OR->getOperand(1);
48823 // Swap rhs with lhs to match or(setcc(eq, cmp 0), or).
48824 if (RHS->getOpcode() == ISD::OR)
48825 std::swap(LHS, RHS);
48826 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
48827 if (!NewRHS)
48828 return SDValue();
48829 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
48832 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
48835 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
48836 SDValue And1_L, SDValue And1_R,
48837 const SDLoc &DL, SelectionDAG &DAG) {
48838 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
48839 return SDValue();
48840 SDValue NotOp = And0_L->getOperand(0);
48841 if (NotOp == And1_R)
48842 std::swap(And1_R, And1_L);
48843 if (NotOp != And1_L)
48844 return SDValue();
48846 // (~(NotOp) & And0_R) | (NotOp & And1_R)
48847 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
48848 EVT VT = And1_L->getValueType(0);
48849 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
48850 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
48851 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
48852 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
48853 return Xor1;
48856 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
48857 /// equivalent `((x ^ y) & m) ^ y` pattern.
48858 /// This is typically a better representation for targets without a fused
48859 /// "and-not" operation. This function is intended to be called from a
48860 /// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
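/// Worked example (illustrative, i8 values): with m = 0x0F, x = 0xAA, y = 0x55:
///   (m & x) | (~m & y)  =  0x0A | 0x50           =  0x5A
///   ((x ^ y) & m) ^ y   =  (0xFF & 0x0F) ^ 0x55  =  0x5A
/// The rewritten form needs no NOT, which is why it is preferable when a fused
/// and-not instruction (e.g. ANDN) is unavailable.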
48861 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
48862 // Note that masked-merge variants using XOR or ADD expressions are
48863 // normalized to OR by InstCombine so we only check for OR.
48864 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
48865 SDValue N0 = Node->getOperand(0);
48866 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
48867 return SDValue();
48868 SDValue N1 = Node->getOperand(1);
48869 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
48870 return SDValue();
48872 SDLoc DL(Node);
48873 SDValue N00 = N0->getOperand(0);
48874 SDValue N01 = N0->getOperand(1);
48875 SDValue N10 = N1->getOperand(0);
48876 SDValue N11 = N1->getOperand(1);
48877 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
48878 return Result;
48879 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
48880 return Result;
48881 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
48882 return Result;
48883 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
48884 return Result;
48885 return SDValue();
48888 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
48889 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
48890 /// with CMP+{ADC, SBB}.
48891 /// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
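/// For instance (illustrative), "x + (a < b)" with unsigned a and b can lower
/// to:
///   cmp a, b      ; sets CF when a < b
///   adc x, 0      ; folds the carry into x
/// instead of materializing the setcc result in a register first.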
48892 static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
48893 SDValue X, SDValue Y,
48894 SelectionDAG &DAG,
48895 bool ZeroSecondOpOnly = false) {
48896 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
48897 return SDValue();
48899 // Look through a one-use zext.
48900 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
48901 Y = Y.getOperand(0);
48903 X86::CondCode CC;
48904 SDValue EFLAGS;
48905 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
48906 CC = (X86::CondCode)Y.getConstantOperandVal(0);
48907 EFLAGS = Y.getOperand(1);
48908 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
48909 Y.hasOneUse()) {
48910 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
48913 if (!EFLAGS)
48914 return SDValue();
48916 // If X is -1 or 0, then we have an opportunity to avoid constants required in
48917 // the general case below.
48918 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
48919 if (ConstantX && !ZeroSecondOpOnly) {
48920 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
48921 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
48922 // This is a complicated way to get -1 or 0 from the carry flag:
48923 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48924 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48925 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48926 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48927 EFLAGS);
48930 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
48931 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
48932 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
48933 EFLAGS.getValueType().isInteger() &&
48934 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48935 // Swap the operands of a SUB, and we have the same pattern as above.
48936 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
48937 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
48938 SDValue NewSub = DAG.getNode(
48939 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48940 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48941 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
48942 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48943 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48944 NewEFLAGS);
48949 if (CC == X86::COND_B) {
48950 // X + SETB Z --> adc X, 0
48951 // X - SETB Z --> sbb X, 0
48952 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48953 DAG.getVTList(VT, MVT::i32), X,
48954 DAG.getConstant(0, DL, VT), EFLAGS);
48957 if (ZeroSecondOpOnly)
48958 return SDValue();
48960 if (CC == X86::COND_A) {
48961 // Try to convert COND_A into COND_B in an attempt to facilitate
48962 // materializing "setb reg".
48964 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
48965 // cannot take an immediate as its first operand.
48967 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48968 EFLAGS.getValueType().isInteger() &&
48969 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48970 SDValue NewSub =
48971 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48972 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48973 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
48974 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48975 DAG.getVTList(VT, MVT::i32), X,
48976 DAG.getConstant(0, DL, VT), NewEFLAGS);
48980 if (CC == X86::COND_AE) {
48981 // X + SETAE --> sbb X, -1
48982 // X - SETAE --> adc X, -1
48983 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
48984 DAG.getVTList(VT, MVT::i32), X,
48985 DAG.getConstant(-1, DL, VT), EFLAGS);
48988 if (CC == X86::COND_BE) {
48989 // X + SETBE --> sbb X, -1
48990 // X - SETBE --> adc X, -1
48991 // Try to convert COND_BE into COND_AE in an attempt to facilitate
48992 // materializing "setae reg".
48994 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
48995 // cannot take an immediate as its first operand.
48997 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48998 EFLAGS.getValueType().isInteger() &&
48999 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49000 SDValue NewSub =
49001 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49002 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49003 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49004 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49005 DAG.getVTList(VT, MVT::i32), X,
49006 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49010 if (CC != X86::COND_E && CC != X86::COND_NE)
49011 return SDValue();
49013 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49014 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49015 !EFLAGS.getOperand(0).getValueType().isInteger())
49016 return SDValue();
49018 SDValue Z = EFLAGS.getOperand(0);
49019 EVT ZVT = Z.getValueType();
49021 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49022 // the general case below.
49023 if (ConstantX) {
49024 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49025 // fake operands:
49026 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49027 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49028 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49029 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49030 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49031 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49032 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49033 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49034 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49035 SDValue(Neg.getNode(), 1));
49038 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49039 // with fake operands:
49040 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49041 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49042 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49043 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49044 SDValue One = DAG.getConstant(1, DL, ZVT);
49045 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49046 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49047 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49048 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49049 Cmp1.getValue(1));
49053 // (cmp Z, 1) sets the carry flag if Z is 0.
49054 SDValue One = DAG.getConstant(1, DL, ZVT);
49055 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49056 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49058 // Add the flags type for ADC/SBB nodes.
49059 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49061 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49062 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49063 if (CC == X86::COND_NE)
49064 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49065 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49067 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49068 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49069 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49070 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49073 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
49074 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49075 /// with CMP+{ADC, SBB}.
49076 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49077 bool IsSub = N->getOpcode() == ISD::SUB;
49078 SDValue X = N->getOperand(0);
49079 SDValue Y = N->getOperand(1);
49080 EVT VT = N->getValueType(0);
49081 SDLoc DL(N);
49083 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49084 return ADCOrSBB;
49086 // Commute and try again (negate the result for subtracts).
49087 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49088 if (IsSub)
49089 ADCOrSBB =
49090 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
49091 return ADCOrSBB;
49094 return SDValue();
49097 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49098 SelectionDAG &DAG) {
49099 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49100 "Unexpected opcode");
49102 // Delegate to combineAddOrSubToADCOrSBB if we have:
49104 // (xor/or (zero_extend (setcc)) imm)
49106 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49107 // equivalent to a SUB/ADD, respectively.
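// Illustrative reasoning: let b = zero_extend(setcc), so b is 0 or 1. For an
// odd imm, (b xor imm) == imm - b, and for an even imm, (b or imm) == imm + b
// (bit 0 of imm is clear), so the node is really a SUB/ADD of a setcc and can
// reuse the ADC/SBB lowering above.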
49108 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49109 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49110 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49111 bool IsSub = N->getOpcode() == ISD::XOR;
49112 bool N1COdd = N1C->getZExtValue() & 1;
49113 if (IsSub ? N1COdd : !N1COdd) {
49114 SDLoc DL(N);
49115 EVT VT = N->getValueType(0);
49116 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49117 return R;
49122 return SDValue();
49125 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49126 TargetLowering::DAGCombinerInfo &DCI,
49127 const X86Subtarget &Subtarget) {
49128 SDValue N0 = N->getOperand(0);
49129 SDValue N1 = N->getOperand(1);
49130 EVT VT = N->getValueType(0);
49131 SDLoc dl(N);
49132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49134 // If this is SSE1 only convert to FOR to avoid scalarization.
49135 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49136 return DAG.getBitcast(MVT::v4i32,
49137 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49138 DAG.getBitcast(MVT::v4f32, N0),
49139 DAG.getBitcast(MVT::v4f32, N1)));
49142 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49143 // TODO: Support multiple SrcOps.
49144 if (VT == MVT::i1) {
49145 SmallVector<SDValue, 2> SrcOps;
49146 SmallVector<APInt, 2> SrcPartials;
49147 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49148 SrcOps.size() == 1) {
49149 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49150 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49151 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49152 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49153 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49154 if (Mask) {
49155 assert(SrcPartials[0].getBitWidth() == NumElts &&
49156 "Unexpected partial reduction mask");
49157 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49158 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49159 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49160 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49165 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49166 return R;
49168 if (SDValue R = combineBitOpWithShift(N, DAG))
49169 return R;
49171 if (SDValue R = combineBitOpWithPACK(N, DAG))
49172 return R;
49174 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49175 return FPLogic;
49177 if (DCI.isBeforeLegalizeOps())
49178 return SDValue();
49180 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49181 return R;
49183 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49184 return R;
49186 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49187 return R;
49189 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
49190 if ((VT == MVT::i32 || VT == MVT::i64) &&
49191 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49192 isNullConstant(N0.getOperand(0))) {
49193 SDValue Cond = N0.getOperand(1);
49194 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49195 Cond = Cond.getOperand(0);
49197 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49198 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49199 uint64_t Val = CN->getZExtValue();
49200 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49201 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49202 CCode = X86::GetOppositeBranchCondition(CCode);
49203 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49205 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49206 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49207 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49208 return R;
49214 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49215 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49216 // iff the upper elements of the non-shifted arg are zero.
49217 // KUNPCK requires 16+ bool vector elements.
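// For example (illustrative): with v32i1 operands, OR(X, KSHIFTL(Y, 16)) where
// the upper 16 lanes of X are known zero is just the concatenation of the low
// halves of X and Y, i.e. roughly a single KUNPCKWD mask concatenation.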
49218 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49219 unsigned NumElts = VT.getVectorNumElements();
49220 unsigned HalfElts = NumElts / 2;
49221 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49222 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49223 N1.getConstantOperandAPInt(1) == HalfElts &&
49224 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49225 return DAG.getNode(
49226 ISD::CONCAT_VECTORS, dl, VT,
49227 extractSubVector(N0, 0, DAG, dl, HalfElts),
49228 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49230 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49231 N0.getConstantOperandAPInt(1) == HalfElts &&
49232 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49233 return DAG.getNode(
49234 ISD::CONCAT_VECTORS, dl, VT,
49235 extractSubVector(N1, 0, DAG, dl, HalfElts),
49236 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49240 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49241 // Attempt to recursively combine an OR of shuffles.
49242 SDValue Op(N, 0);
49243 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49244 return Res;
49246 // If either operand is a constant mask, then only the elements that aren't
49247 // allones are actually demanded by the other operand.
49248 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49249 APInt UndefElts;
49250 SmallVector<APInt> EltBits;
49251 int NumElts = VT.getVectorNumElements();
49252 int EltSizeInBits = VT.getScalarSizeInBits();
49253 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49254 return false;
49256 APInt DemandedElts = APInt::getZero(NumElts);
49257 for (int I = 0; I != NumElts; ++I)
49258 if (!EltBits[I].isAllOnes())
49259 DemandedElts.setBit(I);
49261 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49263 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49264 if (N->getOpcode() != ISD::DELETED_NODE)
49265 DCI.AddToWorklist(N);
49266 return SDValue(N, 0);
49270 // We should fold "masked merge" patterns when `andn` is not available.
49271 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49272 if (SDValue R = foldMaskedMerge(N, DAG))
49273 return R;
49275 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
49276 return R;
49278 return SDValue();
49281 /// Try to turn tests against the signbit in the form of:
49282 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
49283 /// into:
49284 /// SETGT(X, -1)
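/// Rationale (illustrative, i32 source): (X >> 31) is the sign bit, so
/// ((X >> 31) ^ 1) is 1 exactly when X is non-negative, i.e. when X > -1,
/// which is what SETGT(X, -1) computes directly.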
49285 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
49286 // This is only worth doing if the output type is i8 or i1.
49287 EVT ResultType = N->getValueType(0);
49288 if (ResultType != MVT::i8 && ResultType != MVT::i1)
49289 return SDValue();
49291 SDValue N0 = N->getOperand(0);
49292 SDValue N1 = N->getOperand(1);
49294 // We should be performing an xor against a truncated shift.
49295 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
49296 return SDValue();
49298 // Make sure we are performing an xor against one.
49299 if (!isOneConstant(N1))
49300 return SDValue();
49302 // SetCC on x86 zero extends so only act on this if it's a logical shift.
49303 SDValue Shift = N0.getOperand(0);
49304 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
49305 return SDValue();
49307 // Make sure we are truncating from one of i16, i32 or i64.
49308 EVT ShiftTy = Shift.getValueType();
49309 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
49310 return SDValue();
49312 // Make sure the shift amount extracts the sign bit.
49313 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
49314 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
49315 return SDValue();
49317 // Create a greater-than comparison against -1.
49318 // N.B. Using SETGE against 0 works, but we want a canonical-looking
49319 // comparison; using SETGT matches up with what TranslateX86CC handles.
49320 SDLoc DL(N);
49321 SDValue ShiftOp = Shift.getOperand(0);
49322 EVT ShiftOpTy = ShiftOp.getValueType();
49323 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49324 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
49325 *DAG.getContext(), ResultType);
49326 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
49327 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
49328 if (SetCCResultType != ResultType)
49329 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
49330 return Cond;
49333 /// Turn vector tests of the signbit in the form of:
49334 /// xor (sra X, elt_size(X)-1), -1
49335 /// into:
49336 /// pcmpgt X, -1
49338 /// This should be called before type legalization because the pattern may not
49339 /// persist after that.
49340 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
49341 const X86Subtarget &Subtarget) {
49342 EVT VT = N->getValueType(0);
49343 if (!VT.isSimple())
49344 return SDValue();
49346 switch (VT.getSimpleVT().SimpleTy) {
49347 default: return SDValue();
49348 case MVT::v16i8:
49349 case MVT::v8i16:
49350 case MVT::v4i32:
49351 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
49352 case MVT::v32i8:
49353 case MVT::v16i16:
49354 case MVT::v8i32:
49355 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
49358 // There must be a shift right algebraic before the xor, and the xor must be a
49359 // 'not' operation.
49360 SDValue Shift = N->getOperand(0);
49361 SDValue Ones = N->getOperand(1);
49362 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
49363 !ISD::isBuildVectorAllOnes(Ones.getNode()))
49364 return SDValue();
49366 // The shift should be smearing the sign bit across each vector element.
49367 auto *ShiftAmt =
49368 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
49369 if (!ShiftAmt ||
49370 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
49371 return SDValue();
49373 // Create a greater-than comparison against -1. We don't use the more obvious
49374 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
49375 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
49378 /// Detect patterns of truncation with unsigned saturation:
49380 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
49381 /// Return the source value x to be truncated or SDValue() if the pattern was
49382 /// not matched.
49384 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
49385 /// where C1 >= 0 and C2 is unsigned max of destination type.
49387 /// (truncate (smax (smin (x, C2), C1)) to dest_type)
49388 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
49390 /// These two patterns are equivalent to:
49391 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
49392 /// So return the smax(x, C1) value to be truncated or SDValue() if the
49393 /// pattern was not matched.
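/// Example (illustrative, i32 -> i16): the clamp constant is 0xFFFF, so both
///   trunc(umin(x, 65535))
///   trunc(smin(smax(x, 0), 65535))
/// are recognized; the second form returns smax(x, 0) as the value to truncate
/// with unsigned saturation.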
49394 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49395 const SDLoc &DL) {
49396 EVT InVT = In.getValueType();
49398 // Saturation with truncation. We truncate from InVT to VT.
49399 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
49400 "Unexpected types for truncate operation");
49402 // Match min/max and return limit value as a parameter.
49403 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
49404 if (V.getOpcode() == Opcode &&
49405 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
49406 return V.getOperand(0);
49407 return SDValue();
49410 APInt C1, C2;
49411 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
49412 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
49413 // the element size of the destination type.
49414 if (C2.isMask(VT.getScalarSizeInBits()))
49415 return UMin;
49417 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
49418 if (MatchMinMax(SMin, ISD::SMAX, C1))
49419 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
49420 return SMin;
49422 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
49423 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
49424 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
49425 C2.uge(C1)) {
49426 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
49429 return SDValue();
49432 /// Detect patterns of truncation with signed saturation:
49433 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
49434 /// signed_max_of_dest_type)) to dest_type)
49435 /// or:
49436 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
49437 /// signed_min_of_dest_type)) to dest_type).
49438 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
49439 /// Return the source value to be truncated or SDValue() if the pattern was not
49440 /// matched.
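/// Example (illustrative, i32 -> i8): the signed-saturation form is
///   trunc(smin(smax(x, -128), 127))   (or the smax(smin(...)) variant),
/// while with MatchPackUS the clamp constants become 0 and 255, i.e. the range
/// that PACKUS-style instructions saturate to.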
49441 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
49442 unsigned NumDstBits = VT.getScalarSizeInBits();
49443 unsigned NumSrcBits = In.getScalarValueSizeInBits();
49444 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
49446 auto MatchMinMax = [](SDValue V, unsigned Opcode,
49447 const APInt &Limit) -> SDValue {
49448 APInt C;
49449 if (V.getOpcode() == Opcode &&
49450 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
49451 return V.getOperand(0);
49452 return SDValue();
49455 APInt SignedMax, SignedMin;
49456 if (MatchPackUS) {
49457 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
49458 SignedMin = APInt(NumSrcBits, 0);
49459 } else {
49460 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
49461 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
49464 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
49465 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
49466 return SMax;
49468 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
49469 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
49470 return SMin;
49472 return SDValue();
49475 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
49476 SelectionDAG &DAG,
49477 const X86Subtarget &Subtarget) {
49478 if (!Subtarget.hasSSE2() || !VT.isVector())
49479 return SDValue();
49481 EVT SVT = VT.getVectorElementType();
49482 EVT InVT = In.getValueType();
49483 EVT InSVT = InVT.getVectorElementType();
49485 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
49486 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
49487 // and concatenate at the same time. Then we can use a final vpmovuswb to
49488 // clip to 0-255.
49489 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
49490 InVT == MVT::v16i32 && VT == MVT::v16i8) {
49491 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49492 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
49493 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
49494 DL, DAG, Subtarget);
49495 assert(Mid && "Failed to pack!");
49496 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
49500 // vXi32 truncate instructions are available with AVX512F.
49501 // vXi16 truncate instructions are only available with AVX512BW.
49502 // For 256-bit or smaller vectors, we require VLX.
49503 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
49504 // If the result type is 256-bits or larger and we have disabled 512-bit
49505 // registers, we should go ahead and use the pack instructions if possible.
49506 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
49507 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
49508 (InVT.getSizeInBits() > 128) &&
49509 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
49510 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
49512 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
49513 isPowerOf2_32(VT.getVectorNumElements()) &&
49514 (SVT == MVT::i8 || SVT == MVT::i16) &&
49515 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
49516 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49517 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
49518 if (SVT == MVT::i8 && InSVT == MVT::i32) {
49519 EVT MidVT = VT.changeVectorElementType(MVT::i16);
49520 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
49521 DAG, Subtarget);
49522 assert(Mid && "Failed to pack!");
49523 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
49524 Subtarget);
49525 assert(V && "Failed to pack!");
49526 return V;
49527 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
49528 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
49529 Subtarget);
49531 if (SDValue SSatVal = detectSSatPattern(In, VT))
49532 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
49533 Subtarget);
49536 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49537 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
49538 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
49539 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
49540 unsigned TruncOpc = 0;
49541 SDValue SatVal;
49542 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
49543 SatVal = SSatVal;
49544 TruncOpc = X86ISD::VTRUNCS;
49545 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
49546 SatVal = USatVal;
49547 TruncOpc = X86ISD::VTRUNCUS;
49549 if (SatVal) {
49550 unsigned ResElts = VT.getVectorNumElements();
49551 // If the input type is less than 512 bits and we don't have VLX, we need
49552 // to widen to 512 bits.
49553 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
49554 unsigned NumConcats = 512 / InVT.getSizeInBits();
49555 ResElts *= NumConcats;
49556 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
49557 ConcatOps[0] = SatVal;
49558 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
49559 NumConcats * InVT.getVectorNumElements());
49560 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
49562 // Widen the result if it's narrower than 128 bits.
49563 if (ResElts * SVT.getSizeInBits() < 128)
49564 ResElts = 128 / SVT.getSizeInBits();
49565 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
49566 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
49567 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
49568 DAG.getIntPtrConstant(0, DL));
49572 return SDValue();
49575 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
49576 /// which is c = (a + b + 1) / 2, and replaces this operation with the
49577 /// efficient ISD::AVGCEILU (AVG) instruction.
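/// Example (illustrative, i8 lanes): a = 5, b = 8 gives (5 + 8 + 1) / 2 = 7,
/// i.e. the average rounded up, which is what PAVGB/PAVGW compute per element
/// without needing the wider intermediate type.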
49578 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49579 const X86Subtarget &Subtarget,
49580 const SDLoc &DL) {
49581 if (!VT.isVector())
49582 return SDValue();
49583 EVT InVT = In.getValueType();
49584 unsigned NumElems = VT.getVectorNumElements();
49586 EVT ScalarVT = VT.getVectorElementType();
49587 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
49588 return SDValue();
49590 // InScalarVT is the intermediate type in the AVG pattern and it should be
49591 // wider than the original input type (i8/i16).
49592 EVT InScalarVT = InVT.getVectorElementType();
49593 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
49594 return SDValue();
49596 if (!Subtarget.hasSSE2())
49597 return SDValue();
49599 // Detect the following pattern:
49601 // %1 = zext <N x i8> %a to <N x i32>
49602 // %2 = zext <N x i8> %b to <N x i32>
49603 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
49604 // %4 = add nuw nsw <N x i32> %3, %2
49605 // %5 = lshr <N x i32> %4, <i32 1 x N>
49606 // %6 = trunc <N x i32> %5 to <N x i8>
49608 // In AVX512, the last instruction can also be a trunc store.
49609 if (In.getOpcode() != ISD::SRL)
49610 return SDValue();
49612 // A lambda checking whether the given SDValue is a constant vector with each
49613 // element in the range [Min, Max].
49614 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
49615 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
49616 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
49620 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
49621 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
49622 return MaxActiveBits <= ScalarVT.getSizeInBits();
49625 // Check if each element of the vector is right-shifted by one.
49626 SDValue LHS = In.getOperand(0);
49627 SDValue RHS = In.getOperand(1);
49628 if (!IsConstVectorInRange(RHS, 1, 1))
49629 return SDValue();
49630 if (LHS.getOpcode() != ISD::ADD)
49631 return SDValue();
49633 // Detect a pattern of a + b + 1 where the order doesn't matter.
49634 SDValue Operands[3];
49635 Operands[0] = LHS.getOperand(0);
49636 Operands[1] = LHS.getOperand(1);
49638 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49639 ArrayRef<SDValue> Ops) {
49640 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
49643 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
49644 for (SDValue &Op : Ops)
49645 if (Op.getValueType() != VT)
49646 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
49647 // Pad to a power-of-2 vector, split+apply and extract the original vector.
49648 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
49649 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
49650 if (NumElemsPow2 != NumElems) {
49651 for (SDValue &Op : Ops) {
49652 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
49653 for (unsigned i = 0; i != NumElems; ++i) {
49654 SDValue Idx = DAG.getIntPtrConstant(i, DL);
49655 EltsOfOp[i] =
49656 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
49658 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
49661 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
49662 if (NumElemsPow2 == NumElems)
49663 return Res;
49664 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
49665 DAG.getIntPtrConstant(0, DL));
49668 // Take care of the case when one of the operands is a constant vector whose
49669 // elements are in the range [1, 256] ([1, 65536] for i16).
49670 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
49671 IsZExtLike(Operands[0])) {
49672 // The pattern is detected. Subtract one from the constant vector, then
49673 // demote it and emit the ISD::AVGCEILU node.
49674 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
49675 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
49676 return AVGSplitter({Operands[0], Operands[1]});
49679 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
49680 // Match the or case only if its 'add-like' - can be replaced by an add.
49681 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
49682 if (ISD::ADD == V.getOpcode()) {
49683 Op0 = V.getOperand(0);
49684 Op1 = V.getOperand(1);
49685 return true;
49687 if (ISD::ZERO_EXTEND != V.getOpcode())
49688 return false;
49689 V = V.getOperand(0);
49690 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
49691 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
49692 return false;
49693 Op0 = V.getOperand(0);
49694 Op1 = V.getOperand(1);
49695 return true;
49698 SDValue Op0, Op1;
49699 if (FindAddLike(Operands[0], Op0, Op1))
49700 std::swap(Operands[0], Operands[1]);
49701 else if (!FindAddLike(Operands[1], Op0, Op1))
49702 return SDValue();
49703 Operands[2] = Op0;
49704 Operands[1] = Op1;
49706 // Now we have three operands of two additions. Check that one of them is a
49707 // constant vector with ones, and the other two can be promoted from i8/i16.
49708 for (SDValue &Op : Operands) {
49709 if (!IsConstVectorInRange(Op, 1, 1))
49710 continue;
49711 std::swap(Op, Operands[2]);
49713 // Check if Operands[0] and Operands[1] are results of type promotion.
49714 for (int j = 0; j < 2; ++j)
49715 if (Operands[j].getValueType() != VT)
49716 if (!IsZExtLike(Operands[j]))
49717 return SDValue();
49719 // The pattern is detected; emit the ISD::AVGCEILU node(s).
49720 return AVGSplitter({Operands[0], Operands[1]});
49723 return SDValue();
49726 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
49727 TargetLowering::DAGCombinerInfo &DCI,
49728 const X86Subtarget &Subtarget) {
49729 LoadSDNode *Ld = cast<LoadSDNode>(N);
49730 EVT RegVT = Ld->getValueType(0);
49731 EVT MemVT = Ld->getMemoryVT();
49732 SDLoc dl(Ld);
49733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49735 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
49736 // into two 16-byte operations. Also split non-temporal aligned loads on
49737 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
49738 ISD::LoadExtType Ext = Ld->getExtensionType();
49739 unsigned Fast;
49740 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
49741 Ext == ISD::NON_EXTLOAD &&
49742 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
49743 Ld->getAlign() >= Align(16)) ||
49744 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
49745 *Ld->getMemOperand(), &Fast) &&
49746 !Fast))) {
49747 unsigned NumElems = RegVT.getVectorNumElements();
49748 if (NumElems < 2)
49749 return SDValue();
49751 unsigned HalfOffset = 16;
49752 SDValue Ptr1 = Ld->getBasePtr();
49753 SDValue Ptr2 =
49754 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
49755 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
49756 NumElems / 2);
49757 SDValue Load1 =
49758 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
49759 Ld->getOriginalAlign(),
49760 Ld->getMemOperand()->getFlags());
49761 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
49762 Ld->getPointerInfo().getWithOffset(HalfOffset),
49763 Ld->getOriginalAlign(),
49764 Ld->getMemOperand()->getFlags());
49765 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
49766 Load1.getValue(1), Load2.getValue(1));
49768 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
49769 return DCI.CombineTo(N, NewVec, TF, true);
49772 // Bool vector load - attempt to cast to an integer, as we have good
49773 // (vXiY *ext(vXi1 bitcast(iX))) handling.
49774 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
49775 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
49776 unsigned NumElts = RegVT.getVectorNumElements();
49777 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49778 if (TLI.isTypeLegal(IntVT)) {
49779 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
49780 Ld->getPointerInfo(),
49781 Ld->getOriginalAlign(),
49782 Ld->getMemOperand()->getFlags());
49783 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
49784 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
49788 // If we also broadcast this as a subvector to a wider type, then just extract
49789 // the lowest subvector.
49790 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
49791 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
49792 SDValue Ptr = Ld->getBasePtr();
49793 SDValue Chain = Ld->getChain();
49794 for (SDNode *User : Ptr->uses()) {
49795 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
49796 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
49797 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
49798 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
49799 MemVT.getSizeInBits() &&
49800 !User->hasAnyUseOfValue(1) &&
49801 User->getValueSizeInBits(0).getFixedValue() >
49802 RegVT.getFixedSizeInBits()) {
49803 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
49804 RegVT.getSizeInBits());
49805 Extract = DAG.getBitcast(RegVT, Extract);
49806 return DCI.CombineTo(N, Extract, SDValue(User, 1));
49811 // Cast ptr32 and ptr64 pointers to the default address space before a load.
49812 unsigned AddrSpace = Ld->getAddressSpace();
49813 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
49814 AddrSpace == X86AS::PTR32_UPTR) {
49815 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
49816 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
49817 SDValue Cast =
49818 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
49819 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
49820 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
49821 Ld->getMemOperand()->getFlags());
49825 return SDValue();
49828 /// If V is a build vector of boolean constants and exactly one of those
49829 /// constants is true, return the operand index of that true element.
49830 /// Otherwise, return -1.
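/// For example (illustrative), the mask <i1 0, i1 0, i1 1, i1 0> yields 2,
/// while <i1 1, i1 1, i1 0, i1 0> and an all-zeros mask both yield -1.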
49831 static int getOneTrueElt(SDValue V) {
49832 // This needs to be a build vector of booleans.
49833 // TODO: Checking for the i1 type matches the IR definition for the mask,
49834 // but the mask check could be loosened to i8 or other types. That might
49835 // also require checking more than 'allOnesValue'; eg, the x86 HW
49836 // instructions only require that the MSB is set for each mask element.
49837 // The ISD::MSTORE comments/definition do not specify how the mask operand
49838 // is formatted.
49839 auto *BV = dyn_cast<BuildVectorSDNode>(V);
49840 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
49841 return -1;
49843 int TrueIndex = -1;
49844 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
49845 for (unsigned i = 0; i < NumElts; ++i) {
49846 const SDValue &Op = BV->getOperand(i);
49847 if (Op.isUndef())
49848 continue;
49849 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
49850 if (!ConstNode)
49851 return -1;
49852 if (ConstNode->getAPIntValue().countr_one() >= 1) {
49853 // If we already found a one, this is too many.
49854 if (TrueIndex >= 0)
49855 return -1;
49856 TrueIndex = i;
49859 return TrueIndex;
49862 /// Given a masked memory load/store operation, return true if it has one mask
49863 /// bit set. If it has one mask bit set, then also return the memory address of
49864 /// the scalar element to load/store, the vector index to insert/extract that
49865 /// scalar element, and the alignment for the scalar memory access.
49866 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
49867 SelectionDAG &DAG, SDValue &Addr,
49868 SDValue &Index, Align &Alignment,
49869 unsigned &Offset) {
49870 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
49871 if (TrueMaskElt < 0)
49872 return false;
49874 // Get the address of the one scalar element that is specified by the mask
49875 // using the appropriate offset from the base pointer.
49876 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
49877 Offset = 0;
49878 Addr = MaskedOp->getBasePtr();
49879 if (TrueMaskElt != 0) {
49880 Offset = TrueMaskElt * EltVT.getStoreSize();
49881 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
49882 SDLoc(MaskedOp));
49885 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
49886 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
49887 EltVT.getStoreSize());
49888 return true;
49891 /// If exactly one element of the mask is set for a non-extending masked load,
49892 /// it is a scalar load and vector insert.
49893 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
49894 /// mask have already been optimized in IR, so we don't bother with those here.
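/// For example (illustrative), a masked load of <4 x i32> with mask
/// <0, 0, 1, 0> becomes a scalar i32 load from base + 8 followed by an
/// insert_vector_elt into the pass-through value at index 2.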
49895 static SDValue
49896 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
49897 TargetLowering::DAGCombinerInfo &DCI,
49898 const X86Subtarget &Subtarget) {
49899 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
49900 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
49901 // However, some target hooks may need to be added to know when the transform
49902 // is profitable. Endianness would also have to be considered.
49904 SDValue Addr, VecIndex;
49905 Align Alignment;
49906 unsigned Offset;
49907 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
49908 return SDValue();
49910 // Load the one scalar element that is specified by the mask using the
49911 // appropriate offset from the base pointer.
49912 SDLoc DL(ML);
49913 EVT VT = ML->getValueType(0);
49914 EVT EltVT = VT.getVectorElementType();
49916 EVT CastVT = VT;
49917 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
49918 EltVT = MVT::f64;
49919 CastVT = VT.changeVectorElementType(EltVT);
49922 SDValue Load =
49923 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
49924 ML->getPointerInfo().getWithOffset(Offset),
49925 Alignment, ML->getMemOperand()->getFlags());
49927 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
49929 // Insert the loaded element into the appropriate place in the vector.
49930 SDValue Insert =
49931 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
49932 Insert = DAG.getBitcast(VT, Insert);
49933 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
49936 static SDValue
49937 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
49938 TargetLowering::DAGCombinerInfo &DCI) {
49939 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
49940 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
49941 return SDValue();
49943 SDLoc DL(ML);
49944 EVT VT = ML->getValueType(0);
49946 // If we are loading the first and last elements of a vector, it is safe and
49947 // always faster to load the whole vector. Replace the masked load with a
49948 // vector load and select.
49949 unsigned NumElts = VT.getVectorNumElements();
49950 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
49951 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
49952 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
49953 if (LoadFirstElt && LoadLastElt) {
49954 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
49955 ML->getMemOperand());
49956 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
49957 ML->getPassThru());
49958 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
49961 // Convert a masked load with a constant mask into a masked load and a select.
49962 // This allows the select operation to use a faster kind of select instruction
49963 // (for example, vblendvps -> vblendps).
49965 // Don't try this if the pass-through operand is already undefined. That would
49966 // cause an infinite loop because that's what we're about to create.
49967 if (ML->getPassThru().isUndef())
49968 return SDValue();
49970 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
49971 return SDValue();
49973 // The new masked load has an undef pass-through operand. The select uses the
49974 // original pass-through operand.
49975 SDValue NewML = DAG.getMaskedLoad(
49976 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
49977 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
49978 ML->getAddressingMode(), ML->getExtensionType());
49979 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
49980 ML->getPassThru());
49982 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
49985 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
49986 TargetLowering::DAGCombinerInfo &DCI,
49987 const X86Subtarget &Subtarget) {
49988 auto *Mld = cast<MaskedLoadSDNode>(N);
49990 // TODO: Expanding load with constant mask may be optimized as well.
49991 if (Mld->isExpandingLoad())
49992 return SDValue();
49994 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
49995 if (SDValue ScalarLoad =
49996 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
49997 return ScalarLoad;
49999 // TODO: Do some AVX512 subsets benefit from this transform?
50000 if (!Subtarget.hasAVX512())
50001 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50002 return Blend;
50005 // If the mask value has been legalized to a non-boolean vector, try to
50006 // simplify ops leading up to it. We only demand the MSB of each lane.
50007 SDValue Mask = Mld->getMask();
50008 if (Mask.getScalarValueSizeInBits() != 1) {
50009 EVT VT = Mld->getValueType(0);
50010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50011 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50012 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50013 if (N->getOpcode() != ISD::DELETED_NODE)
50014 DCI.AddToWorklist(N);
50015 return SDValue(N, 0);
50017 if (SDValue NewMask =
50018 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50019 return DAG.getMaskedLoad(
50020 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50021 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50022 Mld->getAddressingMode(), Mld->getExtensionType());
50025 return SDValue();
50028 /// If exactly one element of the mask is set for a non-truncating masked store,
50029 /// it is a vector extract and scalar store.
50030 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50031 /// mask have already been optimized in IR, so we don't bother with those here.
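/// For example (illustrative), a masked store of <4 x float> with mask
/// <0, 1, 0, 0> becomes an extract_vector_elt of lane 1 followed by a plain
/// scalar store to base + 4.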
50032 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50033 SelectionDAG &DAG,
50034 const X86Subtarget &Subtarget) {
50035 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50036 // However, some target hooks may need to be added to know when the transform
50037 // is profitable. Endianness would also have to be considered.
50039 SDValue Addr, VecIndex;
50040 Align Alignment;
50041 unsigned Offset;
50042 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50043 return SDValue();
50045 // Extract the one scalar element that is actually being stored.
50046 SDLoc DL(MS);
50047 SDValue Value = MS->getValue();
50048 EVT VT = Value.getValueType();
50049 EVT EltVT = VT.getVectorElementType();
50050 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50051 EltVT = MVT::f64;
50052 EVT CastVT = VT.changeVectorElementType(EltVT);
50053 Value = DAG.getBitcast(CastVT, Value);
50055 SDValue Extract =
50056 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50058 // Store that element at the appropriate offset from the base pointer.
50059 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50060 MS->getPointerInfo().getWithOffset(Offset),
50061 Alignment, MS->getMemOperand()->getFlags());
50064 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50065 TargetLowering::DAGCombinerInfo &DCI,
50066 const X86Subtarget &Subtarget) {
50067 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50068 if (Mst->isCompressingStore())
50069 return SDValue();
50071 EVT VT = Mst->getValue().getValueType();
50072 SDLoc dl(Mst);
50073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50075 if (Mst->isTruncatingStore())
50076 return SDValue();
50078 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50079 return ScalarStore;
50081 // If the mask value has been legalized to a non-boolean vector, try to
50082 // simplify ops leading up to it. We only demand the MSB of each lane.
50083 SDValue Mask = Mst->getMask();
50084 if (Mask.getScalarValueSizeInBits() != 1) {
50085 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50086 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50087 if (N->getOpcode() != ISD::DELETED_NODE)
50088 DCI.AddToWorklist(N);
50089 return SDValue(N, 0);
50091 if (SDValue NewMask =
50092 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50093 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50094 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50095 Mst->getMemoryVT(), Mst->getMemOperand(),
50096 Mst->getAddressingMode());
50099 SDValue Value = Mst->getValue();
50100 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50101 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50102 Mst->getMemoryVT())) {
50103 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50104 Mst->getBasePtr(), Mst->getOffset(), Mask,
50105 Mst->getMemoryVT(), Mst->getMemOperand(),
50106 Mst->getAddressingMode(), true);
50109 return SDValue();
50112 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50113 TargetLowering::DAGCombinerInfo &DCI,
50114 const X86Subtarget &Subtarget) {
50115 StoreSDNode *St = cast<StoreSDNode>(N);
50116 EVT StVT = St->getMemoryVT();
50117 SDLoc dl(St);
50118 SDValue StoredVal = St->getValue();
50119 EVT VT = StoredVal.getValueType();
50120 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50122 // Convert a store of vXi1 into a store of iX and a bitcast.
50123 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50124 VT.getVectorElementType() == MVT::i1) {
50126 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50127 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50129 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50130 St->getPointerInfo(), St->getOriginalAlign(),
50131 St->getMemOperand()->getFlags());
50134 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50135 // This will avoid a copy to k-register.
50136 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50137 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50138 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50139 SDValue Val = StoredVal.getOperand(0);
50140 // We must store zeros to the unused bits.
50141 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50142 return DAG.getStore(St->getChain(), dl, Val,
50143 St->getBasePtr(), St->getPointerInfo(),
50144 St->getOriginalAlign(),
50145 St->getMemOperand()->getFlags());
50148 // Widen v2i1/v4i1 stores to v8i1.
50149 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50150 Subtarget.hasAVX512()) {
50151 unsigned NumConcats = 8 / VT.getVectorNumElements();
50152 // We must store zeros to the unused bits.
50153 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50154 Ops[0] = StoredVal;
50155 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50156 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50157 St->getPointerInfo(), St->getOriginalAlign(),
50158 St->getMemOperand()->getFlags());
50161 // Turn vXi1 stores of constants into a scalar store.
50162 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50163 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50164 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50165 // If it's a v64i1 store without 64-bit support, we need two stores.
50166 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50167 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50168 StoredVal->ops().slice(0, 32));
50169 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50170 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50171 StoredVal->ops().slice(32, 32));
50172 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50174 SDValue Ptr0 = St->getBasePtr();
50175 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
50177 SDValue Ch0 =
50178 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50179 St->getOriginalAlign(),
50180 St->getMemOperand()->getFlags());
50181 SDValue Ch1 =
50182 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50183 St->getPointerInfo().getWithOffset(4),
50184 St->getOriginalAlign(),
50185 St->getMemOperand()->getFlags());
50186 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50189 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50190 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50191 St->getPointerInfo(), St->getOriginalAlign(),
50192 St->getMemOperand()->getFlags());
50195 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50196 // Sandy Bridge, perform two 16-byte stores.
50197 unsigned Fast;
50198 if (VT.is256BitVector() && StVT == VT &&
50199 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50200 *St->getMemOperand(), &Fast) &&
50201 !Fast) {
50202 unsigned NumElems = VT.getVectorNumElements();
50203 if (NumElems < 2)
50204 return SDValue();
50206 return splitVectorStore(St, DAG);
50209 // Split under-aligned vector non-temporal stores.
50210 if (St->isNonTemporal() && StVT == VT &&
50211 St->getAlign().value() < VT.getStoreSize()) {
50212 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50213 // vectors or the legalizer can scalarize it to use MOVNTI.
50214 if (VT.is256BitVector() || VT.is512BitVector()) {
50215 unsigned NumElems = VT.getVectorNumElements();
50216 if (NumElems < 2)
50217 return SDValue();
50218 return splitVectorStore(St, DAG);
50221 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50222 // to use MOVNTI.
50223 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50224 MVT NTVT = Subtarget.hasSSE4A()
50225 ? MVT::v2f64
50226 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50227 return scalarizeVectorStore(St, NTVT, DAG);
50231 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50232 // supported but AVX512F is, by extending to v16i32 and truncating.
50233 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50234 St->getValue().getOpcode() == ISD::TRUNCATE &&
50235 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50236 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50237 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50238 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
50239 St->getValue().getOperand(0));
50240 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
50241 MVT::v16i8, St->getMemOperand());
50244 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
50245 if (!St->isTruncatingStore() &&
50246 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
50247 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
50248 StoredVal.hasOneUse() &&
50249 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
50250 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
50251 return EmitTruncSStore(IsSigned, St->getChain(),
50252 dl, StoredVal.getOperand(0), St->getBasePtr(),
50253 VT, St->getMemOperand(), DAG);
50256 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
50257 if (!St->isTruncatingStore()) {
50258 auto IsExtractedElement = [](SDValue V) {
50259 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
50260 V = V.getOperand(0);
50261 unsigned Opc = V.getOpcode();
50262 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
50263 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
50264 V.getOperand(0).hasOneUse())
50265 return V.getOperand(0);
50266 return SDValue();
50268 if (SDValue Extract = IsExtractedElement(StoredVal)) {
50269 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
50270 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
50271 SDValue Src = Trunc.getOperand(0);
50272 MVT DstVT = Trunc.getSimpleValueType();
50273 MVT SrcVT = Src.getSimpleValueType();
50274 unsigned NumSrcElts = SrcVT.getVectorNumElements();
50275 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
50276 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
50277 if (NumTruncBits == VT.getSizeInBits() &&
50278 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
50279 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
50280 TruncVT, St->getMemOperand());
50286 // Optimize trunc store (of multiple scalars) to shuffle and store.
50287 // First, pack all of the elements in one place. Next, store to memory
50288 // in fewer chunks.
50289 if (St->isTruncatingStore() && VT.isVector()) {
50290 // Check if we can detect an AVG pattern from the truncation. If yes,
50291 // replace the trunc store by a normal store with the result of X86ISD::AVG
50292 // instruction.
50293 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
50294 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
50295 Subtarget, dl))
50296 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
50297 St->getPointerInfo(), St->getOriginalAlign(),
50298 St->getMemOperand()->getFlags());
50300 if (TLI.isTruncStoreLegal(VT, StVT)) {
50301 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
50302 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
50303 dl, Val, St->getBasePtr(),
50304 St->getMemoryVT(), St->getMemOperand(), DAG);
50305 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
50306 DAG, dl))
50307 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
50308 dl, Val, St->getBasePtr(),
50309 St->getMemoryVT(), St->getMemOperand(), DAG);
50312 return SDValue();
50315 // Cast ptr32 and ptr64 pointers to the default address space before a store.
50316 unsigned AddrSpace = St->getAddressSpace();
50317 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50318 AddrSpace == X86AS::PTR32_UPTR) {
50319 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50320 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
50321 SDValue Cast =
50322 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
50323 return DAG.getTruncStore(
50324 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
50325 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
50326 St->getAAInfo());
50330 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
50331 // the FP state in cases where an emms may be missing.
50332 // A preferable solution to the general problem is to figure out the right
50333 // places to insert EMMS. This qualifies as a quick hack.
50335 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
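// e.g. on a 32-bit target with SSE2, an i64 load feeding an i64 store becomes
// a single f64 (movq) load/store pair instead of two 32-bit GPR operations.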
50336 if (VT.getSizeInBits() != 64)
50337 return SDValue();
50339 const Function &F = DAG.getMachineFunction().getFunction();
50340 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
50341 bool F64IsLegal =
50342 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
50343 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
50344 isa<LoadSDNode>(St->getValue()) &&
50345 cast<LoadSDNode>(St->getValue())->isSimple() &&
50346 St->getChain().hasOneUse() && St->isSimple()) {
50347 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
50349 if (!ISD::isNormalLoad(Ld))
50350 return SDValue();
50352 // Avoid the transformation if there are multiple uses of the loaded value.
50353 if (!Ld->hasNUsesOfValue(1, 0))
50354 return SDValue();
50356 SDLoc LdDL(Ld);
50357 SDLoc StDL(N);
50358 // Lower to a single movq load/store pair.
50359 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
50360 Ld->getBasePtr(), Ld->getMemOperand());
50362 // Make sure new load is placed in same chain order.
50363 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
50364 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
50365 St->getMemOperand());
50368 // This is similar to the above case, but here we handle a scalar 64-bit
50369 // integer store that is extracted from a vector on a 32-bit target.
50370 // If we have SSE2, then we can treat it like a floating-point double
50371 // to get past legalization. The execution dependencies fixup pass will
50372 // choose the optimal machine instruction for the store if this really is
50373 // an integer or v2f32 rather than an f64.
50374 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
50375 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
50376 SDValue OldExtract = St->getOperand(1);
50377 SDValue ExtOp0 = OldExtract.getOperand(0);
50378 unsigned VecSize = ExtOp0.getValueSizeInBits();
50379 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
50380 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
50381 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
50382 BitCast, OldExtract.getOperand(1));
50383 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
50384 St->getPointerInfo(), St->getOriginalAlign(),
50385 St->getMemOperand()->getFlags());
50388 return SDValue();
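/// Simplify VEXTRACT_STORE nodes by only demanding the vector elements that
/// are actually written to memory.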
50391 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
50392 TargetLowering::DAGCombinerInfo &DCI,
50393 const X86Subtarget &Subtarget) {
50394 auto *St = cast<MemIntrinsicSDNode>(N);
50396 SDValue StoredVal = N->getOperand(1);
50397 MVT VT = StoredVal.getSimpleValueType();
50398 EVT MemVT = St->getMemoryVT();
50400 // Figure out which elements we demand.
50401 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
50402 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
50404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50405 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
50406 if (N->getOpcode() != ISD::DELETED_NODE)
50407 DCI.AddToWorklist(N);
50408 return SDValue(N, 0);
50411 return SDValue();
50414 /// Return 'true' if this vector operation is "horizontal"
50415 /// and return the operands for the horizontal operation in LHS and RHS. A
50416 /// horizontal operation performs the binary operation on successive elements
50417 /// of its first operand, then on successive elements of its second operand,
50418 /// returning the resulting values in a vector. For example, if
50419 /// A = < float a0, float a1, float a2, float a3 >
50420 /// and
50421 /// B = < float b0, float b1, float b2, float b3 >
50422 /// then the result of doing a horizontal operation on A and B is
50423 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
50424 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
50425 /// A horizontal-op B, for some already available A and B, and if so then LHS is
50426 /// set to A, RHS to B, and the routine returns 'true'.
50427 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
50428 SelectionDAG &DAG, const X86Subtarget &Subtarget,
50429 bool IsCommutative,
50430 SmallVectorImpl<int> &PostShuffleMask) {
50431 // If either operand is undef, bail out. The binop should be simplified.
50432 if (LHS.isUndef() || RHS.isUndef())
50433 return false;
50435 // Look for the following pattern:
50436 // A = < float a0, float a1, float a2, float a3 >
50437 // B = < float b0, float b1, float b2, float b3 >
50438 // and
50439 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
50440 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
50441 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
50442 // which is A horizontal-op B.
50444 MVT VT = LHS.getSimpleValueType();
50445 assert((VT.is128BitVector() || VT.is256BitVector()) &&
50446 "Unsupported vector type for horizontal add/sub");
50447 unsigned NumElts = VT.getVectorNumElements();
50449 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
50450 SmallVectorImpl<int> &ShuffleMask) {
50451 bool UseSubVector = false;
50452 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50453 Op.getOperand(0).getValueType().is256BitVector() &&
50454 llvm::isNullConstant(Op.getOperand(1))) {
50455 Op = Op.getOperand(0);
50456 UseSubVector = true;
50458 SmallVector<SDValue, 2> SrcOps;
50459 SmallVector<int, 16> SrcMask, ScaledMask;
50460 SDValue BC = peekThroughBitcasts(Op);
50461 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
50462 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
50463 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
50464 })) {
50465 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
50466 if (!UseSubVector && SrcOps.size() <= 2 &&
50467 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
50468 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
50469 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
50470 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
50472 if (UseSubVector && SrcOps.size() == 1 &&
50473 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
50474 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
50475 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
50476 ShuffleMask.assign(Mask.begin(), Mask.end());
50481 // View LHS in the form
50482 // LHS = VECTOR_SHUFFLE A, B, LMask
50483 // If LHS is not a shuffle, then pretend it is the identity shuffle:
50484 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
50485 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
50486 SDValue A, B;
50487 SmallVector<int, 16> LMask;
50488 GetShuffle(LHS, A, B, LMask);
50490 // Likewise, view RHS in the form
50491 // RHS = VECTOR_SHUFFLE C, D, RMask
50492 SDValue C, D;
50493 SmallVector<int, 16> RMask;
50494 GetShuffle(RHS, C, D, RMask);
50496 // At least one of the operands should be a vector shuffle.
50497 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
50498 if (NumShuffles == 0)
50499 return false;
50501 if (LMask.empty()) {
50502 A = LHS;
50503 for (unsigned i = 0; i != NumElts; ++i)
50504 LMask.push_back(i);
50507 if (RMask.empty()) {
50508 C = RHS;
50509 for (unsigned i = 0; i != NumElts; ++i)
50510 RMask.push_back(i);
50513 // If we have a unary mask, ensure the other op is set to null.
50514 if (isUndefOrInRange(LMask, 0, NumElts))
50515 B = SDValue();
50516 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
50517 A = SDValue();
50519 if (isUndefOrInRange(RMask, 0, NumElts))
50520 D = SDValue();
50521 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
50522 C = SDValue();
50524 // If A and B occur in reverse order in RHS, then canonicalize by commuting
50525 // RHS operands and shuffle mask.
50526 if (A != C) {
50527 std::swap(C, D);
50528 ShuffleVectorSDNode::commuteMask(RMask);
50530 // Check that the shuffles are both shuffling the same vectors.
50531 if (!(A == C && B == D))
50532 return false;
50534 PostShuffleMask.clear();
50535 PostShuffleMask.append(NumElts, SM_SentinelUndef);
50537 // LHS and RHS are now:
50538 // LHS = shuffle A, B, LMask
50539 // RHS = shuffle A, B, RMask
50540 // Check that the masks correspond to performing a horizontal operation.
50541 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
50542 // so we just repeat the inner loop if this is a 256-bit op.
50543 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
50544 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
50545 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
50546 assert((NumEltsPer128BitChunk % 2 == 0) &&
50547 "Vector type should have an even number of elements in each lane");
50548 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
50549 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
50550 // Ignore undefined components.
50551 int LIdx = LMask[i + j], RIdx = RMask[i + j];
50552 if (LIdx < 0 || RIdx < 0 ||
50553 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
50554 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
50555 continue;
50557 // Check that successive odd/even elements are being operated on. If not,
50558 // this is not a horizontal operation.
50559 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
50560 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
50561 return false;
50563 // Compute the post-shuffle mask index based on where the element
50564 // is stored in the HOP result, and where it needs to be moved to.
50565 int Base = LIdx & ~1u;
50566 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
50567 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
50569 // The low half of the 128-bit result must choose from A.
50570 // The high half of the 128-bit result must choose from B,
50571 // unless B is undef. In that case, we are always choosing from A.
50572 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
50573 Index += NumEltsPer64BitChunk;
50574 PostShuffleMask[i + j] = Index;
50578 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
50579 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
50581 bool IsIdentityPostShuffle =
50582 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
50583 if (IsIdentityPostShuffle)
50584 PostShuffleMask.clear();
50586 // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
50587 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
50588 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
50589 return false;
50591 // If the source nodes are already used in HorizOps then always accept this.
50592 // Shuffle folding should merge these back together.
50593 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
50594 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
50596 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
50597 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
50599 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
50601 // Assume a SingleSource HOP if we only shuffle one input and don't need to
50602 // shuffle the result.
50603 if (!ForceHorizOp &&
50604 !shouldUseHorizontalOp(NewLHS == NewRHS &&
50605 (NumShuffles < 2 || !IsIdentityPostShuffle),
50606 DAG, Subtarget))
50607 return false;
50609 LHS = DAG.getBitcast(VT, NewLHS);
50610 RHS = DAG.getBitcast(VT, NewRHS);
50611 return true;
50614 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
50615 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
50616 const X86Subtarget &Subtarget) {
50617 EVT VT = N->getValueType(0);
50618 unsigned Opcode = N->getOpcode();
50619 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
50620 SmallVector<int, 8> PostShuffleMask;
50622 switch (Opcode) {
50623 case ISD::FADD:
50624 case ISD::FSUB:
50625 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
50626 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
50627 SDValue LHS = N->getOperand(0);
50628 SDValue RHS = N->getOperand(1);
50629 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
50630 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
50631 PostShuffleMask)) {
50632 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
50633 if (!PostShuffleMask.empty())
50634 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
50635 DAG.getUNDEF(VT), PostShuffleMask);
50636 return HorizBinOp;
50639 break;
50640 case ISD::ADD:
50641 case ISD::SUB:
50642 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
50643 VT == MVT::v16i16 || VT == MVT::v8i32)) {
50644 SDValue LHS = N->getOperand(0);
50645 SDValue RHS = N->getOperand(1);
50646 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
50647 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
50648 PostShuffleMask)) {
50649 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
50650 ArrayRef<SDValue> Ops) {
50651 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
50653 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
50654 {LHS, RHS}, HOpBuilder);
50655 if (!PostShuffleMask.empty())
50656 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
50657 DAG.getUNDEF(VT), PostShuffleMask);
50658 return HorizBinOp;
50661 break;
50664 return SDValue();
50667 // Try to combine the following nodes
50668 // t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
50669 // <i32 -2147483648[float -0.000000e+00]> 0
50670 // t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
50671 // <(load 4 from constant-pool)> t0, t29
50672 // [t30: v16i32 = bitcast t27]
50673 // t6: v16i32 = xor t7, t27[t30]
50674 // t11: v16f32 = bitcast t6
50675 // t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
50676 // into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
50677 // t22: v16f32 = bitcast t7
50678 // t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
50679 // t24: v32f16 = bitcast t23
50680 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
50681 const X86Subtarget &Subtarget) {
50682 EVT VT = N->getValueType(0);
50683 SDValue LHS = N->getOperand(0);
50684 SDValue RHS = N->getOperand(1);
50685 int CombineOpcode =
50686 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
50687 auto isConjugationConstant = [](const Constant *c) {
50688 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
50689 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
50690 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
50691 switch (CI->getBitWidth()) {
50692 case 16:
50693 return false;
50694 case 32:
50695 return CI->getValue() == ConjugationInt32;
50696 case 64:
50697 return CI->getValue() == ConjugationInt64;
50698 default:
50699 llvm_unreachable("Unexpected bit width");
50702 if (const auto *CF = dyn_cast<ConstantFP>(c))
50703 return CF->getType()->isFloatTy() && CF->isNegativeZeroValue();
50704 return false;
50706 auto combineConjugation = [&](SDValue &r) {
50707 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
50708 SDValue XOR = LHS.getOperand(0);
50709 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
50710 SDValue XORRHS = XOR.getOperand(1);
50711 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
50712 XORRHS = XORRHS.getOperand(0);
50713 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
50714 XORRHS.getOperand(1).getNumOperands()) {
50715 ConstantPoolSDNode *CP =
50716 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
50717 if (CP && isConjugationConstant(CP->getConstVal())) {
50718 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
50719 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
50720 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
50721 r = DAG.getBitcast(VT, FCMulC);
50722 return true;
50727 return false;
50729 SDValue Res;
50730 if (combineConjugation(Res))
50731 return Res;
50732 std::swap(LHS, RHS);
50733 if (combineConjugation(Res))
50734 return Res;
50735 return Res;
50738 // Try to combine the following nodes:
50739 // FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
50740 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
50741 const X86Subtarget &Subtarget) {
50742 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
50743 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
50744 Flags.hasAllowContract();
50747 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
50748 return DAG.getTarget().Options.NoSignedZerosFPMath ||
50749 Flags.hasNoSignedZeros();
50751 auto IsVectorAllNegativeZero = [](const SDNode *N) {
50752 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
50753 return false;
50754 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
50755 "Unexpected vector type!");
50756 if (ConstantPoolSDNode *CP =
50757 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
50758 APInt AI = APInt(32, 0x80008000, true);
50759 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
50760 return CI->getValue() == AI;
50761 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
50762 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
50764 return false;
50767 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
50768 !AllowContract(N->getFlags()))
50769 return SDValue();
50771 EVT VT = N->getValueType(0);
50772 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
50773 return SDValue();
50775 SDValue LHS = N->getOperand(0);
50776 SDValue RHS = N->getOperand(1);
50777 bool IsConj;
50778 SDValue FAddOp1, MulOp0, MulOp1;
50779 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
50780 &IsVectorAllNegativeZero,
50781 &HasNoSignedZero](SDValue N) -> bool {
50782 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
50783 return false;
50784 SDValue Op0 = N.getOperand(0);
50785 unsigned Opcode = Op0.getOpcode();
50786 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
50787 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
50788 MulOp0 = Op0.getOperand(0);
50789 MulOp1 = Op0.getOperand(1);
50790 IsConj = Opcode == X86ISD::VFCMULC;
50791 return true;
50793 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
50794 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
50795 HasNoSignedZero(Op0->getFlags())) ||
50796 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
50797 MulOp0 = Op0.getOperand(0);
50798 MulOp1 = Op0.getOperand(1);
50799 IsConj = Opcode == X86ISD::VFCMADDC;
50800 return true;
50803 return false;
50806 if (GetCFmulFrom(LHS))
50807 FAddOp1 = RHS;
50808 else if (GetCFmulFrom(RHS))
50809 FAddOp1 = LHS;
50810 else
50811 return SDValue();
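// Rebuild as a complex FMA: bitcast the addend to the paired-f32 complex type
// and emit VFMADDC/VFCMADDC with the other FADD operand as the accumulator.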
50813 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
50814 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
50815 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
50816 // FIXME: How do we handle when fast math flags of FADD are different from
50817 // CFMUL's?
50818 SDValue CFmul =
50819 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
50820 return DAG.getBitcast(VT, CFmul);
50823 /// Do target-specific dag combines on floating-point adds/subs.
50824 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
50825 const X86Subtarget &Subtarget) {
50826 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
50827 return HOp;
50829 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
50830 return COp;
50832 return SDValue();
50835 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
50836 /// the codegen.
50837 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
50838 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
50839 /// anything that is guaranteed to be transformed by DAGCombiner.
50840 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
50841 const X86Subtarget &Subtarget,
50842 const SDLoc &DL) {
50843 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
50844 SDValue Src = N->getOperand(0);
50845 unsigned SrcOpcode = Src.getOpcode();
50846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50848 EVT VT = N->getValueType(0);
50849 EVT SrcVT = Src.getValueType();
50851 auto IsFreeTruncation = [VT](SDValue Op) {
50852 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
50854 // See if this has been extended from a smaller/equal size to
50855 // the truncation size, allowing a truncation to combine with the extend.
50856 unsigned Opcode = Op.getOpcode();
50857 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
50858 Opcode == ISD::ZERO_EXTEND) &&
50859 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
50860 return true;
50862 // See if this is a single use constant which can be constant folded.
50863 // NOTE: We don't peek through bitcasts here because there is currently
50864 // no support for constant folding truncate+bitcast+vector_of_constants. So
50865 // we'll just end up with a truncate on both operands which will
50866 // get turned back into (truncate (binop)) causing an infinite loop.
50867 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
50870 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
50871 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
50872 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
50873 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
50876 // Don't combine if the operation has other uses.
50877 if (!Src.hasOneUse())
50878 return SDValue();
50880 // Only support vector truncation for now.
50881 // TODO: i64 scalar math would benefit as well.
50882 if (!VT.isVector())
50883 return SDValue();
50885 // In most cases it's only worth pre-truncating if we're only facing the cost
50886 // of one truncation.
50887 // i.e. if one of the inputs will constant fold or the input is repeated.
50888 switch (SrcOpcode) {
50889 case ISD::MUL:
50890 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
50891 // better to truncate if we have the chance.
50892 if (SrcVT.getScalarType() == MVT::i64 &&
50893 TLI.isOperationLegal(SrcOpcode, VT) &&
50894 !TLI.isOperationLegal(SrcOpcode, SrcVT))
50895 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
50896 [[fallthrough]];
50897 case ISD::AND:
50898 case ISD::XOR:
50899 case ISD::OR:
50900 case ISD::ADD:
50901 case ISD::SUB: {
50902 SDValue Op0 = Src.getOperand(0);
50903 SDValue Op1 = Src.getOperand(1);
50904 if (TLI.isOperationLegal(SrcOpcode, VT) &&
50905 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
50906 return TruncateArithmetic(Op0, Op1);
50907 break;
50911 return SDValue();
50914 // Try to form a MULHU or MULHS node by looking for
50915 // (trunc (srl (mul ext, ext), 16))
50916 // TODO: This is X86 specific because we want to be able to handle wide types
50917 // before type legalization. But we can only do it if the vector will be
50918 // legalized via widening/splitting. Type legalization can't handle promotion
50919 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
50920 // combiner.
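// e.g. (v8i16 (trunc (srl (mul (zext v8i16:x to v8i32),
//                               (zext v8i16:y to v8i32)), 16)))
//        -> (v8i16 (mulhu x, y))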
50921 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
50922 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
50923 // First instruction should be a right shift of a multiply.
50924 if (Src.getOpcode() != ISD::SRL ||
50925 Src.getOperand(0).getOpcode() != ISD::MUL)
50926 return SDValue();
50928 if (!Subtarget.hasSSE2())
50929 return SDValue();
50931 // Only handle vXi16 types that are at least 128 bits unless they will be
50932 // widened.
50933 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
50934 return SDValue();
50936 // Input type should be at least vXi32.
50937 EVT InVT = Src.getValueType();
50938 if (InVT.getVectorElementType().getSizeInBits() < 32)
50939 return SDValue();
50941 // Need a shift by 16.
50942 APInt ShiftAmt;
50943 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
50944 ShiftAmt != 16)
50945 return SDValue();
50947 SDValue LHS = Src.getOperand(0).getOperand(0);
50948 SDValue RHS = Src.getOperand(0).getOperand(1);
50950 // Count leading sign/zero bits on both inputs - if there are enough then
50951 // truncation back to vXi16 will be cheap - either as a pack/shuffle
50952 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
50953 // truncations may actually be free by peeking through to the ext source.
50954 auto IsSext = [&DAG](SDValue V) {
50955 return DAG.ComputeMaxSignificantBits(V) <= 16;
50957 auto IsZext = [&DAG](SDValue V) {
50958 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
50961 bool IsSigned = IsSext(LHS) && IsSext(RHS);
50962 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
50963 if (!IsSigned && !IsUnsigned)
50964 return SDValue();
50966 // Check if both inputs are extensions, which will be removed by truncation.
50967 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
50968 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
50969 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
50970 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
50971 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
50972 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
50974 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
50975 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
50976 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
50977 // will have to split anyway.
50978 unsigned InSizeInBits = InVT.getSizeInBits();
50979 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
50980 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
50981 (InSizeInBits % 16) == 0) {
50982 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50983 InVT.getSizeInBits() / 16);
50984 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
50985 DAG.getBitcast(BCVT, RHS));
50986 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
50989 // Truncate back to source type.
50990 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
50991 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
50993 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
50994 return DAG.getNode(Opc, DL, VT, LHS, RHS);
50997 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
50998 // from one vector with signed bytes from another vector, adds together
50999 // adjacent pairs of 16-bit products, and saturates the result before
51000 // truncating to 16 bits.
51002 // Which looks something like this:
51003 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51004 // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51005 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51006 const X86Subtarget &Subtarget,
51007 const SDLoc &DL) {
51008 if (!VT.isVector() || !Subtarget.hasSSSE3())
51009 return SDValue();
51011 unsigned NumElems = VT.getVectorNumElements();
51012 EVT ScalarVT = VT.getVectorElementType();
51013 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51014 return SDValue();
51016 SDValue SSatVal = detectSSatPattern(In, VT);
51017 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51018 return SDValue();
51020 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51021 // of multiplies from even/odd elements.
51022 SDValue N0 = SSatVal.getOperand(0);
51023 SDValue N1 = SSatVal.getOperand(1);
51025 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51026 return SDValue();
51028 SDValue N00 = N0.getOperand(0);
51029 SDValue N01 = N0.getOperand(1);
51030 SDValue N10 = N1.getOperand(0);
51031 SDValue N11 = N1.getOperand(1);
51033 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51034 // Canonicalize zero_extend to LHS.
51035 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51036 std::swap(N00, N01);
51037 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51038 std::swap(N10, N11);
51040 // Ensure we have a zero_extend and a sign_extend.
51041 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51042 N01.getOpcode() != ISD::SIGN_EXTEND ||
51043 N10.getOpcode() != ISD::ZERO_EXTEND ||
51044 N11.getOpcode() != ISD::SIGN_EXTEND)
51045 return SDValue();
51047 // Peek through the extends.
51048 N00 = N00.getOperand(0);
51049 N01 = N01.getOperand(0);
51050 N10 = N10.getOperand(0);
51051 N11 = N11.getOperand(0);
51053 // Ensure the extend is from vXi8.
51054 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51055 N01.getValueType().getVectorElementType() != MVT::i8 ||
51056 N10.getValueType().getVectorElementType() != MVT::i8 ||
51057 N11.getValueType().getVectorElementType() != MVT::i8)
51058 return SDValue();
51060 // All inputs should be build_vectors.
51061 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51062 N01.getOpcode() != ISD::BUILD_VECTOR ||
51063 N10.getOpcode() != ISD::BUILD_VECTOR ||
51064 N11.getOpcode() != ISD::BUILD_VECTOR)
51065 return SDValue();
51067 // N00/N10 are zero extended. N01/N11 are sign extended.
51069 // For each element, we need to ensure we have an odd element from one vector
51070 // multiplied by the odd element of another vector and the even element from
51071 // one of the same vectors being multiplied by the even element from the
51072 // other vector. So we need to make sure for each element i, this operator
51073 // is being performed:
51074 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51075 SDValue ZExtIn, SExtIn;
51076 for (unsigned i = 0; i != NumElems; ++i) {
51077 SDValue N00Elt = N00.getOperand(i);
51078 SDValue N01Elt = N01.getOperand(i);
51079 SDValue N10Elt = N10.getOperand(i);
51080 SDValue N11Elt = N11.getOperand(i);
51081 // TODO: Be more tolerant to undefs.
51082 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51083 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51084 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51085 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51086 return SDValue();
51087 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51088 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51089 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51090 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51091 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51092 return SDValue();
51093 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51094 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51095 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51096 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51097 // Add is commutative so indices can be reordered.
51098 if (IdxN00 > IdxN10) {
51099 std::swap(IdxN00, IdxN10);
51100 std::swap(IdxN01, IdxN11);
51102 // N0 indices must be the even element. N1 indices must be the next odd element.
51103 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51104 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51105 return SDValue();
51106 SDValue N00In = N00Elt.getOperand(0);
51107 SDValue N01In = N01Elt.getOperand(0);
51108 SDValue N10In = N10Elt.getOperand(0);
51109 SDValue N11In = N11Elt.getOperand(0);
51110 // The first time we find an input, capture it.
51111 if (!ZExtIn) {
51112 ZExtIn = N00In;
51113 SExtIn = N01In;
51115 if (ZExtIn != N00In || SExtIn != N01In ||
51116 ZExtIn != N10In || SExtIn != N11In)
51117 return SDValue();
51120 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51121 ArrayRef<SDValue> Ops) {
51122 // Shrink by adding truncate nodes and let DAGCombine fold with the
51123 // sources.
51124 EVT InVT = Ops[0].getValueType();
51125 assert(InVT.getScalarType() == MVT::i8 &&
51126 "Unexpected scalar element type");
51127 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51128 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51129 InVT.getVectorNumElements() / 2);
51130 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51132 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51133 PMADDBuilder);
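/// Do target-specific DAG combines on ISD::TRUNCATE nodes.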
51136 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51137 const X86Subtarget &Subtarget) {
51138 EVT VT = N->getValueType(0);
51139 SDValue Src = N->getOperand(0);
51140 SDLoc DL(N);
51142 // Attempt to pre-truncate inputs to arithmetic ops instead.
51143 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51144 return V;
51146 // Try to detect AVG pattern first.
51147 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51148 return Avg;
51150 // Try to detect PMADDUBSW.
51151 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51152 return PMAdd;
51154 // Try to combine truncation with signed/unsigned saturation.
51155 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51156 return Val;
51158 // Try to combine PMULHUW/PMULHW for vXi16.
51159 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51160 return V;
51162 // The bitcast source is a direct mmx result.
51163 // Detect a truncation to i32 of a bitcast from x86mmx.
51164 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51165 SDValue BCSrc = Src.getOperand(0);
51166 if (BCSrc.getValueType() == MVT::x86mmx)
51167 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51170 return SDValue();
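/// Do target-specific combines on X86ISD::VTRUNC nodes: fold saturation
/// patterns into VTRUNCS/VTRUNCUS and simplify the demanded bits of the input.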
51173 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51174 TargetLowering::DAGCombinerInfo &DCI) {
51175 EVT VT = N->getValueType(0);
51176 SDValue In = N->getOperand(0);
51177 SDLoc DL(N);
51179 if (SDValue SSatVal = detectSSatPattern(In, VT))
51180 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51181 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51182 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51185 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51186 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51187 return SDValue(N, 0);
51189 return SDValue();
51192 /// Returns the negated value if the node \p N flips sign of FP value.
51194 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51195 /// or FSUB(0, x)
51196 /// AVX512F does not have FXOR, so FNEG is lowered as
51197 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51198 // In this case we go through all bitcasts.
51199 /// This also recognizes splat of a negated value and returns the splat of that
51200 /// value.
51201 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51202 if (N->getOpcode() == ISD::FNEG)
51203 return N->getOperand(0);
51205 // Don't recurse exponentially.
51206 if (Depth > SelectionDAG::MaxRecursionDepth)
51207 return SDValue();
51209 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51211 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51212 EVT VT = Op->getValueType(0);
51214 // Make sure the element size doesn't change.
51215 if (VT.getScalarSizeInBits() != ScalarSize)
51216 return SDValue();
51218 unsigned Opc = Op.getOpcode();
51219 switch (Opc) {
51220 case ISD::VECTOR_SHUFFLE: {
51221 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51222 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51223 if (!Op.getOperand(1).isUndef())
51224 return SDValue();
51225 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51226 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51227 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51228 cast<ShuffleVectorSDNode>(Op)->getMask());
51229 break;
51231 case ISD::INSERT_VECTOR_ELT: {
51232 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
51233 // -V, INDEX).
51234 SDValue InsVector = Op.getOperand(0);
51235 SDValue InsVal = Op.getOperand(1);
51236 if (!InsVector.isUndef())
51237 return SDValue();
51238 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
51239 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
51240 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
51241 NegInsVal, Op.getOperand(2));
51242 break;
51244 case ISD::FSUB:
51245 case ISD::XOR:
51246 case X86ISD::FXOR: {
51247 SDValue Op1 = Op.getOperand(1);
51248 SDValue Op0 = Op.getOperand(0);
51250 // For XOR and FXOR, we want to check if constant
51251 // bits of Op1 are sign bit masks. For FSUB, we
51252 // have to check if constant bits of Op0 are sign
51253 // bit masks and hence we swap the operands.
51254 if (Opc == ISD::FSUB)
51255 std::swap(Op0, Op1);
51257 APInt UndefElts;
51258 SmallVector<APInt, 16> EltBits;
51259 // Extract constant bits and see if they are all
51260 // sign bit masks. Ignore the undef elements.
51261 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
51262 /* AllowWholeUndefs */ true,
51263 /* AllowPartialUndefs */ false)) {
51264 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
51265 if (!UndefElts[I] && !EltBits[I].isSignMask())
51266 return SDValue();
51268 // Only allow bitcast from correctly-sized constant.
51269 Op0 = peekThroughBitcasts(Op0);
51270 if (Op0.getScalarValueSizeInBits() == ScalarSize)
51271 return Op0;
51273 break;
51274 } // case
51275 } // switch
51277 return SDValue();
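/// Given an FMA-family opcode, return the opcode produced by negating the
/// multiplication (\p NegMul), the accumulator (\p NegAcc), and/or the whole
/// result (\p NegRes).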
51280 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
51281 bool NegRes) {
51282 if (NegMul) {
51283 switch (Opcode) {
51284 default: llvm_unreachable("Unexpected opcode");
51285 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
51286 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
51287 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
51288 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
51289 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
51290 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
51291 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
51292 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
51293 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
51294 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
51295 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
51296 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
51300 if (NegAcc) {
51301 switch (Opcode) {
51302 default: llvm_unreachable("Unexpected opcode");
51303 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
51304 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
51305 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51306 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
51307 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
51308 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51309 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
51310 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
51311 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51312 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
51313 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
51314 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51315 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
51316 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
51317 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
51318 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
51322 if (NegRes) {
51323 switch (Opcode) {
51324 // For accuracy reasons, we never combine fneg and fma under strict FP.
51325 default: llvm_unreachable("Unexpected opcode");
51326 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
51327 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51328 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
51329 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51330 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
51331 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51332 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
51333 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51337 return Opcode;
51340 /// Do target-specific dag combines on floating point negations.
51341 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
51342 TargetLowering::DAGCombinerInfo &DCI,
51343 const X86Subtarget &Subtarget) {
51344 EVT OrigVT = N->getValueType(0);
51345 SDValue Arg = isFNEG(DAG, N);
51346 if (!Arg)
51347 return SDValue();
51349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51350 EVT VT = Arg.getValueType();
51351 EVT SVT = VT.getScalarType();
51352 SDLoc DL(N);
51354 // Let legalize expand this if it isn't a legal type yet.
51355 if (!TLI.isTypeLegal(VT))
51356 return SDValue();
51358 // If we're negating a FMUL node on a target with FMA, then we can avoid the
51359 // use of a constant by performing (-0 - A*B) instead.
51360 // FIXME: Check rounding control flags as well once it becomes available.
51361 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
51362 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
51363 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
51364 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
51365 Arg.getOperand(1), Zero);
51366 return DAG.getBitcast(OrigVT, NewNode);
51369 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51370 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51371 if (SDValue NegArg =
51372 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
51373 return DAG.getBitcast(OrigVT, NegArg);
51375 return SDValue();
51378 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
51379 bool LegalOperations,
51380 bool ForCodeSize,
51381 NegatibleCost &Cost,
51382 unsigned Depth) const {
51383 // fneg patterns are removable even if they have multiple uses.
51384 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
51385 Cost = NegatibleCost::Cheaper;
51386 return DAG.getBitcast(Op.getValueType(), Arg);
51389 EVT VT = Op.getValueType();
51390 EVT SVT = VT.getScalarType();
51391 unsigned Opc = Op.getOpcode();
51392 SDNodeFlags Flags = Op.getNode()->getFlags();
51393 switch (Opc) {
51394 case ISD::FMA:
51395 case X86ISD::FMSUB:
51396 case X86ISD::FNMADD:
51397 case X86ISD::FNMSUB:
51398 case X86ISD::FMADD_RND:
51399 case X86ISD::FMSUB_RND:
51400 case X86ISD::FNMADD_RND:
51401 case X86ISD::FNMSUB_RND: {
51402 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
51403 !(SVT == MVT::f32 || SVT == MVT::f64) ||
51404 !isOperationLegal(ISD::FMA, VT))
51405 break;
51407 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
51408 // if it may have signed zeros.
51409 if (!Flags.hasNoSignedZeros())
51410 break;
51412 // This is always negatible for free but we might be able to remove some
51413 // extra operand negations as well.
51414 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
51415 for (int i = 0; i != 3; ++i)
51416 NewOps[i] = getCheaperNegatedExpression(
51417 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
51419 bool NegA = !!NewOps[0];
51420 bool NegB = !!NewOps[1];
51421 bool NegC = !!NewOps[2];
51422 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
51424 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
51425 : NegatibleCost::Neutral;
51427 // Fill in the non-negated ops with the original values.
51428 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
51429 if (!NewOps[i])
51430 NewOps[i] = Op.getOperand(i);
51431 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
51433 case X86ISD::FRCP:
51434 if (SDValue NegOp0 =
51435 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
51436 ForCodeSize, Cost, Depth + 1))
51437 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
51438 break;
51441 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
51442 ForCodeSize, Cost, Depth);
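/// Replace a vector X86ISD FP logic op (FAND/FANDN/FOR/FXOR) with the
/// equivalent integer logic op when integer vector types are available.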
51445 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
51446 const X86Subtarget &Subtarget) {
51447 MVT VT = N->getSimpleValueType(0);
51448 // If we have integer vector types available, use the integer opcodes.
51449 if (!VT.isVector() || !Subtarget.hasSSE2())
51450 return SDValue();
51452 SDLoc dl(N);
51454 unsigned IntBits = VT.getScalarSizeInBits();
51455 MVT IntSVT = MVT::getIntegerVT(IntBits);
51456 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
51458 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
51459 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
51460 unsigned IntOpcode;
51461 switch (N->getOpcode()) {
51462 default: llvm_unreachable("Unexpected FP logic op");
51463 case X86ISD::FOR: IntOpcode = ISD::OR; break;
51464 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
51465 case X86ISD::FAND: IntOpcode = ISD::AND; break;
51466 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
51468 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
51469 return DAG.getBitcast(VT, IntOp);
51473 /// Fold xor (setcc cond, val), 1 --> setcc (inverted(cond), val)
51474 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
51475 if (N->getOpcode() != ISD::XOR)
51476 return SDValue();
51478 SDValue LHS = N->getOperand(0);
51479 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
51480 return SDValue();
51482 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
51483 X86::CondCode(LHS->getConstantOperandVal(0)));
51484 SDLoc DL(N);
51485 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
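// Fold (xor (ctlz_zero_undef X), BitWidth - 1) and the equivalent
// (sub BitWidth - 1, (ctlz_zero_undef X)) into X86ISD::BSR, which computes
// the index of the most significant set bit directly. Skipped when the
// target has fast LZCNT.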
51488 static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
51489 const X86Subtarget &Subtarget) {
51490 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
51491 "Invalid opcode for combing with CTLZ");
51492 if (Subtarget.hasFastLZCNT())
51493 return SDValue();
51495 EVT VT = N->getValueType(0);
51496 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
51497 (VT != MVT::i64 || !Subtarget.is64Bit()))
51498 return SDValue();
51500 SDValue N0 = N->getOperand(0);
51501 SDValue N1 = N->getOperand(1);
51503 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
51504 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
51505 return SDValue();
51507 SDValue OpCTLZ;
51508 SDValue OpSizeTM1;
51510 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
51511 OpCTLZ = N1;
51512 OpSizeTM1 = N0;
51513 } else if (N->getOpcode() == ISD::SUB) {
51514 return SDValue();
51515 } else {
51516 OpCTLZ = N0;
51517 OpSizeTM1 = N1;
51520 if (!OpCTLZ.hasOneUse())
51521 return SDValue();
51522 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
51523 if (!C)
51524 return SDValue();
51526 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
51527 return SDValue();
51528 SDLoc DL(N);
51529 EVT OpVT = VT;
51530 SDValue Op = OpCTLZ.getOperand(0);
51531 if (VT == MVT::i8) {
51532 // Zero extend to i32 since there is no i8 bsr.
51533 OpVT = MVT::i32;
51534 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
51537 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
51538 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
51539 if (VT == MVT::i8)
51540 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
51542 return Op;
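/// Do target-specific DAG combines on ISD::XOR nodes.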
51545 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
51546 TargetLowering::DAGCombinerInfo &DCI,
51547 const X86Subtarget &Subtarget) {
51548 SDValue N0 = N->getOperand(0);
51549 SDValue N1 = N->getOperand(1);
51550 EVT VT = N->getValueType(0);
51552 // If this is SSE1-only, convert to FXOR to avoid scalarization.
51553 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51554 return DAG.getBitcast(MVT::v4i32,
51555 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
51556 DAG.getBitcast(MVT::v4f32, N0),
51557 DAG.getBitcast(MVT::v4f32, N1)));
51560 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
51561 return Cmp;
51563 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51564 return R;
51566 if (SDValue R = combineBitOpWithShift(N, DAG))
51567 return R;
51569 if (SDValue R = combineBitOpWithPACK(N, DAG))
51570 return R;
51572 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51573 return FPLogic;
51575 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
51576 return R;
51578 if (DCI.isBeforeLegalizeOps())
51579 return SDValue();
51581 if (SDValue SetCC = foldXor1SetCC(N, DAG))
51582 return SetCC;
51584 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51585 return R;
51587 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
51588 return RV;
51590 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
51591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51592 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
51593 N0.getOperand(0).getValueType().isVector() &&
51594 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
51595 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
51596 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
51597 N0.getOperand(0).getValueType()));
51600 // Handle AVX512 mask widening.
51601 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
51602 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
51603 VT.getVectorElementType() == MVT::i1 &&
51604 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
51605 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
51606 return DAG.getNode(
51607 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
51608 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
51609 N0.getOperand(2));
51612 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
51613 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
51614 // TODO: Under what circumstances could this be performed in DAGCombine?
51615 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
51616 N0.getOperand(0).getOpcode() == N->getOpcode()) {
51617 SDValue TruncExtSrc = N0.getOperand(0);
51618 auto *N1C = dyn_cast<ConstantSDNode>(N1);
51619 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
51620 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
51621 SDLoc DL(N);
51622 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
51623 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
51624 return DAG.getNode(ISD::XOR, DL, VT, LHS,
51625 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
51629 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51630 return R;
51632 return combineFneg(N, DAG, DCI, Subtarget);
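/// Do target-specific DAG combines on X86ISD::BEXTR nodes by simplifying
/// their inputs via SimplifyDemandedBits.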
51635 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
51636 TargetLowering::DAGCombinerInfo &DCI,
51637 const X86Subtarget &Subtarget) {
51638 EVT VT = N->getValueType(0);
51639 unsigned NumBits = VT.getSizeInBits();
51641 // TODO - Constant Folding.
51643 // Simplify the inputs.
51644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51645 APInt DemandedMask(APInt::getAllOnes(NumBits));
51646 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51647 return SDValue(N, 0);
51649 return SDValue();
51652 static bool isNullFPScalarOrVectorConst(SDValue V) {
51653 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
51656 /// If a value is a scalar FP zero or a vector FP zero (potentially including
51657 /// undefined elements), return a zero constant that may be used to fold away
51658 /// that value. In the case of a vector, the returned constant will not contain
51659 /// undefined elements even if the input parameter does. This makes it suitable
51660 /// to be used as a replacement operand with operations (eg, bitwise-and) where
51661 /// an undef should not propagate.
51662 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
51663 const X86Subtarget &Subtarget) {
51664 if (!isNullFPScalarOrVectorConst(V))
51665 return SDValue();
51667 if (V.getValueType().isVector())
51668 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
51670 return V;
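/// Fold an FAND where one operand is an FXOR with an all-ones constant into a
/// single FANDN node (scalar types, plus v4f32 on SSE1-only targets).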
51673 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
51674 const X86Subtarget &Subtarget) {
51675 SDValue N0 = N->getOperand(0);
51676 SDValue N1 = N->getOperand(1);
51677 EVT VT = N->getValueType(0);
51678 SDLoc DL(N);
51680 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
51681 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
51682 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
51683 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
51684 return SDValue();
51686 auto isAllOnesConstantFP = [](SDValue V) {
51687 if (V.getSimpleValueType().isVector())
51688 return ISD::isBuildVectorAllOnes(V.getNode());
51689 auto *C = dyn_cast<ConstantFPSDNode>(V);
51690 return C && C->getConstantFPValue()->isAllOnesValue();
51693 // fand (fxor X, -1), Y --> fandn X, Y
51694 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
51695 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
51697 // fand X, (fxor Y, -1) --> fandn Y, X
51698 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
51699 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
51701 return SDValue();
51704 /// Do target-specific dag combines on X86ISD::FAND nodes.
51705 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
51706 const X86Subtarget &Subtarget) {
51707 // FAND(0.0, x) -> 0.0
51708 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
51709 return V;
51711 // FAND(x, 0.0) -> 0.0
51712 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
51713 return V;
51715 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
51716 return V;
51718 return lowerX86FPLogicOp(N, DAG, Subtarget);
51721 /// Do target-specific dag combines on X86ISD::FANDN nodes.
51722 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
51723 const X86Subtarget &Subtarget) {
51724 // FANDN(0.0, x) -> x
51725 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
51726 return N->getOperand(1);
51728 // FANDN(x, 0.0) -> 0.0
51729 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
51730 return V;
51732 return lowerX86FPLogicOp(N, DAG, Subtarget);
51735 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
51736 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
51737 TargetLowering::DAGCombinerInfo &DCI,
51738 const X86Subtarget &Subtarget) {
51739 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
51741 // F[X]OR(0.0, x) -> x
51742 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
51743 return N->getOperand(1);
51745 // F[X]OR(x, 0.0) -> x
51746 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
51747 return N->getOperand(0);
51749 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
51750 return NewVal;
51752 return lowerX86FPLogicOp(N, DAG, Subtarget);
51755 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
51756 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
51757 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
51759 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
51760 if (!DAG.getTarget().Options.NoNaNsFPMath ||
51761 !DAG.getTarget().Options.NoSignedZerosFPMath)
51762 return SDValue();
51764 // Since no NaNs and no signed zeros are allowed, convert the FMAX and FMIN
51765 // nodes into FMAXC and FMINC, which are commutative operations.
51766 unsigned NewOp = 0;
51767 switch (N->getOpcode()) {
51768 default: llvm_unreachable("unknown opcode");
51769 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
51770 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
51773 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
51774 N->getOperand(0), N->getOperand(1));
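/// Lower ISD::FMINNUM/FMAXNUM to X86ISD::FMIN/FMAX, inserting a select to
/// handle a possibly-NaN operand when NaNs cannot be ignored.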
51777 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
51778 const X86Subtarget &Subtarget) {
51779 EVT VT = N->getValueType(0);
51780 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
51781 return SDValue();
51783 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51785 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
51786 (Subtarget.hasSSE2() && VT == MVT::f64) ||
51787 (Subtarget.hasFP16() && VT == MVT::f16) ||
51788 (VT.isVector() && TLI.isTypeLegal(VT))))
51789 return SDValue();
51791 SDValue Op0 = N->getOperand(0);
51792 SDValue Op1 = N->getOperand(1);
51793 SDLoc DL(N);
51794 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
51796 // If we don't have to respect NaN inputs, this is a direct translation to x86
51797 // min/max instructions.
51798 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
51799 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
51801 // If one of the operands is known non-NaN use the native min/max instructions
51802 // with the non-NaN input as second operand.
51803 if (DAG.isKnownNeverNaN(Op1))
51804 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
51805 if (DAG.isKnownNeverNaN(Op0))
51806 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
51808 // If we have to respect NaN inputs, this takes at least 3 instructions.
51809 // Favor a library call when operating on a scalar and minimizing code size.
51810 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
51811 return SDValue();
51813 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
51814 VT);
51816 // There are 4 possibilities involving NaN inputs, and these are the required
51817 // outputs:
51818 // Op1
51819 // Num NaN
51820 // ----------------
51821 // Num | Max | Op0 |
51822 // Op0 ----------------
51823 // NaN | Op1 | NaN |
51824 // ----------------
51826 // The SSE FP max/min instructions were not designed for this case, but rather
51827 // to implement:
51828 // Min = Op1 < Op0 ? Op1 : Op0
51829 // Max = Op1 > Op0 ? Op1 : Op0
51831 // So they always return Op0 if either input is a NaN. However, we can still
51832 // use those instructions for fmaxnum by selecting away a NaN input.
51834 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
51835 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
51836 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
51838 // If Op0 is a NaN, select Op1. Otherwise, select the min/max result. If both
51839 // operands are NaN, the NaN value of Op1 is the result.
51840 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
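/// Do target-specific dag combines on X86 vector int-to-fp conversion nodes
/// (such as CVTSI2P/CVTUI2P): simplify demanded elements and narrow a full
/// vector load to a vzload when only the low elements are converted.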
51843 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
51844 TargetLowering::DAGCombinerInfo &DCI) {
51845 EVT VT = N->getValueType(0);
51846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51848 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
51849 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
51850 return SDValue(N, 0);
51852 // Convert a full vector load into vzload when not all bits are needed.
51853 SDValue In = N->getOperand(0);
51854 MVT InVT = In.getSimpleValueType();
51855 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
51856 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
51857 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
51858 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
51859 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
51860 MVT MemVT = MVT::getIntegerVT(NumBits);
51861 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
51862 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
51863 SDLoc dl(N);
51864 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
51865 DAG.getBitcast(InVT, VZLoad));
51866 DCI.CombineTo(N, Convert);
51867 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
51868 DCI.recursivelyDeleteUnusedNodes(LN);
51869 return SDValue(N, 0);
51873 return SDValue();
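/// Do target-specific dag combines on X86 vector fp-to-int conversion nodes:
/// narrow a full vector load to a vzload when only the low elements are
/// converted.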
51876 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
51877 TargetLowering::DAGCombinerInfo &DCI) {
51878 bool IsStrict = N->isTargetStrictFPOpcode();
51879 EVT VT = N->getValueType(0);
51881 // Convert a full vector load into vzload when not all bits are needed.
51882 SDValue In = N->getOperand(IsStrict ? 1 : 0);
51883 MVT InVT = In.getSimpleValueType();
51884 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
51885 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
51886 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
51887 LoadSDNode *LN = cast<LoadSDNode>(In);
51888 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
51889 MVT MemVT = MVT::getFloatingPointVT(NumBits);
51890 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
51891 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
51892 SDLoc dl(N);
51893 if (IsStrict) {
51894 SDValue Convert =
51895 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
51896 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
51897 DCI.CombineTo(N, Convert, Convert.getValue(1));
51898 } else {
51899 SDValue Convert =
51900 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
51901 DCI.CombineTo(N, Convert);
51903 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
51904 DCI.recursivelyDeleteUnusedNodes(LN);
51905 return SDValue(N, 0);
51909 return SDValue();
51912 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
51913 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
51914 TargetLowering::DAGCombinerInfo &DCI,
51915 const X86Subtarget &Subtarget) {
51916 SDValue N0 = N->getOperand(0);
51917 SDValue N1 = N->getOperand(1);
51918 MVT VT = N->getSimpleValueType(0);
51919 int NumElts = VT.getVectorNumElements();
51920 unsigned EltSizeInBits = VT.getScalarSizeInBits();
51921 SDLoc DL(N);
51923 // ANDNP(undef, x) -> 0
51924 // ANDNP(x, undef) -> 0
51925 if (N0.isUndef() || N1.isUndef())
51926 return DAG.getConstant(0, DL, VT);
51928 // ANDNP(0, x) -> x
51929 if (ISD::isBuildVectorAllZeros(N0.getNode()))
51930 return N1;
51932 // ANDNP(x, 0) -> 0
51933 if (ISD::isBuildVectorAllZeros(N1.getNode()))
51934 return DAG.getConstant(0, DL, VT);
51936 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
51937 if (ISD::isBuildVectorAllOnes(N1.getNode()))
51938 return DAG.getNOT(DL, N0, VT);
51940 // Turn ANDNP back to AND if input is inverted.
51941 if (SDValue Not = IsNOT(N0, DAG))
51942 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
51944 // Fold for better commutativity:
51945 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
51946 if (N1->hasOneUse())
51947 if (SDValue Not = IsNOT(N1, DAG))
51948 return DAG.getNOT(
51949 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
51951 // Constant Folding
51952 APInt Undefs0, Undefs1;
51953 SmallVector<APInt> EltBits0, EltBits1;
51954 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
51955 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
51956 SmallVector<APInt> ResultBits;
51957 for (int I = 0; I != NumElts; ++I)
51958 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
51959 return getConstVector(ResultBits, VT, DAG, DL);
51962 // Constant fold NOT(N0) to allow us to use AND.
51963 // Ensure this is only performed if we can confirm that the bitcasted source
51964 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
51965 if (N0->hasOneUse()) {
51966 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51967 if (BC0.getOpcode() != ISD::BITCAST) {
51968 for (APInt &Elt : EltBits0)
51969 Elt = ~Elt;
51970 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
51971 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
51976 // Attempt to recursively combine a bitmask ANDNP with shuffles.
51977 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51978 SDValue Op(N, 0);
51979 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51980 return Res;
51982 // If either operand is a constant mask, then only the elements that aren't
51983 // zero are actually demanded by the other operand.
51984 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
51985 APInt UndefElts;
51986 SmallVector<APInt> EltBits;
51987 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51988 APInt DemandedElts = APInt::getAllOnes(NumElts);
51989 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51990 EltBits)) {
51991 DemandedBits.clearAllBits();
51992 DemandedElts.clearAllBits();
51993 for (int I = 0; I != NumElts; ++I) {
51994 if (UndefElts[I]) {
51995 // We can't assume an undef src element gives an undef dst - the
51996 // other src might be zero.
51997 DemandedBits.setAllBits();
51998 DemandedElts.setBit(I);
51999 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52000 (!Invert && !EltBits[I].isZero())) {
52001 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52002 DemandedElts.setBit(I);
52006 return std::make_pair(DemandedBits, DemandedElts);
52008 APInt Bits0, Elts0;
52009 APInt Bits1, Elts1;
52010 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52011 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52013 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52014 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52015 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52016 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52017 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52018 if (N->getOpcode() != ISD::DELETED_NODE)
52019 DCI.AddToWorklist(N);
52020 return SDValue(N, 0);
52024 return SDValue();
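/// Do target-specific dag combines on X86ISD::BT nodes.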
52027 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52028 TargetLowering::DAGCombinerInfo &DCI) {
52029 SDValue N1 = N->getOperand(1);
52031 // BT ignores high bits in the bit index operand.
52032 unsigned BitWidth = N1.getValueSizeInBits();
52033 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
52034 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52035 if (N->getOpcode() != ISD::DELETED_NODE)
52036 DCI.AddToWorklist(N);
52037 return SDValue(N, 0);
52040 return SDValue();
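/// Do target-specific dag combines on (STRICT_)CVTPH2PS nodes: only the low
/// half of the v8i16 source is demanded for a v4f32 result, and a full vector
/// load can be narrowed to a vzload.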
52043 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52044 TargetLowering::DAGCombinerInfo &DCI) {
52045 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52046 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52048 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52049 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52050 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52051 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52052 if (N->getOpcode() != ISD::DELETED_NODE)
52053 DCI.AddToWorklist(N);
52054 return SDValue(N, 0);
52057 // Convert a full vector load into vzload when not all bits are needed.
52058 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52059 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52060 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52061 SDLoc dl(N);
52062 if (IsStrict) {
52063 SDValue Convert = DAG.getNode(
52064 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52065 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52066 DCI.CombineTo(N, Convert, Convert.getValue(1));
52067 } else {
52068 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52069 DAG.getBitcast(MVT::v8i16, VZLoad));
52070 DCI.CombineTo(N, Convert);
52073 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52074 DCI.recursivelyDeleteUnusedNodes(LN);
52075 return SDValue(N, 0);
52080 return SDValue();
52083 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
52084 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52085 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52087 EVT DstVT = N->getValueType(0);
52089 SDValue N0 = N->getOperand(0);
52090 SDValue N1 = N->getOperand(1);
52091 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52093 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52094 return SDValue();
52096 // Look through single use any_extends / truncs.
52097 SDValue IntermediateBitwidthOp;
52098 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52099 N0.hasOneUse()) {
52100 IntermediateBitwidthOp = N0;
52101 N0 = N0.getOperand(0);
52104 // See if we have a single use cmov.
52105 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52106 return SDValue();
52108 SDValue CMovOp0 = N0.getOperand(0);
52109 SDValue CMovOp1 = N0.getOperand(1);
52111 // Make sure both operands are constants.
52112 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52113 !isa<ConstantSDNode>(CMovOp1.getNode()))
52114 return SDValue();
52116 SDLoc DL(N);
52118 // If we looked through an any_extend/trunc above, apply the same op to the constants.
52119 if (IntermediateBitwidthOp) {
52120 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52121 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52122 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52125 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52126 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52128 EVT CMovVT = DstVT;
52129 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52130 if (DstVT == MVT::i16) {
52131 CMovVT = MVT::i32;
52132 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52133 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52136 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52137 N0.getOperand(2), N0.getOperand(3));
52139 if (CMovVT != DstVT)
52140 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52142 return CMov;
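/// Do target-specific dag combines on ISD::SIGN_EXTEND_INREG nodes.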
52145 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52146 const X86Subtarget &Subtarget) {
52147 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52149 if (SDValue V = combineSextInRegCmov(N, DAG))
52150 return V;
52152 EVT VT = N->getValueType(0);
52153 SDValue N0 = N->getOperand(0);
52154 SDValue N1 = N->getOperand(1);
52155 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52156 SDLoc dl(N);
52158 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
52159 // since there is no sign-extended shift right operation on a vector with
52160 // 64-bit elements.
52161 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
52162 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
52163 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52164 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52165 SDValue N00 = N0.getOperand(0);
52167 // EXTLOAD has a better solution on AVX2: it may be replaced with an
52168 // X86ISD::VSEXT node.
52169 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52170 if (!ISD::isNormalLoad(N00.getNode()))
52171 return SDValue();
52173 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
52174 // gets in the way.
52175 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
52176 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52178 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52179 SDValue Tmp =
52180 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52181 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52184 return SDValue();
52187 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52188 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52189 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52190 /// opportunities to combine math ops, use an LEA, or use a complex addressing
52191 /// mode. This can eliminate extend, add, and shift instructions.
52192 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
52193 const X86Subtarget &Subtarget) {
52194 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
52195 Ext->getOpcode() != ISD::ZERO_EXTEND)
52196 return SDValue();
52198 // TODO: This should be valid for other integer types.
52199 EVT VT = Ext->getValueType(0);
52200 if (VT != MVT::i64)
52201 return SDValue();
52203 SDValue Add = Ext->getOperand(0);
52204 if (Add.getOpcode() != ISD::ADD)
52205 return SDValue();
52207 SDValue AddOp0 = Add.getOperand(0);
52208 SDValue AddOp1 = Add.getOperand(1);
52209 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
52210 bool NSW = Add->getFlags().hasNoSignedWrap();
52211 bool NUW = Add->getFlags().hasNoUnsignedWrap();
52212 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
52213 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
52215 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
52216 // into the 'zext'
52217 if ((Sext && !NSW) || (!Sext && !NUW))
52218 return SDValue();
52220 // Having a constant operand to the 'add' ensures that we are not increasing
52221 // the instruction count because the constant is extended for free below.
52222 // A constant operand can also become the displacement field of an LEA.
52223 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
52224 if (!AddOp1C)
52225 return SDValue();
52227 // Don't make the 'add' bigger if there's no hope of combining it with some
52228 // other 'add' or 'shl' instruction.
52229 // TODO: It may be profitable to generate simpler LEA instructions in place
52230 // of single 'add' instructions, but the cost model for selecting an LEA
52231 // currently has a high threshold.
52232 bool HasLEAPotential = false;
52233 for (auto *User : Ext->uses()) {
52234 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
52235 HasLEAPotential = true;
52236 break;
52239 if (!HasLEAPotential)
52240 return SDValue();
52242 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
52243 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
52244 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
52245 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
52247 // The wider add is guaranteed not to wrap because both operands are
52248 // sign-extended (for nsw) or zero-extended (for nuw).
52249 SDNodeFlags Flags;
52250 Flags.setNoSignedWrap(NSW);
52251 Flags.setNoUnsignedWrap(NUW);
52252 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
52255 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
52256 // operands and the result of CMOV is not used anywhere else - promote CMOV
52257 // itself instead of promoting its result. This could be beneficial, because:
52258 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
52259 // (or more) pseudo-CMOVs only when they go one-after-another and
52260 // getting rid of result extension code after CMOV will help that.
52261 // 2) Promotion of constant CMOV arguments is free, hence the
52262 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
52263 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit encoding is 3 bytes,
52264 // so this promotion is also good in terms of code size.
52265 // (A 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
52266 // promotion.)
52267 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
52268 SDValue CMovN = Extend->getOperand(0);
52269 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
52270 return SDValue();
52272 EVT TargetVT = Extend->getValueType(0);
52273 unsigned ExtendOpcode = Extend->getOpcode();
52274 SDLoc DL(Extend);
52276 EVT VT = CMovN.getValueType();
52277 SDValue CMovOp0 = CMovN.getOperand(0);
52278 SDValue CMovOp1 = CMovN.getOperand(1);
52280 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52281 !isa<ConstantSDNode>(CMovOp1.getNode()))
52282 return SDValue();
52284 // Only extend to i32 or i64.
52285 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
52286 return SDValue();
52288 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
52289 // are free.
52290 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
52291 return SDValue();
52293 // If this is a zero extend to i64, we should only extend to i32 and use a free
52294 // zero extend to finish.
52295 EVT ExtendVT = TargetVT;
52296 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
52297 ExtendVT = MVT::i32;
52299 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
52300 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
52302 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
52303 CMovN.getOperand(2), CMovN.getOperand(3));
52305 // Finish extending if needed.
52306 if (ExtendVT != TargetVT)
52307 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
52309 return Res;
52312 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
52313 // result type.
52314 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
52315 const X86Subtarget &Subtarget) {
52316 SDValue N0 = N->getOperand(0);
52317 EVT VT = N->getValueType(0);
52318 SDLoc dl(N);
52320 // Only do this combine with AVX512 for vector extends.
52321 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
52322 return SDValue();
52324 // Only combine legal element types.
52325 EVT SVT = VT.getVectorElementType();
52326 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
52327 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
52328 return SDValue();
52330 // We don't have a CMPP instruction for vXf16.
52331 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
52332 return SDValue();
52333 // We can only do this if the vector size is 256 bits or less.
52334 unsigned Size = VT.getSizeInBits();
52335 if (Size > 256 && Subtarget.useAVX512Regs())
52336 return SDValue();
52338 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
52339 // those are the only integer compares we have.
52340 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
52341 if (ISD::isUnsignedIntSetCC(CC))
52342 return SDValue();
52344 // Only do this combine if the extension will be fully consumed by the setcc.
52345 EVT N00VT = N0.getOperand(0).getValueType();
52346 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
52347 if (Size != MatchingVecType.getSizeInBits())
52348 return SDValue();
52350 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
52352 if (N->getOpcode() == ISD::ZERO_EXTEND)
52353 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
52355 return Res;
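/// Do target-specific dag combines on ISD::SIGN_EXTEND nodes.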
52358 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
52359 TargetLowering::DAGCombinerInfo &DCI,
52360 const X86Subtarget &Subtarget) {
52361 SDValue N0 = N->getOperand(0);
52362 EVT VT = N->getValueType(0);
52363 SDLoc DL(N);
52365 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52366 if (!DCI.isBeforeLegalizeOps() &&
52367 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52368 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
52369 N0->getOperand(1));
52370 bool ReplaceOtherUses = !N0.hasOneUse();
52371 DCI.CombineTo(N, Setcc);
52372 // Replace other uses with a truncate of the widened setcc_carry.
52373 if (ReplaceOtherUses) {
52374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52375 N0.getValueType(), Setcc);
52376 DCI.CombineTo(N0.getNode(), Trunc);
52379 return SDValue(N, 0);
52382 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52383 return NewCMov;
52385 if (!DCI.isBeforeLegalizeOps())
52386 return SDValue();
52388 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52389 return V;
52391 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
52392 DAG, DCI, Subtarget))
52393 return V;
52395 if (VT.isVector()) {
52396 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52397 return R;
52399 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
52400 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
52403 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52404 return NewAdd;
52406 return SDValue();
52409 // Inverting a constant vector is profitable if it can be eliminated and the
52410 // inverted vector is already present in the DAG. Otherwise, it will be loaded
52411 // anyway.
52413 // We determine which of the values can be completely eliminated and invert it.
52414 // If both are eliminable, select a vector with the first negative element.
52415 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
52416 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
52417 "ConstantFP build vector expected");
52418 // Check if we can eliminate V. We assume that if a value is only used in
52419 // FMAs, we can eliminate it, since this function is invoked for each FMA
52420 // with this vector.
52421 auto IsNotFMA = [](SDNode *Use) {
52422 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
52424 if (llvm::any_of(V->uses(), IsNotFMA))
52425 return SDValue();
52427 SmallVector<SDValue, 8> Ops;
52428 EVT VT = V.getValueType();
52429 EVT EltVT = VT.getVectorElementType();
52430 for (auto Op : V->op_values()) {
52431 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
52432 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
52433 } else {
52434 assert(Op.isUndef());
52435 Ops.push_back(DAG.getUNDEF(EltVT));
52439 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
52440 if (!NV)
52441 return SDValue();
52443 // If an inverted version cannot be eliminated, choose it instead of the
52444 // original version.
52445 if (llvm::any_of(NV->uses(), IsNotFMA))
52446 return SDValue(NV, 0);
52448 // If the inverted version can also be eliminated, we have to consistently
52449 // prefer one of the values: we prefer the constant whose first (non-undef)
52450 // element is negative.
52451 // N.B. We need to skip undefs that may precede a value.
52452 for (auto op : V->op_values()) {
52453 if (auto *Cst = dyn_cast<ConstantFPSDNode>(op)) {
52454 if (Cst->isNegative())
52455 return SDValue();
52456 break;
52459 return SDValue(NV, 0);
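/// Do target-specific dag combines on FMA nodes: split a reassociable FMA into
/// FMUL+FADD when FMA is unavailable, and fold negated operands into the
/// appropriate FMA opcode variant.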
52462 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
52463 TargetLowering::DAGCombinerInfo &DCI,
52464 const X86Subtarget &Subtarget) {
52465 SDLoc dl(N);
52466 EVT VT = N->getValueType(0);
52467 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
52469 // Let legalize expand this if it isn't a legal type yet.
52470 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52471 if (!TLI.isTypeLegal(VT))
52472 return SDValue();
52474 SDValue A = N->getOperand(IsStrict ? 1 : 0);
52475 SDValue B = N->getOperand(IsStrict ? 2 : 1);
52476 SDValue C = N->getOperand(IsStrict ? 3 : 2);
52478 // If the operation allows fast-math and the target does not support FMA,
52479 // split this into mul+add to avoid libcall(s).
52480 SDNodeFlags Flags = N->getFlags();
52481 if (!IsStrict && Flags.hasAllowReassociation() &&
52482 TLI.isOperationExpand(ISD::FMA, VT)) {
52483 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
52484 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
52487 EVT ScalarVT = VT.getScalarType();
52488 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
52489 !Subtarget.hasAnyFMA()) &&
52490 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
52491 return SDValue();
52493 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
52494 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52495 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52496 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
52497 CodeSize)) {
52498 V = NegV;
52499 return true;
52501 // Look through extract_vector_elts. If it comes from an FNEG, create a
52502 // new extract from the FNEG input.
52503 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52504 isNullConstant(V.getOperand(1))) {
52505 SDValue Vec = V.getOperand(0);
52506 if (SDValue NegV = TLI.getCheaperNegatedExpression(
52507 Vec, DAG, LegalOperations, CodeSize)) {
52508 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
52509 NegV, V.getOperand(1));
52510 return true;
52513 // Lookup if there is an inverted version of constant vector V in DAG.
52514 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
52515 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
52516 V = NegV;
52517 return true;
52520 return false;
52523 // Do not convert the passthru input of scalar intrinsics.
52524 // FIXME: We could allow negations of the lower element only.
52525 bool NegA = invertIfNegative(A);
52526 bool NegB = invertIfNegative(B);
52527 bool NegC = invertIfNegative(C);
52529 if (!NegA && !NegB && !NegC)
52530 return SDValue();
52532 unsigned NewOpcode =
52533 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
52535 // Propagate fast-math-flags to new FMA node.
52536 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
52537 if (IsStrict) {
52538 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
52539 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
52540 {N->getOperand(0), A, B, C});
52541 } else {
52542 if (N->getNumOperands() == 4)
52543 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
52544 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
52548 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
52549 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
52550 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
52551 TargetLowering::DAGCombinerInfo &DCI) {
52552 SDLoc dl(N);
52553 EVT VT = N->getValueType(0);
52554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52555 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52556 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52558 SDValue N2 = N->getOperand(2);
52560 SDValue NegN2 =
52561 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
52562 if (!NegN2)
52563 return SDValue();
52564 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
52566 if (N->getNumOperands() == 4)
52567 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
52568 NegN2, N->getOperand(3));
52569 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
52570 NegN2);
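/// Do target-specific dag combines on ISD::ZERO_EXTEND and ISD::ANY_EXTEND
/// nodes.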
52573 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
52574 TargetLowering::DAGCombinerInfo &DCI,
52575 const X86Subtarget &Subtarget) {
52576 SDLoc dl(N);
52577 SDValue N0 = N->getOperand(0);
52578 EVT VT = N->getValueType(0);
52580 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52581 // FIXME: Is this needed? We don't seem to have any tests for it.
52582 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
52583 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52584 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
52585 N0->getOperand(1));
52586 bool ReplaceOtherUses = !N0.hasOneUse();
52587 DCI.CombineTo(N, Setcc);
52588 // Replace other uses with a truncate of the widened setcc_carry.
52589 if (ReplaceOtherUses) {
52590 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52591 N0.getValueType(), Setcc);
52592 DCI.CombineTo(N0.getNode(), Trunc);
52595 return SDValue(N, 0);
52598 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52599 return NewCMov;
52601 if (DCI.isBeforeLegalizeOps())
52602 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52603 return V;
52605 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
52606 DAG, DCI, Subtarget))
52607 return V;
52609 if (VT.isVector())
52610 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52611 return R;
52613 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52614 return NewAdd;
52616 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
52617 return R;
52619 // TODO: Combine with any target/faux shuffle.
52620 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
52621 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
52622 SDValue N00 = N0.getOperand(0);
52623 SDValue N01 = N0.getOperand(1);
52624 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
52625 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
52626 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
52627 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
52628 return concatSubVectors(N00, N01, DAG, dl);
52632 return SDValue();
52635 /// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
52636 /// pre-promote its result type since vXi1 vectors don't get promoted
52637 /// during type legalization.
52638 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
52639 SDValue RHS, ISD::CondCode CC,
52640 const SDLoc &DL, SelectionDAG &DAG,
52641 const X86Subtarget &Subtarget) {
52642 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
52643 VT.getVectorElementType() == MVT::i1 &&
52644 (OpVT.getVectorElementType() == MVT::i8 ||
52645 OpVT.getVectorElementType() == MVT::i16)) {
52646 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
52647 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
52649 return SDValue();
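/// Do target-specific dag combines on ISD::SETCC nodes.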
52652 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
52653 TargetLowering::DAGCombinerInfo &DCI,
52654 const X86Subtarget &Subtarget) {
52655 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
52656 const SDValue LHS = N->getOperand(0);
52657 const SDValue RHS = N->getOperand(1);
52658 EVT VT = N->getValueType(0);
52659 EVT OpVT = LHS.getValueType();
52660 SDLoc DL(N);
52662 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
52663 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
52664 Subtarget))
52665 return V;
52667 if (VT == MVT::i1) {
52668 X86::CondCode X86CC;
52669 if (SDValue V =
52670 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
52671 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
52674 if (OpVT.isScalarInteger()) {
52675 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
52676 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
52677 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
52678 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
52679 if (N0.getOperand(0) == N1)
52680 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
52681 N0.getOperand(1));
52682 if (N0.getOperand(1) == N1)
52683 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
52684 N0.getOperand(0));
52686 return SDValue();
52688 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
52689 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52690 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
52691 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52693 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
52694 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
52695 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
52696 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
52697 if (N0.getOperand(0) == N1)
52698 return DAG.getNode(ISD::AND, DL, OpVT, N1,
52699 DAG.getNOT(DL, N0.getOperand(1), OpVT));
52700 if (N0.getOperand(1) == N1)
52701 return DAG.getNode(ISD::AND, DL, OpVT, N1,
52702 DAG.getNOT(DL, N0.getOperand(0), OpVT));
52704 return SDValue();
52706 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
52707 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52708 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
52709 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52711 // cmpeq(trunc(x),C) --> cmpeq(x,C)
52712 // cmpne(trunc(x),C) --> cmpne(x,C)
52713 // iff the upper bits of x are known zero.
52714 if (LHS.getOpcode() == ISD::TRUNCATE &&
52715 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
52716 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
52717 EVT SrcVT = LHS.getOperand(0).getValueType();
52718 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
52719 OpVT.getScalarSizeInBits());
52720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52721 auto *C = cast<ConstantSDNode>(RHS);
52722 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
52723 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
52724 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
52725 DAG.getConstant(C->getAPIntValue().zextOrTrunc(
52726 SrcVT.getScalarSizeInBits()),
52727 DL, SrcVT),
52728 CC);
52731 // With C as a power of 2 and C != 0 and C != INT_MIN:
52732 // icmp eq Abs(X) C ->
52733 // (icmp eq X, C) | (icmp eq X, -C)
52734 // icmp ne Abs(X) C ->
52735 // (icmp ne X, C) & (icmp ne X, -C)
52736 // Both of these patterns can be better optimized in
52737 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
52738 // integers which is checked above.
52739 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
52740 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
52741 const APInt &CInt = C->getAPIntValue();
52742 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
52743 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
52744 SDValue BaseOp = LHS.getOperand(0);
52745 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
52746 SDValue SETCC1 = DAG.getSetCC(
52747 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
52748 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
52749 SETCC0, SETCC1);
52756 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
52757 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
52758 // Using temporaries to avoid messing up operand ordering for later
52759 // transformations if this doesn't work.
52760 SDValue Op0 = LHS;
52761 SDValue Op1 = RHS;
52762 ISD::CondCode TmpCC = CC;
52763 // Put build_vector on the right.
52764 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
52765 std::swap(Op0, Op1);
52766 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
52769 bool IsSEXT0 =
52770 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
52771 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
52772 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
52774 if (IsSEXT0 && IsVZero1) {
52775 assert(VT == Op0.getOperand(0).getValueType() &&
52776 "Unexpected operand type");
52777 if (TmpCC == ISD::SETGT)
52778 return DAG.getConstant(0, DL, VT);
52779 if (TmpCC == ISD::SETLE)
52780 return DAG.getConstant(1, DL, VT);
52781 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
52782 return DAG.getNOT(DL, Op0.getOperand(0), VT);
52784 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
52785 "Unexpected condition code!");
52786 return Op0.getOperand(0);
52790 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
52791 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
52792 // better to use `PCMPGT` if the result is meant to stay in a vector (if it's
52793 // going to a mask, AVX512 also provides unsigned comparisons).
52794 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
52795 bool CanMakeSigned = false;
52796 if (ISD::isUnsignedIntSetCC(CC)) {
52797 KnownBits CmpKnown =
52798 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
52799 // If we know LHS/RHS share the same sign bit at each element we can
52800 // make this signed.
52801 // NOTE: `computeKnownBits` on a vector type aggregates common bits
52802 // across all lanes. So a pattern where the sign varies from lane to
52803 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
52804 // missed. We could get around this by demanding each lane
52805 // independently, but this isn't the most important optimization and
52806 // that may eat into compile time.
52807 CanMakeSigned =
52808 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
52810 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
52811 SDValue LHSOut = LHS;
52812 SDValue RHSOut = RHS;
52813 ISD::CondCode NewCC = CC;
52814 switch (CC) {
52815 case ISD::SETGE:
52816 case ISD::SETUGE:
52817 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
52818 /*NSW*/ true))
52819 LHSOut = NewLHS;
52820 else if (SDValue NewRHS = incDecVectorConstant(
52821 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
52822 RHSOut = NewRHS;
52823 else
52824 break;
52826 [[fallthrough]];
52827 case ISD::SETUGT:
52828 NewCC = ISD::SETGT;
52829 break;
52831 case ISD::SETLE:
52832 case ISD::SETULE:
52833 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
52834 /*NSW*/ true))
52835 LHSOut = NewLHS;
52836 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
52837 /*NSW*/ true))
52838 RHSOut = NewRHS;
52839 else
52840 break;
52842 [[fallthrough]];
52843 case ISD::SETULT:
52844 // Will be swapped to SETGT in LowerVSETCC*.
52845 NewCC = ISD::SETLT;
52846 break;
52847 default:
52848 break;
52850 if (NewCC != CC) {
52851 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
52852 NewCC, DL, DAG, Subtarget))
52853 return R;
52854 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
52859 if (SDValue R =
52860 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
52861 return R;
52863 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
52864 // to avoid scalarization via legalization because v4i32 is not a legal type.
52865 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
52866 LHS.getValueType() == MVT::v4f32)
52867 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
52869 // X pred 0.0 --> X pred -X
52870 // If the negation of X already exists, use it in the comparison. This removes
52871 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
52872 // instructions in patterns with a 'select' node.
52873 if (isNullFPScalarOrVectorConst(RHS)) {
52874 SDVTList FNegVT = DAG.getVTList(OpVT);
52875 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
52876 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
52879 return SDValue();
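/// Do target-specific dag combines on X86ISD::MOVMSK nodes.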
52882 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
52883 TargetLowering::DAGCombinerInfo &DCI,
52884 const X86Subtarget &Subtarget) {
52885 SDValue Src = N->getOperand(0);
52886 MVT SrcVT = Src.getSimpleValueType();
52887 MVT VT = N->getSimpleValueType(0);
52888 unsigned NumBits = VT.getScalarSizeInBits();
52889 unsigned NumElts = SrcVT.getVectorNumElements();
52890 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
52891 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
52893 // Perform constant folding.
52894 APInt UndefElts;
52895 SmallVector<APInt, 32> EltBits;
52896 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
52897 APInt Imm(32, 0);
52898 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
52899 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
52900 Imm.setBit(Idx);
52902 return DAG.getConstant(Imm, SDLoc(N), VT);
52905 // Look through int->fp bitcasts that don't change the element width.
52906 unsigned EltWidth = SrcVT.getScalarSizeInBits();
52907 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
52908 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
52909 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
52911 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
52912 // with scalar comparisons.
52913 if (SDValue NotSrc = IsNOT(Src, DAG)) {
52914 SDLoc DL(N);
52915 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
52916 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
52917 return DAG.getNode(ISD::XOR, DL, VT,
52918 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
52919 DAG.getConstant(NotMask, DL, VT));
52922 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
52923 // results with scalar comparisons.
52924 if (Src.getOpcode() == X86ISD::PCMPGT &&
52925 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
52926 SDLoc DL(N);
52927 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
52928 return DAG.getNode(ISD::XOR, DL, VT,
52929 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
52930 DAG.getConstant(NotMask, DL, VT));
52933 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
52934 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
52935 // iff pow2splat(c1).
52936 // Use KnownBits to determine if only a single bit is non-zero
52937 // in each element (pow2 or zero), and shift that bit to the msb.
52938 if (Src.getOpcode() == X86ISD::PCMPEQ) {
52939 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
52940 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
52941 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
52942 if (KnownLHS.countMaxPopulation() == 1 &&
52943 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
52944 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
52945 SDLoc DL(N);
52946 MVT ShiftVT = SrcVT;
52947 SDValue ShiftLHS = Src.getOperand(0);
52948 SDValue ShiftRHS = Src.getOperand(1);
52949 if (ShiftVT.getScalarType() == MVT::i8) {
52950 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
52951 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
52952 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
52953 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
52955 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
52956 ShiftLHS, ShiftAmt, DAG);
52957 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
52958 ShiftRHS, ShiftAmt, DAG);
52959 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
52960 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
52961 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
52962 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
52966 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
52967 if (N->isOnlyUserOf(Src.getNode())) {
52968 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
52969 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
52970 APInt UndefElts;
52971 SmallVector<APInt, 32> EltBits;
52972 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
52973 UndefElts, EltBits)) {
52974 APInt Mask = APInt::getZero(NumBits);
52975 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
52976 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
52977 Mask.setBit(Idx);
52979 SDLoc DL(N);
52980 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
52981 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
52982 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
52983 DAG.getConstant(Mask, DL, VT));
52988 // Simplify the inputs.
52989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52990 APInt DemandedMask(APInt::getAllOnes(NumBits));
52991 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52992 return SDValue(N, 0);
52994 return SDValue();
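/// Do target-specific dag combines on X86ISD::TESTP nodes by simplifying the
/// demanded bits of the inputs.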
52997 static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
52998 TargetLowering::DAGCombinerInfo &DCI,
52999 const X86Subtarget &Subtarget) {
53000 MVT VT = N->getSimpleValueType(0);
53001 unsigned NumBits = VT.getScalarSizeInBits();
53003 // Simplify the inputs.
53004 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53005 APInt DemandedMask(APInt::getAllOnes(NumBits));
53006 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53007 return SDValue(N, 0);
53009 return SDValue();
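// X86 masked gather/scatter nodes only test the sign bit of each mask
// element, so only that bit needs to be demanded.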
53012 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53013 TargetLowering::DAGCombinerInfo &DCI) {
53014 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53015 SDValue Mask = MemOp->getMask();
53017 // With vector masks we only demand the upper bit of the mask.
53018 if (Mask.getScalarValueSizeInBits() != 1) {
53019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53020 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53021 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53022 if (N->getOpcode() != ISD::DELETED_NODE)
53023 DCI.AddToWorklist(N);
53024 return SDValue(N, 0);
53028 return SDValue();
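// Rebuild a masked gather/scatter node with updated index, base and scale
// operands.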
53031 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53032 SDValue Index, SDValue Base, SDValue Scale,
53033 SelectionDAG &DAG) {
53034 SDLoc DL(GorS);
53036 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53037 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53038 Gather->getMask(), Base, Index, Scale } ;
53039 return DAG.getMaskedGather(Gather->getVTList(),
53040 Gather->getMemoryVT(), DL, Ops,
53041 Gather->getMemOperand(),
53042 Gather->getIndexType(),
53043 Gather->getExtensionType());
53045 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53046 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53047 Scatter->getMask(), Base, Index, Scale };
53048 return DAG.getMaskedScatter(Scatter->getVTList(),
53049 Scatter->getMemoryVT(), DL,
53050 Ops, Scatter->getMemOperand(),
53051 Scatter->getIndexType(),
53052 Scatter->isTruncatingStore());
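// Combine generic MGATHER/MSCATTER nodes: shrink oversized index types, fold
// splat constant adders into the base pointer, and simplify the mask.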
53055 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53056 TargetLowering::DAGCombinerInfo &DCI) {
53057 SDLoc DL(N);
53058 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53059 SDValue Index = GorS->getIndex();
53060 SDValue Base = GorS->getBasePtr();
53061 SDValue Scale = GorS->getScale();
53063 if (DCI.isBeforeLegalize()) {
53064 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53066 // Shrink constant indices if they are larger than 32-bits.
53067 // Only do this before legalize types since v2i64 could become v2i32.
53068 // FIXME: We could check that the type is legal if we're after legalize
53069 // types, but then we would need to construct test cases where that happens.
53070 // FIXME: We could support more than just constant vectors, but we need to be
53071 // careful with costing. A truncate that can be optimized out would be fine.
53072 // Otherwise we might only want to create a truncate if it avoids a split.
53073 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53074 if (BV->isConstant() && IndexWidth > 32 &&
53075 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53076 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53077 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53078 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53082 // Shrink any sign/zero extend from a type of 32 bits or smaller to a type
53083 // larger than 32 bits if there are sufficient sign bits. Only do this before
53084 // legalize types to avoid creating illegal types in the truncate.
53085 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53086 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53087 IndexWidth > 32 &&
53088 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53089 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53090 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53091 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53092 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53096 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53097 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53098 // Try to move splat constant adders from the index operand to the base
53099 // pointer operand, taking care to multiply by the scale. We can only do
53100 // this when the index element type is the same as the pointer type.
53101 // Otherwise we need to be sure the math doesn't wrap before the scale.
53102 if (Index.getOpcode() == ISD::ADD &&
53103 Index.getValueType().getVectorElementType() == PtrVT &&
53104 isa<ConstantSDNode>(Scale)) {
53105 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53106 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53107 BitVector UndefElts;
53108 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53109 // FIXME: Allow non-constant?
53110 if (UndefElts.none()) {
53111 // Apply the scale.
53112 APInt Adder = C->getAPIntValue() * ScaleAmt;
53113 // Add it to the existing base.
53114 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53115 DAG.getConstant(Adder, DL, PtrVT));
53116 Index = Index.getOperand(0);
53117 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53121 // It's also possible that the base is just a constant. In that case, just
53122 // replace it with 0 and move the displacement into the index.
53123 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53124 isOneConstant(Scale)) {
53125 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53126 // Combine the constant build_vector and the constant base.
53127 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53128 Index.getOperand(1), Splat);
53129 // Add to the LHS of the original Index add.
53130 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53131 Index.getOperand(0), Splat);
53132 Base = DAG.getConstant(0, DL, Base.getValueType());
53133 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53138 if (DCI.isBeforeLegalizeOps()) {
53139 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53141 // Make sure the index is either i32 or i64
53142 if (IndexWidth != 32 && IndexWidth != 64) {
53143 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
53144 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
53145 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
53146 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53150 // With vector masks we only demand the upper bit of the mask.
53151 SDValue Mask = GorS->getMask();
53152 if (Mask.getScalarValueSizeInBits() != 1) {
53153 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53154 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53155 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53156 if (N->getOpcode() != ISD::DELETED_NODE)
53157 DCI.AddToWorklist(N);
53158 return SDValue(N, 0);
53162 return SDValue();
53165 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
53166 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
53167 const X86Subtarget &Subtarget) {
53168 SDLoc DL(N);
53169 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
53170 SDValue EFLAGS = N->getOperand(1);
53172 // Try to simplify the EFLAGS and condition code operands.
53173 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
53174 return getSETCC(CC, Flags, DL, DAG);
53176 return SDValue();
53179 /// Optimize branch condition evaluation.
53180 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
53181 const X86Subtarget &Subtarget) {
53182 SDLoc DL(N);
53183 SDValue EFLAGS = N->getOperand(3);
53184 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
53186 // Try to simplify the EFLAGS and condition code operands.
53187 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
53188 // RAUW them under us.
53189 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
53190 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
53191 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
53192 N->getOperand(1), Cond, Flags);
53195 return SDValue();
53198 // TODO: Could we move this to DAGCombine?
53199 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
53200 SelectionDAG &DAG) {
53201 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
53202 // to optimize away the operation when its operand is effectively a constant.
53204 // The general transformation is:
53205 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
53206 // AND(VECTOR_CMP(x,y), constant2)
53207 // constant2 = UNARYOP(constant)
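// For example (illustrative, v4i32 -> v4f32):
//   sitofp (and (vector_cmp x, y), <4 x i32> <1, 2, 3, 4>)
//     --> bitcast (and (vector_cmp x, y),
//                      bitcast (<4 x float> <1.0, 2.0, 3.0, 4.0>))
// Each lane of the compare is all-ones or all-zeros, so masking the converted
// constant gives the same result as converting the masked constant.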
53209 // Early exit if this isn't a vector operation, the operand of the
53210 // unary operation isn't a bitwise AND, or if the sizes of the operations
53211 // aren't the same.
53212 EVT VT = N->getValueType(0);
53213 bool IsStrict = N->isStrictFPOpcode();
53214 unsigned NumEltBits = VT.getScalarSizeInBits();
53215 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53216 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
53217 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
53218 VT.getSizeInBits() != Op0.getValueSizeInBits())
53219 return SDValue();
53221 // Now check that the other operand of the AND is a constant. We could
53222 // make the transformation for non-constant splats as well, but it's unclear
53223 // that would be a benefit as it would not eliminate any operations, just
53224 // perform one more step in scalar code before moving to the vector unit.
53225 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
53226 // Bail out if the vector isn't a constant.
53227 if (!BV->isConstant())
53228 return SDValue();
53230 // Everything checks out. Build up the new and improved node.
53231 SDLoc DL(N);
53232 EVT IntVT = BV->getValueType(0);
53233 // Create a new constant of the appropriate type for the transformed
53234 // DAG.
53235 SDValue SourceConst;
53236 if (IsStrict)
53237 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
53238 {N->getOperand(0), SDValue(BV, 0)});
53239 else
53240 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
53241 // The AND node needs bitcasts to/from an integer vector type around it.
53242 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
53243 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
53244 MaskConst);
53245 SDValue Res = DAG.getBitcast(VT, NewAnd);
53246 if (IsStrict)
53247 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
53248 return Res;
53251 return SDValue();
53254 /// If we are converting a value to floating-point, try to replace scalar
53255 /// truncate of an extracted vector element with a bitcast. This tries to keep
53256 /// the sequence on XMM registers rather than moving between vector and GPRs.
53257 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
53258 // TODO: This is currently only used by combineSIntToFP, but it is generalized
53259 // to allow being called by any similar cast opcode.
53260 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
53261 SDValue Trunc = N->getOperand(0);
53262 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
53263 return SDValue();
53265 SDValue ExtElt = Trunc.getOperand(0);
53266 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53267 !isNullConstant(ExtElt.getOperand(1)))
53268 return SDValue();
53270 EVT TruncVT = Trunc.getValueType();
53271 EVT SrcVT = ExtElt.getValueType();
53272 unsigned DestWidth = TruncVT.getSizeInBits();
53273 unsigned SrcWidth = SrcVT.getSizeInBits();
53274 if (SrcWidth % DestWidth != 0)
53275 return SDValue();
53277 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
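// For example (illustrative), with X : v2i64 and an i64 -> i32 truncate:
//   sitofp (trunc (extractelt X, 0))
//     --> sitofp (extractelt (bitcast X to v4i32), 0)
// On little-endian x86, lane 0 of the bitcast is the low 32 bits of element 0.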
53278 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
53279 unsigned VecWidth = SrcVecVT.getSizeInBits();
53280 unsigned NumElts = VecWidth / DestWidth;
53281 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
53282 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
53283 SDLoc DL(N);
53284 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
53285 BitcastVec, ExtElt.getOperand(1));
53286 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
53289 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
53290 const X86Subtarget &Subtarget) {
53291 bool IsStrict = N->isStrictFPOpcode();
53292 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53293 EVT VT = N->getValueType(0);
53294 EVT InVT = Op0.getValueType();
53296 // Using i16 as an intermediate type is a bad idea, unless we have HW support
53297 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
53298 // if hasFP16 support:
53299 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
53300 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
53301 // else
53302 // UINT_TO_FP(vXi1~31) -> UINT_TO_FP(ZEXT(vXi1~31 to vXi32))
53303 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
53304 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53305 unsigned ScalarSize = InVT.getScalarSizeInBits();
53306 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
53307 ScalarSize >= 64)
53308 return SDValue();
53309 SDLoc dl(N);
53310 EVT DstVT =
53311 EVT::getVectorVT(*DAG.getContext(),
53312 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
53313 : ScalarSize < 32 ? MVT::i32
53314 : MVT::i64,
53315 InVT.getVectorNumElements());
53316 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53317 if (IsStrict)
53318 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
53319 {N->getOperand(0), P});
53320 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
53323 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
53324 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
53325 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
53326 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53327 VT.getScalarType() != MVT::f16) {
53328 SDLoc dl(N);
53329 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53330 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53332 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
53333 if (IsStrict)
53334 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53335 {N->getOperand(0), P});
53336 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53339 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
53340 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
53341 // the optimization here.
53342 if (DAG.SignBitIsZero(Op0)) {
53343 if (IsStrict)
53344 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
53345 {N->getOperand(0), Op0});
53346 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
53349 return SDValue();
53352 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
53353 TargetLowering::DAGCombinerInfo &DCI,
53354 const X86Subtarget &Subtarget) {
53355 // First try to optimize away the conversion entirely when it's
53356 // conditionally from a constant. Vectors only.
53357 bool IsStrict = N->isStrictFPOpcode();
53358 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
53359 return Res;
53361 // Now move on to more general possibilities.
53362 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53363 EVT VT = N->getValueType(0);
53364 EVT InVT = Op0.getValueType();
53366 // Using i16 as an intermediate type is a bad idea, unless we have HW support
53367 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
53368 // if hasFP16 support:
53369 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
53370 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
53371 // else
53372 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
53373 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
53374 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53375 unsigned ScalarSize = InVT.getScalarSizeInBits();
53376 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
53377 ScalarSize >= 64)
53378 return SDValue();
53379 SDLoc dl(N);
53380 EVT DstVT =
53381 EVT::getVectorVT(*DAG.getContext(),
53382 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
53383 : ScalarSize < 32 ? MVT::i32
53384 : MVT::i64,
53385 InVT.getVectorNumElements());
53386 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53387 if (IsStrict)
53388 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53389 {N->getOperand(0), P});
53390 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53393 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
53394 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
53395 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
53396 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53397 VT.getScalarType() != MVT::f16) {
53398 SDLoc dl(N);
53399 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53400 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53401 if (IsStrict)
53402 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53403 {N->getOperand(0), P});
53404 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53407 // Without AVX512DQ we only support i64 to float scalar conversion. For both
53408 // vectors and scalars, see if we know that the upper bits are all the sign
53409 // bit, in which case we can truncate the input to i32 and convert from that.
53410 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
53411 unsigned BitWidth = InVT.getScalarSizeInBits();
53412 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
53413 if (NumSignBits >= (BitWidth - 31)) {
53414 EVT TruncVT = MVT::i32;
53415 if (InVT.isVector())
53416 TruncVT = InVT.changeVectorElementType(TruncVT);
53417 SDLoc dl(N);
53418 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
53419 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
53420 if (IsStrict)
53421 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53422 {N->getOperand(0), Trunc});
53423 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
53425 // If we're after legalize and the type is v2i32 we need to shuffle and
53426 // use CVTSI2P.
53427 assert(InVT == MVT::v2i64 && "Unexpected VT!");
53428 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
53429 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
53430 { 0, 2, -1, -1 });
53431 if (IsStrict)
53432 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
53433 {N->getOperand(0), Shuf});
53434 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
53438 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
53439 // a 32-bit target where SSE doesn't support i64->FP operations.
53440 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
53441 Op0.getOpcode() == ISD::LOAD) {
53442 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
53444 // This transformation is not supported if the result type is f16 or f128.
53445 if (VT == MVT::f16 || VT == MVT::f128)
53446 return SDValue();
53448 // If we have AVX512DQ we can use packed conversion instructions unless
53449 // the VT is f80.
53450 if (Subtarget.hasDQI() && VT != MVT::f80)
53451 return SDValue();
53453 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
53454 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
53455 std::pair<SDValue, SDValue> Tmp =
53456 Subtarget.getTargetLowering()->BuildFILD(
53457 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
53458 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
53459 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
53460 return Tmp.first;
53464 if (IsStrict)
53465 return SDValue();
53467 if (SDValue V = combineToFPTruncExtElt(N, DAG))
53468 return V;
53470 return SDValue();
53473 static bool needCarryOrOverflowFlag(SDValue Flags) {
53474 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
53476 for (const SDNode *User : Flags->uses()) {
53477 X86::CondCode CC;
53478 switch (User->getOpcode()) {
53479 default:
53480 // Be conservative.
53481 return true;
53482 case X86ISD::SETCC:
53483 case X86ISD::SETCC_CARRY:
53484 CC = (X86::CondCode)User->getConstantOperandVal(0);
53485 break;
53486 case X86ISD::BRCOND:
53487 case X86ISD::CMOV:
53488 CC = (X86::CondCode)User->getConstantOperandVal(2);
53489 break;
53492 switch (CC) {
53493 default: break;
53494 case X86::COND_A: case X86::COND_AE:
53495 case X86::COND_B: case X86::COND_BE:
53496 case X86::COND_O: case X86::COND_NO:
53497 case X86::COND_G: case X86::COND_GE:
53498 case X86::COND_L: case X86::COND_LE:
53499 return true;
53503 return false;
53506 static bool onlyZeroFlagUsed(SDValue Flags) {
53507 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
53509 for (const SDNode *User : Flags->uses()) {
53510 unsigned CCOpNo;
53511 switch (User->getOpcode()) {
53512 default:
53513 // Be conservative.
53514 return false;
53515 case X86ISD::SETCC:
53516 case X86ISD::SETCC_CARRY:
53517 CCOpNo = 0;
53518 break;
53519 case X86ISD::BRCOND:
53520 case X86ISD::CMOV:
53521 CCOpNo = 2;
53522 break;
53525 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
53526 if (CC != X86::COND_E && CC != X86::COND_NE)
53527 return false;
53530 return true;
53533 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
53534 const X86Subtarget &Subtarget) {
53535 // Only handle test patterns.
53536 if (!isNullConstant(N->getOperand(1)))
53537 return SDValue();
53539 // If we have a CMP of a truncated binop, see if we can make a smaller binop
53540 // and use its flags directly.
53541 // TODO: Maybe we should try promoting compares that only use the zero flag
53542 // first if we can prove the upper bits with computeKnownBits?
53543 SDLoc dl(N);
53544 SDValue Op = N->getOperand(0);
53545 EVT VT = Op.getValueType();
53546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53548 // If we have a constant logical shift that's only used in a comparison
53549 // against zero turn it into an equivalent AND. This allows turning it into
53550 // a TEST instruction later.
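// For example (illustrative, i32):
//   cmp (srl X, 8), 0  -->  cmp (and X, 0xFFFFFF00), 0
//   cmp (shl X, 8), 0  -->  cmp (and X, 0x00FFFFFF), 0
// which isel can then select as a TEST with an immediate.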
53551 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
53552 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
53553 onlyZeroFlagUsed(SDValue(N, 0))) {
53554 unsigned BitWidth = VT.getSizeInBits();
53555 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
53556 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
53557 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
53558 APInt Mask = Op.getOpcode() == ISD::SRL
53559 ? APInt::getHighBitsSet(BitWidth, MaskBits)
53560 : APInt::getLowBitsSet(BitWidth, MaskBits);
53561 if (Mask.isSignedIntN(32)) {
53562 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
53563 DAG.getConstant(Mask, dl, VT));
53564 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53565 DAG.getConstant(0, dl, VT));
53570 // If we're extracting from an AVX512 bool vector and comparing against zero,
53571 // then try to just bitcast the vector to an integer to use TEST/BT directly.
53572 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
53573 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
53574 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
53575 SDValue Src = Op.getOperand(0);
53576 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53577 isNullConstant(Src.getOperand(1)) &&
53578 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
53579 SDValue BoolVec = Src.getOperand(0);
53580 unsigned ShAmt = 0;
53581 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
53582 ShAmt = BoolVec.getConstantOperandVal(1);
53583 BoolVec = BoolVec.getOperand(0);
53585 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
53586 EVT VecVT = BoolVec.getValueType();
53587 unsigned BitWidth = VecVT.getVectorNumElements();
53588 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
53589 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
53590 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
53591 Op = DAG.getBitcast(BCVT, BoolVec);
53592 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
53593 DAG.getConstant(Mask, dl, BCVT));
53594 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53595 DAG.getConstant(0, dl, BCVT));
53600 // Peek through any zero-extend if we're only testing for a zero result.
53601 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
53602 SDValue Src = Op.getOperand(0);
53603 EVT SrcVT = Src.getValueType();
53604 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
53605 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
53606 DAG.getConstant(0, dl, SrcVT));
53609 // Look for a truncate.
53610 if (Op.getOpcode() != ISD::TRUNCATE)
53611 return SDValue();
53613 SDValue Trunc = Op;
53614 Op = Op.getOperand(0);
53616 // See if we can compare with zero against the truncation source,
53617 // which should help using the Z flag from many ops. Only do this for an
53618 // i32 truncated op to prevent partial-reg compares of promoted ops.
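// For example (illustrative): if the upper 24 bits of an i32 value Y are known
// to be zero, then
//   cmp (trunc Y to i8), 0  -->  cmp Y, 0
// which avoids a compare on the 8-bit subregister.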
53619 EVT OpVT = Op.getValueType();
53620 APInt UpperBits =
53621 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
53622 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
53623 onlyZeroFlagUsed(SDValue(N, 0))) {
53624 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53625 DAG.getConstant(0, dl, OpVT));
53628 // After this the truncate and arithmetic op must have a single use.
53629 if (!Trunc.hasOneUse() || !Op.hasOneUse())
53630 return SDValue();
53632 unsigned NewOpc;
53633 switch (Op.getOpcode()) {
53634 default: return SDValue();
53635 case ISD::AND:
53636 // Skip AND with a constant. We have special handling for AND with an
53637 // immediate during isel to generate TEST instructions.
53638 if (isa<ConstantSDNode>(Op.getOperand(1)))
53639 return SDValue();
53640 NewOpc = X86ISD::AND;
53641 break;
53642 case ISD::OR: NewOpc = X86ISD::OR; break;
53643 case ISD::XOR: NewOpc = X86ISD::XOR; break;
53644 case ISD::ADD:
53645 // If the carry or overflow flag is used, we can't truncate.
53646 if (needCarryOrOverflowFlag(SDValue(N, 0)))
53647 return SDValue();
53648 NewOpc = X86ISD::ADD;
53649 break;
53650 case ISD::SUB:
53651 // If the carry or overflow flag is used, we can't truncate.
53652 if (needCarryOrOverflowFlag(SDValue(N, 0)))
53653 return SDValue();
53654 NewOpc = X86ISD::SUB;
53655 break;
53658 // We found an op we can narrow. Truncate its inputs.
53659 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
53660 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
53662 // Use a X86 specific opcode to avoid DAG combine messing with it.
53663 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53664 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
53666 // For AND, keep a CMP so that we can match the test pattern.
53667 if (NewOpc == X86ISD::AND)
53668 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53669 DAG.getConstant(0, dl, VT));
53671 // Return the flags.
53672 return Op.getValue(1);
53675 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
53676 TargetLowering::DAGCombinerInfo &DCI) {
53677 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
53678 "Expected X86ISD::ADD or X86ISD::SUB");
53680 SDLoc DL(N);
53681 SDValue LHS = N->getOperand(0);
53682 SDValue RHS = N->getOperand(1);
53683 MVT VT = LHS.getSimpleValueType();
53684 bool IsSub = X86ISD::SUB == N->getOpcode();
53685 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
53687 // If we don't use the flag result, simplify back to a generic ADD/SUB.
53688 if (!N->hasAnyUseOfValue(1)) {
53689 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
53690 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
53693 // Fold any similar generic ADD/SUB opcodes to reuse this node.
53694 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
53695 SDValue Ops[] = {N0, N1};
53696 SDVTList VTs = DAG.getVTList(N->getValueType(0));
53697 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
53698 SDValue Op(N, 0);
53699 if (Negate)
53700 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
53701 DCI.CombineTo(GenericAddSub, Op);
53704 MatchGeneric(LHS, RHS, false);
53705 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
53707 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
53708 // EFLAGS result doesn't change.
53709 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
53710 /*ZeroSecondOpOnly*/ true);
53713 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
53714 SDValue LHS = N->getOperand(0);
53715 SDValue RHS = N->getOperand(1);
53716 SDValue BorrowIn = N->getOperand(2);
53718 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
53719 MVT VT = N->getSimpleValueType(0);
53720 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53721 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
53724 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
53725 // iff the flag result is dead.
53726 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
53727 !N->hasAnyUseOfValue(1))
53728 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
53729 LHS.getOperand(1), BorrowIn);
53731 return SDValue();
53734 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
53735 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
53736 TargetLowering::DAGCombinerInfo &DCI) {
53737 SDValue LHS = N->getOperand(0);
53738 SDValue RHS = N->getOperand(1);
53739 SDValue CarryIn = N->getOperand(2);
53740 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
53741 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
53743 // Canonicalize constant to RHS.
53744 if (LHSC && !RHSC)
53745 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
53746 CarryIn);
53748 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
53749 // the result is either zero or one (depending on the input carry bit).
53750 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
53751 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
53752 // We don't have a good way to replace an EFLAGS use, so only do this when
53753 // dead right now.
53754 SDValue(N, 1).use_empty()) {
53755 SDLoc DL(N);
53756 EVT VT = N->getValueType(0);
53757 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
53758 SDValue Res1 = DAG.getNode(
53759 ISD::AND, DL, VT,
53760 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
53761 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
53762 DAG.getConstant(1, DL, VT));
53763 return DCI.CombineTo(N, Res1, CarryOut);
53766 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
53767 // iff the flag result is dead.
53768 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
53769 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
53770 SDLoc DL(N);
53771 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
53772 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
53773 DAG.getConstant(0, DL, LHS.getValueType()),
53774 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
53777 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
53778 MVT VT = N->getSimpleValueType(0);
53779 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53780 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
53783 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
53784 // iff the flag result is dead.
53785 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
53786 !N->hasAnyUseOfValue(1))
53787 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
53788 LHS.getOperand(1), CarryIn);
53790 return SDValue();
53793 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
53794 const SDLoc &DL, EVT VT,
53795 const X86Subtarget &Subtarget) {
53796 // Example of pattern we try to detect:
53797 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
53798 //(add (build_vector (extract_elt t, 0),
53799 // (extract_elt t, 2),
53800 // (extract_elt t, 4),
53801 // (extract_elt t, 6)),
53802 // (build_vector (extract_elt t, 1),
53803 // (extract_elt t, 3),
53804 // (extract_elt t, 5),
53805 // (extract_elt t, 7)))
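// When the multiply inputs fit in 16 bits (as above, where they are sign
// extensions of v8i16 values), the whole pattern can be replaced by a single
// PMADDWD, roughly (sketch):
//   (v4i32 X86ISD::VPMADDWD x0, x1)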
53807 if (!Subtarget.hasSSE2())
53808 return SDValue();
53810 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
53811 Op1.getOpcode() != ISD::BUILD_VECTOR)
53812 return SDValue();
53814 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
53815 VT.getVectorNumElements() < 4 ||
53816 !isPowerOf2_32(VT.getVectorNumElements()))
53817 return SDValue();
53819 // Check if one of Op0,Op1 is of the form:
53820 // (build_vector (extract_elt Mul, 0),
53821 // (extract_elt Mul, 2),
53822 // (extract_elt Mul, 4),
53823 // ...
53824 // the other is of the form:
53825 // (build_vector (extract_elt Mul, 1),
53826 // (extract_elt Mul, 3),
53827 // (extract_elt Mul, 5),
53828 // ...
53829 // and identify Mul.
53830 SDValue Mul;
53831 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
53832 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
53833 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
53834 // TODO: Be more tolerant to undefs.
53835 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53836 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53837 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53838 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53839 return SDValue();
53840 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
53841 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
53842 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
53843 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
53844 if (!Const0L || !Const1L || !Const0H || !Const1H)
53845 return SDValue();
53846 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
53847 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
53848 // Commutativity of mul allows factors of a product to reorder.
53849 if (Idx0L > Idx1L)
53850 std::swap(Idx0L, Idx1L);
53851 if (Idx0H > Idx1H)
53852 std::swap(Idx0H, Idx1H);
53853 // Commutativity of add allows pairs of factors to reorder.
53854 if (Idx0L > Idx0H) {
53855 std::swap(Idx0L, Idx0H);
53856 std::swap(Idx1L, Idx1H);
53858 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
53859 Idx1H != 2 * i + 3)
53860 return SDValue();
53861 if (!Mul) {
53862 // First time an extract_elt's source vector is visited. Must be a MUL
53863 // with twice as many vector elements as the BUILD_VECTOR.
53864 // Both extracts must be from the same MUL.
53865 Mul = Op0L->getOperand(0);
53866 if (Mul->getOpcode() != ISD::MUL ||
53867 Mul.getValueType().getVectorNumElements() != 2 * e)
53868 return SDValue();
53870 // Check that the extract is from the same MUL previously seen.
53871 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
53872 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
53873 return SDValue();
53876 // Check if the Mul source can be safely shrunk.
53877 ShrinkMode Mode;
53878 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
53879 Mode == ShrinkMode::MULU16)
53880 return SDValue();
53882 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53883 VT.getVectorNumElements() * 2);
53884 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
53885 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
53887 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53888 ArrayRef<SDValue> Ops) {
53889 EVT InVT = Ops[0].getValueType();
53890 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53891 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
53892 InVT.getVectorNumElements() / 2);
53893 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
53895 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
53898 // Attempt to turn this pattern into PMADDWD.
53899 // (add (mul (sext (build_vector)), (sext (build_vector))),
53900 // (mul (sext (build_vector)), (sext (build_vector)))
53901 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
53902 const SDLoc &DL, EVT VT,
53903 const X86Subtarget &Subtarget) {
53904 if (!Subtarget.hasSSE2())
53905 return SDValue();
53907 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53908 return SDValue();
53910 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
53911 VT.getVectorNumElements() < 4 ||
53912 !isPowerOf2_32(VT.getVectorNumElements()))
53913 return SDValue();
53915 SDValue N00 = N0.getOperand(0);
53916 SDValue N01 = N0.getOperand(1);
53917 SDValue N10 = N1.getOperand(0);
53918 SDValue N11 = N1.getOperand(1);
53920 // All inputs need to be sign extends.
53921 // TODO: Support ZERO_EXTEND from known positive?
53922 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
53923 N01.getOpcode() != ISD::SIGN_EXTEND ||
53924 N10.getOpcode() != ISD::SIGN_EXTEND ||
53925 N11.getOpcode() != ISD::SIGN_EXTEND)
53926 return SDValue();
53928 // Peek through the extends.
53929 N00 = N00.getOperand(0);
53930 N01 = N01.getOperand(0);
53931 N10 = N10.getOperand(0);
53932 N11 = N11.getOperand(0);
53934 // Must be extending from vXi16.
53935 EVT InVT = N00.getValueType();
53936 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
53937 N10.getValueType() != InVT || N11.getValueType() != InVT)
53938 return SDValue();
53940 // All inputs should be build_vectors.
53941 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53942 N01.getOpcode() != ISD::BUILD_VECTOR ||
53943 N10.getOpcode() != ISD::BUILD_VECTOR ||
53944 N11.getOpcode() != ISD::BUILD_VECTOR)
53945 return SDValue();
53947 // For each element, we need to ensure we have an odd element from one vector
53948 // multiplied by the odd element of another vector and the even element from
53949 // one of the same vectors being multiplied by the even element from the
53950 // other vector. So we need to make sure for each element i, this operator
53951 // is being performed:
53952 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
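// For example (illustrative), a v4i32 result matches
//   <A[0]*B[0] + A[1]*B[1], A[2]*B[2] + A[3]*B[3], ...>
// which is exactly what PMADDWD computes from v8i16 inputs A and B.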
53953 SDValue In0, In1;
53954 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
53955 SDValue N00Elt = N00.getOperand(i);
53956 SDValue N01Elt = N01.getOperand(i);
53957 SDValue N10Elt = N10.getOperand(i);
53958 SDValue N11Elt = N11.getOperand(i);
53959 // TODO: Be more tolerant to undefs.
53960 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53961 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53962 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53963 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53964 return SDValue();
53965 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53966 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53967 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53968 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53969 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53970 return SDValue();
53971 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53972 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53973 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53974 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53975 // Add is commutative so indices can be reordered.
53976 if (IdxN00 > IdxN10) {
53977 std::swap(IdxN00, IdxN10);
53978 std::swap(IdxN01, IdxN11);
53980 // N0 indices must be the even element. N1 indices must be the next odd element.
53981 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53982 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53983 return SDValue();
53984 SDValue N00In = N00Elt.getOperand(0);
53985 SDValue N01In = N01Elt.getOperand(0);
53986 SDValue N10In = N10Elt.getOperand(0);
53987 SDValue N11In = N11Elt.getOperand(0);
53989 // First time we find an input capture it.
53990 if (!In0) {
53991 In0 = N00In;
53992 In1 = N01In;
53994 // The input vectors must be at least as wide as the output.
53996 // If they are larger than the output, we extract a subvector below.
53996 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
53997 In1.getValueSizeInBits() < VT.getSizeInBits())
53998 return SDValue();
54000 // Mul is commutative so the input vectors can be in any order.
54001 // Canonicalize to make the compares easier.
54002 if (In0 != N00In)
54003 std::swap(N00In, N01In);
54004 if (In0 != N10In)
54005 std::swap(N10In, N11In);
54006 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54007 return SDValue();
54010 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54011 ArrayRef<SDValue> Ops) {
54012 EVT OpVT = Ops[0].getValueType();
54013 assert(OpVT.getScalarType() == MVT::i16 &&
54014 "Unexpected scalar element type");
54015 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54016 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54017 OpVT.getVectorNumElements() / 2);
54018 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54021 // If the output is narrower than an input, extract the low part of the input
54022 // vector.
54023 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54024 VT.getVectorNumElements() * 2);
54025 if (OutVT16.bitsLT(In0.getValueType())) {
54026 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54027 DAG.getIntPtrConstant(0, DL));
54029 if (OutVT16.bitsLT(In1.getValueType())) {
54030 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54031 DAG.getIntPtrConstant(0, DL));
54033 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54034 PMADDBuilder);
54037 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54038 // If the upper element in each pair of both VPMADDWD operands is zero then we can merge
54039 // the operand elements and use the implicit add of VPMADDWD.
54040 // TODO: Add support for VPMADDUBSW (which isn't commutable).
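// For example (illustrative), with 128-bit operands whose odd i16 elements are
// known zero:
//   add (vpmaddwd X, Y), (vpmaddwd Z, W)
//     --> vpmaddwd (shuffle X, Z, <0,8,2,10,4,12,6,14>),
//                  (shuffle Y, W, <0,8,2,10,4,12,6,14>)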
54041 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54042 const SDLoc &DL, EVT VT) {
54043 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54044 return SDValue();
54046 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54047 if (VT.getSizeInBits() > 128)
54048 return SDValue();
54050 unsigned NumElts = VT.getVectorNumElements();
54051 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54052 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54053 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54055 bool Op0HiZero =
54056 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54057 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54058 bool Op1HiZero =
54059 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54060 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54062 // TODO: Check for zero lower elements once we have actual codegen that
54063 // creates them.
54064 if (!Op0HiZero || !Op1HiZero)
54065 return SDValue();
54067 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54068 SmallVector<int> Mask;
54069 for (int i = 0; i != (int)NumElts; ++i) {
54070 Mask.push_back(2 * i);
54071 Mask.push_back(2 * (i + NumElts));
54074 SDValue LHS =
54075 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54076 SDValue RHS =
54077 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54078 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54081 /// CMOV of constants requires materializing constant operands in registers.
54082 /// Try to fold those constants into an 'add' instruction to reduce instruction
54083 /// count. We do this with CMOV rather than the generic 'select' because there are
54084 /// earlier folds that may be used to turn select-of-constants into logic hacks.
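/// For example (illustrative, constants chosen arbitrarily):
///   add (cmov 0, 42), X  -->  cmov X, (add X, 42)
/// which removes the need to materialize the constants in registers first.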
54085 static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54086 const X86Subtarget &Subtarget) {
54087 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54088 // better because we eliminate 1-2 instructions. This transform is still
54089 // an improvement without zero operands because we trade 2 move constants and
54090 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54091 // immediate asm operands (fit in 32-bits).
54092 auto isSuitableCmov = [](SDValue V) {
54093 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54094 return false;
54095 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54096 !isa<ConstantSDNode>(V.getOperand(1)))
54097 return false;
54098 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54099 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54100 V.getConstantOperandAPInt(1).isSignedIntN(32));
54103 // Match an appropriate CMOV as the first operand of the add.
54104 SDValue Cmov = N->getOperand(0);
54105 SDValue OtherOp = N->getOperand(1);
54106 if (!isSuitableCmov(Cmov))
54107 std::swap(Cmov, OtherOp);
54108 if (!isSuitableCmov(Cmov))
54109 return SDValue();
54111 // Don't remove a load folding opportunity for the add. That would neutralize
54112 // any improvements from removing constant materializations.
54113 if (X86::mayFoldLoad(OtherOp, Subtarget))
54114 return SDValue();
54116 EVT VT = N->getValueType(0);
54117 SDLoc DL(N);
54118 SDValue FalseOp = Cmov.getOperand(0);
54119 SDValue TrueOp = Cmov.getOperand(1);
54121 // We will push the add through the select, but we can potentially do better
54122 // if we know there is another add in the sequence and this is pointer math.
54123 // In that case, we can absorb an add into the trailing memory op and avoid
54124 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54125 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54126 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54127 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54128 all_of(N->uses(), [&](SDNode *Use) {
54129 auto *MemNode = dyn_cast<MemSDNode>(Use);
54130 return MemNode && MemNode->getBasePtr().getNode() == N;
54131 })) {
54132 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
54133 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
54134 // it is possible that choosing op1 might be better.
54135 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
54136 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
54137 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
54138 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
54139 Cmov.getOperand(2), Cmov.getOperand(3));
54140 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
54143 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
54144 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
54145 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
54146 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
54147 Cmov.getOperand(3));
54150 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
54151 TargetLowering::DAGCombinerInfo &DCI,
54152 const X86Subtarget &Subtarget) {
54153 EVT VT = N->getValueType(0);
54154 SDValue Op0 = N->getOperand(0);
54155 SDValue Op1 = N->getOperand(1);
54156 SDLoc DL(N);
54158 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
54159 return Select;
54161 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
54162 return MAdd;
54163 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
54164 return MAdd;
54165 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
54166 return MAdd;
54168 // Try to synthesize horizontal adds from adds of shuffles.
54169 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54170 return V;
54172 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
54173 // (sub Y, (sext (vXi1 X))).
54174 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
54175 // generic DAG combine without a legal type check, but adding this there
54176 // caused regressions.
54177 if (VT.isVector()) {
54178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54179 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
54180 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54181 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
54182 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
54183 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
54186 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
54187 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54188 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
54189 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
54190 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
54194 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
54195 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
54196 X86::isZeroNode(Op0.getOperand(1))) {
54197 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
54198 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
54199 Op0.getOperand(0), Op0.getOperand(2));
54202 return combineAddOrSubToADCOrSBB(N, DAG);
54205 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
54206 // condition comes from the subtract node that produced -X. This matches the
54207 // cmov expansion for absolute value. By swapping the operands we convert abs
54208 // to nabs.
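// For example (illustrative):
//   sub Y, abs(X)  ==  add Y, nabs(X)
// so swapping the CMOV operands lets the sub become an add.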
54209 static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
54210 SDValue N0 = N->getOperand(0);
54211 SDValue N1 = N->getOperand(1);
54213 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
54214 return SDValue();
54216 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
54217 if (CC != X86::COND_S && CC != X86::COND_NS)
54218 return SDValue();
54220 // Condition should come from a negate operation.
54221 SDValue Cond = N1.getOperand(3);
54222 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
54223 return SDValue();
54224 assert(Cond.getResNo() == 1 && "Unexpected result number");
54226 // Get the X and -X from the negate.
54227 SDValue NegX = Cond.getValue(0);
54228 SDValue X = Cond.getOperand(1);
54230 SDValue FalseOp = N1.getOperand(0);
54231 SDValue TrueOp = N1.getOperand(1);
54233 // Cmov operands should be X and NegX. Order doesn't matter.
54234 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
54235 return SDValue();
54237 // Build a new CMOV with the operands swapped.
54238 SDLoc DL(N);
54239 MVT VT = N->getSimpleValueType(0);
54240 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
54241 N1.getOperand(2), Cond);
54242 // Convert sub to add.
54243 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
54246 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
54247 SDValue Op0 = N->getOperand(0);
54248 SDValue Op1 = N->getOperand(1);
54250 // (sub C (zero_extend (setcc)))
54251 // =>
54252 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
54253 // Don't disturb (sub 0 setcc), which is easily done with neg.
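// For example (illustrative):
//   sub 5, (zext (setcc cond))  -->  add (zext (setcc !cond)), 4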
54254 EVT VT = N->getValueType(0);
54255 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
54256 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
54257 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
54258 Op1.getOperand(0).hasOneUse()) {
54259 SDValue SetCC = Op1.getOperand(0);
54260 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
54261 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
54262 APInt NewImm = Op0C->getAPIntValue() - 1;
54263 SDLoc DL(Op1);
54264 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
54265 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
54266 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
54267 DAG.getConstant(NewImm, DL, VT));
54270 return SDValue();
54273 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
54274 TargetLowering::DAGCombinerInfo &DCI,
54275 const X86Subtarget &Subtarget) {
54276 SDValue Op0 = N->getOperand(0);
54277 SDValue Op1 = N->getOperand(1);
54279 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
54280 auto IsNonOpaqueConstant = [&](SDValue Op) {
54281 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
54282 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
54283 return !Cst->isOpaque();
54284 return true;
54286 return false;
54289 // X86 can't encode an immediate LHS of a sub. See if we can push the
54290 // negation into a preceding instruction. If the RHS of the sub is an XOR with
54291 // one use and a constant, invert the immediate, saving one register.
54292 // However, ignore cases where C1 is 0, as those will become a NEG.
54293 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
54294 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
54295 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
54296 Op1->hasOneUse()) {
54297 SDLoc DL(N);
54298 EVT VT = Op0.getValueType();
54299 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
54300 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
54301 SDValue NewAdd =
54302 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
54303 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
54306 if (SDValue V = combineSubABS(N, DAG))
54307 return V;
54309 // Try to synthesize horizontal subs from subs of shuffles.
54310 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54311 return V;
54313 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
54314 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
54315 X86::isZeroNode(Op1.getOperand(1))) {
54316 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
54317 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
54318 Op1.getOperand(0), Op1.getOperand(2));
54321 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
54322 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
54323 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
54324 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
54325 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
54326 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
54327 Op1.getOperand(1), Op1.getOperand(2));
54328 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
54329 Op1.getOperand(0));
54332 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
54333 return V;
54335 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
54336 return V;
54338 return combineSubSetcc(N, DAG);
54341 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
54342 const X86Subtarget &Subtarget) {
54343 MVT VT = N->getSimpleValueType(0);
54344 SDLoc DL(N);
54346 if (N->getOperand(0) == N->getOperand(1)) {
54347 if (N->getOpcode() == X86ISD::PCMPEQ)
54348 return DAG.getConstant(-1, DL, VT);
54349 if (N->getOpcode() == X86ISD::PCMPGT)
54350 return DAG.getConstant(0, DL, VT);
54353 return SDValue();
54356 /// Helper that combines an array of subvector ops as if they were the operands
54357 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
54358 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
54359 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
54360 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
54361 TargetLowering::DAGCombinerInfo &DCI,
54362 const X86Subtarget &Subtarget) {
54363 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
54364 unsigned EltSizeInBits = VT.getScalarSizeInBits();
54366 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
54367 return DAG.getUNDEF(VT);
54369 if (llvm::all_of(Ops, [](SDValue Op) {
54370 return ISD::isBuildVectorAllZeros(Op.getNode());
54372 return getZeroVector(VT, Subtarget, DAG, DL);
54374 SDValue Op0 = Ops[0];
54375 bool IsSplat = llvm::all_equal(Ops);
54376 unsigned NumOps = Ops.size();
54378 // Repeated subvectors.
54379 if (IsSplat &&
54380 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
54381 // If this broadcast is inserted into both halves, use a larger broadcast.
54382 if (Op0.getOpcode() == X86ISD::VBROADCAST)
54383 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
54385 // If this simple subvector or scalar/subvector broadcast_load is inserted
54386 // into both halves, use a larger broadcast_load. Update other uses to use
54387 // an extracted subvector.
54388 if (ISD::isNormalLoad(Op0.getNode()) ||
54389 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
54390 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
54391 auto *Mem = cast<MemSDNode>(Op0);
54392 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
54393 ? X86ISD::VBROADCAST_LOAD
54394 : X86ISD::SUBV_BROADCAST_LOAD;
54395 if (SDValue BcastLd =
54396 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
54397 SDValue BcastSrc =
54398 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
54399 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
54400 return BcastLd;
54404 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
54405 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
54406 (Subtarget.hasAVX2() ||
54407 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
54408 VT.getScalarType(), Subtarget)))
54409 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
54410 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
54411 Op0.getOperand(0),
54412 DAG.getIntPtrConstant(0, DL)));
54414 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
54415 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
54416 (Subtarget.hasAVX2() ||
54417 (EltSizeInBits >= 32 &&
54418 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
54419 Op0.getOperand(0).getValueType() == VT.getScalarType())
54420 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
54422 // concat_vectors(extract_subvector(broadcast(x)),
54423 // extract_subvector(broadcast(x))) -> broadcast(x)
54424 // concat_vectors(extract_subvector(subv_broadcast(x)),
54425 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
54426 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54427 Op0.getOperand(0).getValueType() == VT) {
54428 SDValue SrcVec = Op0.getOperand(0);
54429 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
54430 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
54431 return Op0.getOperand(0);
54432 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
54433 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
54434 return Op0.getOperand(0);
54438 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
54439 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
54440 // TODO: This should go in combineX86ShufflesRecursively eventually.
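// For example (illustrative), with A and B of type v8i32:
//   concat(extract_subvector(A, 4), extract_subvector(B, 4))
//     --> vperm2x128 A, B, 0x31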
54441 if (VT.is256BitVector() && NumOps == 2) {
54442 SDValue Src0 = peekThroughBitcasts(Ops[0]);
54443 SDValue Src1 = peekThroughBitcasts(Ops[1]);
54444 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54445 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
54446 EVT SrcVT0 = Src0.getOperand(0).getValueType();
54447 EVT SrcVT1 = Src1.getOperand(0).getValueType();
54448 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
54449 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
54450 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
54451 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
54452 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
54453 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
54454 DAG.getBitcast(VT, Src0.getOperand(0)),
54455 DAG.getBitcast(VT, Src1.getOperand(0)),
54456 DAG.getTargetConstant(0x31, DL, MVT::i8));
54461 // Repeated opcode.
54462 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
54463 // but it currently struggles with different vector widths.
54464 if (llvm::all_of(Ops, [Op0](SDValue Op) {
54465 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
54466 })) {
54467 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
54468 SmallVector<SDValue> Subs;
54469 for (SDValue SubOp : SubOps)
54470 Subs.push_back(SubOp.getOperand(I));
54471 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
54473 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
54474 bool AllConstants = true;
54475 bool AllSubVectors = true;
54476 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
54477 SDValue Sub = SubOps[I].getOperand(Op);
54478 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
54479 SDValue BC = peekThroughBitcasts(Sub);
54480 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
54481 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
54482 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54483 Sub.getOperand(0).getValueType() == VT &&
54484 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
54486 return AllConstants || AllSubVectors;
54489 switch (Op0.getOpcode()) {
54490 case X86ISD::VBROADCAST: {
54491 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
54492 return Op.getOperand(0).getValueType().is128BitVector();
54493 })) {
54494 if (VT == MVT::v4f64 || VT == MVT::v4i64)
54495 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
54496 ConcatSubOperand(VT, Ops, 0),
54497 ConcatSubOperand(VT, Ops, 0));
54498 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
54499 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
54500 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
54501 : X86ISD::PSHUFD,
54502 DL, VT, ConcatSubOperand(VT, Ops, 0),
54503 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
54505 break;
54507 case X86ISD::MOVDDUP:
54508 case X86ISD::MOVSHDUP:
54509 case X86ISD::MOVSLDUP: {
54510 if (!IsSplat)
54511 return DAG.getNode(Op0.getOpcode(), DL, VT,
54512 ConcatSubOperand(VT, Ops, 0));
54513 break;
54515 case X86ISD::SHUFP: {
54516 // Add SHUFPD support if/when necessary.
54517 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
54518 llvm::all_of(Ops, [Op0](SDValue Op) {
54519 return Op.getOperand(2) == Op0.getOperand(2);
54520 })) {
54521 return DAG.getNode(Op0.getOpcode(), DL, VT,
54522 ConcatSubOperand(VT, Ops, 0),
54523 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54525 break;
54527 case X86ISD::UNPCKH:
54528 case X86ISD::UNPCKL: {
54529 // Don't concatenate build_vector patterns.
54530 if (!IsSplat && EltSizeInBits >= 32 &&
54531 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54532 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54533 none_of(Ops, [](SDValue Op) {
54534 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
54535 ISD::SCALAR_TO_VECTOR ||
54536 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
54537 ISD::SCALAR_TO_VECTOR;
54538 })) {
54539 return DAG.getNode(Op0.getOpcode(), DL, VT,
54540 ConcatSubOperand(VT, Ops, 0),
54541 ConcatSubOperand(VT, Ops, 1));
54543 break;
54545 case X86ISD::PSHUFHW:
54546 case X86ISD::PSHUFLW:
54547 case X86ISD::PSHUFD:
54548 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
54549 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
54550 return DAG.getNode(Op0.getOpcode(), DL, VT,
54551 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54553 [[fallthrough]];
54554 case X86ISD::VPERMILPI:
54555 if (!IsSplat && EltSizeInBits == 32 &&
54556 (VT.is256BitVector() ||
54557 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54558 all_of(Ops, [&Op0](SDValue Op) {
54559 return Op0.getOperand(1) == Op.getOperand(1);
54560 })) {
54561 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
54562 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
54563 Res =
54564 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
54565 return DAG.getBitcast(VT, Res);
54567 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
54568 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
54569 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
54570 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
54571 return DAG.getNode(Op0.getOpcode(), DL, VT,
54572 ConcatSubOperand(VT, Ops, 0),
54573 DAG.getTargetConstant(Idx, DL, MVT::i8));
54575 break;
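// PSHUFB indexes bytes only within its own 128-bit lane and PSADBW sums
// within 64-bit chunks, so both fold to the wide opcode applied to the
// concatenated sources.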
54576 case X86ISD::PSHUFB:
54577 case X86ISD::PSADBW:
54578 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54579 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54580 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
54581 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
54582 NumOps * SrcVT.getVectorNumElements());
54583 return DAG.getNode(Op0.getOpcode(), DL, VT,
54584 ConcatSubOperand(SrcVT, Ops, 0),
54585 ConcatSubOperand(SrcVT, Ops, 1));
54587 break;
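// VPERMV concatenation merges the shuffle masks: indices from the second
// sub-op are rebased by the per-source element count before building one
// wide mask over concat(src0, src1).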
54588 case X86ISD::VPERMV:
54589 if (!IsSplat && NumOps == 2 &&
54590 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
54591 MVT OpVT = Op0.getSimpleValueType();
54592 int NumSrcElts = OpVT.getVectorNumElements();
54593 SmallVector<int, 64> ConcatMask;
54594 for (unsigned i = 0; i != NumOps; ++i) {
54595 SmallVector<int, 64> SubMask;
54596 SmallVector<SDValue, 2> SubOps;
54597 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
54598 SubMask))
54599 break;
54600 for (int M : SubMask) {
54601 if (0 <= M)
54602 M += i * NumSrcElts;
54603 ConcatMask.push_back(M);
54606 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
54607 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
54608 Ops[1].getOperand(1), DAG, DL);
54609 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
54610 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
54611 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
54612 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
54615 break;
54616 case X86ISD::VPERMV3:
54617 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
54618 MVT OpVT = Op0.getSimpleValueType();
54619 int NumSrcElts = OpVT.getVectorNumElements();
54620 SmallVector<int, 64> ConcatMask;
54621 for (unsigned i = 0; i != NumOps; ++i) {
54622 SmallVector<int, 64> SubMask;
54623 SmallVector<SDValue, 2> SubOps;
54624 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
54625 SubMask))
54626 break;
54627 for (int M : SubMask) {
54628 if (0 <= M) {
54629 M += M < NumSrcElts ? 0 : NumSrcElts;
54630 M += i * NumSrcElts;
54632 ConcatMask.push_back(M);
54635 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
54636 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
54637 Ops[1].getOperand(0), DAG, DL);
54638 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
54639 Ops[1].getOperand(2), DAG, DL);
54640 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
54641 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
54642 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
54643 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
54646 break;
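// Two VPERM2X128 nodes (without zeroed lanes) can merge into one 512-bit
// SHUF128: each original immediate contributes two 128-bit lane selectors to
// the combined four-lane mask.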
54647 case X86ISD::VPERM2X128: {
54648 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
54649 assert(NumOps == 2 && "Bad concat_vectors operands");
54650 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
54651 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
54652 // TODO: Handle zero'd subvectors.
54653 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
54654 int Mask[4] = {(int)(Imm0 & 0x3), (int)((Imm0 >> 4) & 0x3),
54655 (int)(Imm1 & 0x3), (int)((Imm1 >> 4) & 0x3)};
54656 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
54657 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
54658 Ops[0].getOperand(1), DAG, DL);
54659 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
54660 Ops[1].getOperand(1), DAG, DL);
54661 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
54662 DAG.getBitcast(ShuffleVT, LHS),
54663 DAG.getBitcast(ShuffleVT, RHS),
54664 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
54665 return DAG.getBitcast(VT, Res);
54668 break;
54670 case X86ISD::SHUF128: {
54671 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
54672 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
54673 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
54674 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
54675 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
54676 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
54677 Ops[0].getOperand(1), DAG, DL);
54678 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
54679 Ops[1].getOperand(1), DAG, DL);
54680 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
54681 DAG.getTargetConstant(Imm, DL, MVT::i8));
54683 break;
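// concat(trunc(x), trunc(y)) can become trunc(concat(x, y)) when the 512-bit
// source truncate (e.g. the AVX512 VPMOV* forms) is available and preferred.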
54685 case ISD::TRUNCATE:
54686 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
54687 EVT SrcVT = Ops[0].getOperand(0).getValueType();
54688 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
54689 SrcVT == Ops[1].getOperand(0).getValueType() &&
54690 Subtarget.useAVX512Regs() &&
54691 Subtarget.getPreferVectorWidth() >= 512 &&
54692 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
54693 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
54694 return DAG.getNode(ISD::TRUNCATE, DL, VT,
54695 ConcatSubOperand(NewSrcVT, Ops, 0));
54698 break;
54699 case X86ISD::VSHLI:
54700 case X86ISD::VSRLI:
54701 // Special case: SHL/SRL AVX1 V4i64 by 32 bits can lower as a shuffle.
54702 // TODO: Move this to LowerShiftByScalarImmediate?
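// e.g. for v4i64 << 32, viewed as v8i32 the even (low) dwords become zero and
// the odd (high) dwords take the old low dwords, hence the
// {8,0,8,2,8,4,8,6} shuffle with a zero vector below.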
54703 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
54704 llvm::all_of(Ops, [](SDValue Op) {
54705 return Op.getConstantOperandAPInt(1) == 32;
54706 })) {
54707 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
54708 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
54709 if (Op0.getOpcode() == X86ISD::VSHLI) {
54710 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
54711 {8, 0, 8, 2, 8, 4, 8, 6});
54712 } else {
54713 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
54714 {1, 8, 3, 8, 5, 8, 7, 8});
54716 return DAG.getBitcast(VT, Res);
54718 [[fallthrough]];
54719 case X86ISD::VSRAI:
54720 case X86ISD::VSHL:
54721 case X86ISD::VSRL:
54722 case X86ISD::VSRA:
54723 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
54724 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54725 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
54726 llvm::all_of(Ops, [Op0](SDValue Op) {
54727 return Op0.getOperand(1) == Op.getOperand(1);
54728 })) {
54729 return DAG.getNode(Op0.getOpcode(), DL, VT,
54730 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54732 break;
54733 case X86ISD::VPERMI:
54734 case X86ISD::VROTLI:
54735 case X86ISD::VROTRI:
54736 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54737 llvm::all_of(Ops, [Op0](SDValue Op) {
54738 return Op0.getOperand(1) == Op.getOperand(1);
54739 })) {
54740 return DAG.getNode(Op0.getOpcode(), DL, VT,
54741 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54743 break;
54744 case ISD::AND:
54745 case ISD::OR:
54746 case ISD::XOR:
54747 case X86ISD::ANDNP:
54748 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54749 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
54750 return DAG.getNode(Op0.getOpcode(), DL, VT,
54751 ConcatSubOperand(VT, Ops, 0),
54752 ConcatSubOperand(VT, Ops, 1));
54754 break;
54755 case X86ISD::PCMPEQ:
54756 case X86ISD::PCMPGT:
54757 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
54758 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
54759 return DAG.getNode(Op0.getOpcode(), DL, VT,
54760 ConcatSubOperand(VT, Ops, 0),
54761 ConcatSubOperand(VT, Ops, 1));
54763 break;
54764 case ISD::CTPOP:
54765 case ISD::CTTZ:
54766 case ISD::CTLZ:
54767 case ISD::CTTZ_ZERO_UNDEF:
54768 case ISD::CTLZ_ZERO_UNDEF:
54769 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54770 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54771 return DAG.getNode(Op0.getOpcode(), DL, VT,
54772 ConcatSubOperand(VT, Ops, 0));
54774 break;
54775 case X86ISD::GF2P8AFFINEQB:
54776 if (!IsSplat &&
54777 (VT.is256BitVector() ||
54778 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54779 llvm::all_of(Ops, [Op0](SDValue Op) {
54780 return Op0.getOperand(2) == Op.getOperand(2);
54781 })) {
54782 return DAG.getNode(Op0.getOpcode(), DL, VT,
54783 ConcatSubOperand(VT, Ops, 0),
54784 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54786 break;
54787 case ISD::ADD:
54788 case ISD::SUB:
54789 case ISD::MUL:
54790 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54791 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54792 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
54793 return DAG.getNode(Op0.getOpcode(), DL, VT,
54794 ConcatSubOperand(VT, Ops, 0),
54795 ConcatSubOperand(VT, Ops, 1));
54797 break;
54798 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
54799 // their latencies are short, we don't replace them here.
54800 case ISD::FDIV:
54801 if (!IsSplat && (VT.is256BitVector() ||
54802 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
54803 return DAG.getNode(Op0.getOpcode(), DL, VT,
54804 ConcatSubOperand(VT, Ops, 0),
54805 ConcatSubOperand(VT, Ops, 1));
54807 break;
54808 case X86ISD::HADD:
54809 case X86ISD::HSUB:
54810 case X86ISD::FHADD:
54811 case X86ISD::FHSUB:
54812 if (!IsSplat && VT.is256BitVector() &&
54813 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
54814 return DAG.getNode(Op0.getOpcode(), DL, VT,
54815 ConcatSubOperand(VT, Ops, 0),
54816 ConcatSubOperand(VT, Ops, 1));
54818 break;
54819 case X86ISD::PACKSS:
54820 case X86ISD::PACKUS:
54821 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54822 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54823 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
54824 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
54825 NumOps * SrcVT.getVectorNumElements());
54826 return DAG.getNode(Op0.getOpcode(), DL, VT,
54827 ConcatSubOperand(SrcVT, Ops, 0),
54828 ConcatSubOperand(SrcVT, Ops, 1));
54830 break;
54831 case X86ISD::PALIGNR:
54832 if (!IsSplat &&
54833 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54834 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
54835 llvm::all_of(Ops, [Op0](SDValue Op) {
54836 return Op0.getOperand(2) == Op.getOperand(2);
54837 })) {
54838 return DAG.getNode(Op0.getOpcode(), DL, VT,
54839 ConcatSubOperand(VT, Ops, 0),
54840 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54842 break;
54843 case ISD::VSELECT:
54844 if (!IsSplat && Subtarget.hasAVX512() &&
54845 (VT.is256BitVector() ||
54846 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54847 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
54848 EVT SelVT = Ops[0].getOperand(0).getValueType();
54849 if (SelVT.getVectorElementType() == MVT::i1) {
54850 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
54851 NumOps * SelVT.getVectorNumElements());
54852 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
54853 return DAG.getNode(Op0.getOpcode(), DL, VT,
54854 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
54855 ConcatSubOperand(VT, Ops, 1),
54856 ConcatSubOperand(VT, Ops, 2));
54859 [[fallthrough]];
54860 case X86ISD::BLENDV:
54861 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
54862 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
54863 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
54864 EVT SelVT = Ops[0].getOperand(0).getValueType();
54865 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
54866 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
54867 return DAG.getNode(Op0.getOpcode(), DL, VT,
54868 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
54869 ConcatSubOperand(VT, Ops, 1),
54870 ConcatSubOperand(VT, Ops, 2));
54872 break;
54876 // Fold subvector loads into one.
54877 // If needed, look through bitcasts to get to the load.
54878 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
54879 unsigned Fast;
54880 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
54881 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
54882 *FirstLd->getMemOperand(), &Fast) &&
54883 Fast) {
54884 if (SDValue Ld =
54885 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
54886 return Ld;
54890 // Attempt to fold target constant loads.
54891 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
54892 SmallVector<APInt> EltBits;
54893 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
54894 for (unsigned I = 0; I != NumOps; ++I) {
54895 APInt OpUndefElts;
54896 SmallVector<APInt> OpEltBits;
54897 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
54898 OpEltBits, true, false))
54899 break;
54900 EltBits.append(OpEltBits);
54901 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
54903 if (EltBits.size() == VT.getVectorNumElements())
54904 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
54907 // If we're splatting a 128-bit subvector to 512 bits, use SHUF128 directly.
54908 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
54909 Subtarget.useAVX512Regs()) {
54910 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
54911 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
54912 Res = DAG.getBitcast(ShuffleVT, Res);
54913 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
54914 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
54915 return DAG.getBitcast(VT, Res);
54918 return SDValue();
54921 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
54922 TargetLowering::DAGCombinerInfo &DCI,
54923 const X86Subtarget &Subtarget) {
54924 EVT VT = N->getValueType(0);
54925 EVT SrcVT = N->getOperand(0).getValueType();
54926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54927 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
54929 if (VT.getVectorElementType() == MVT::i1) {
54930 // Attempt to constant fold.
54931 unsigned SubSizeInBits = SrcVT.getSizeInBits();
54932 APInt Constant = APInt::getZero(VT.getSizeInBits());
54933 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
54934 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
54935 if (!C) break;
54936 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
54937 if (I == (E - 1)) {
54938 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
54939 if (TLI.isTypeLegal(IntVT))
54940 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
54944 // Don't do anything else for i1 vectors.
54945 return SDValue();
54948 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
54949 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
54950 DCI, Subtarget))
54951 return R;
54954 return SDValue();
54957 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
54958 TargetLowering::DAGCombinerInfo &DCI,
54959 const X86Subtarget &Subtarget) {
54960 if (DCI.isBeforeLegalizeOps())
54961 return SDValue();
54963 MVT OpVT = N->getSimpleValueType(0);
54965 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
54967 SDLoc dl(N);
54968 SDValue Vec = N->getOperand(0);
54969 SDValue SubVec = N->getOperand(1);
54971 uint64_t IdxVal = N->getConstantOperandVal(2);
54972 MVT SubVecVT = SubVec.getSimpleValueType();
54974 if (Vec.isUndef() && SubVec.isUndef())
54975 return DAG.getUNDEF(OpVT);
54977 // Inserting undefs/zeros into zeros/undefs is a zero vector.
54978 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
54979 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
54980 return getZeroVector(OpVT, Subtarget, DAG, dl);
54982 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
54983 // If we're inserting into a zero vector and then into a larger zero vector,
54984 // just insert into the larger zero vector directly.
54985 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
54986 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
54987 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
54988 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
54989 getZeroVector(OpVT, Subtarget, DAG, dl),
54990 SubVec.getOperand(1),
54991 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
54994 // If we're inserting into a zero vector and our input was extracted from an
54995 // insert into a zero vector of the same type, and the extraction is at
54996 // least as large as the original insertion, just insert the original
54997 // subvector into a zero vector.
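// e.g. insert_subvector(zeros, (extract_subvector (insert_subvector zeros, Y, 0), 0), 0)
// simplifies to insert_subvector(zeros, Y, 0) as long as Y is no wider than
// the extracted subvector.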
54998 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
54999 isNullConstant(SubVec.getOperand(1)) &&
55000 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55001 SDValue Ins = SubVec.getOperand(0);
55002 if (isNullConstant(Ins.getOperand(2)) &&
55003 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55004 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55005 SubVecVT.getFixedSizeInBits())
55006 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55007 getZeroVector(OpVT, Subtarget, DAG, dl),
55008 Ins.getOperand(1), N->getOperand(2));
55012 // Stop here if this is an i1 vector.
55013 if (IsI1Vector)
55014 return SDValue();
55016 // Eliminate an intermediate vector widening:
55017 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
55018 // insert_subvector X, Y, Idx
55019 // TODO: This is a more general version of a DAGCombiner fold, can we move it
55020 // there?
55021 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55022 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
55023 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
55024 SubVec.getOperand(1), N->getOperand(2));
55026 // If this is an insert of an extract, combine to a shuffle. Don't do this
55027 // if the insert or extract can be represented with a subregister operation.
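// e.g. with v8i32 operands, inserting (extract_subvector V2, 4) at index 0
// of V1 becomes shuffle(V1, V2, {12,13,14,15,4,5,6,7}).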
55028 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55029 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55030 (IdxVal != 0 ||
55031 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55032 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55033 if (ExtIdxVal != 0) {
55034 int VecNumElts = OpVT.getVectorNumElements();
55035 int SubVecNumElts = SubVecVT.getVectorNumElements();
55036 SmallVector<int, 64> Mask(VecNumElts);
55037 // First create an identity shuffle mask.
55038 for (int i = 0; i != VecNumElts; ++i)
55039 Mask[i] = i;
55040 // Now insert the extracted portion.
55041 for (int i = 0; i != SubVecNumElts; ++i)
55042 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55044 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55048 // Match concat_vector style patterns.
55049 SmallVector<SDValue, 2> SubVectorOps;
55050 if (collectConcatOps(N, SubVectorOps, DAG)) {
55051 if (SDValue Fold =
55052 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55053 return Fold;
55055 // If we're inserting all zeros into the upper half, change this to
55056 // a concat with zero. We will match this to a move
55057 // with implicit upper bit zeroing during isel.
55058 // We do this here because we don't want combineConcatVectorOps to
55059 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55060 if (SubVectorOps.size() == 2 &&
55061 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55062 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55063 getZeroVector(OpVT, Subtarget, DAG, dl),
55064 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
55066 // Attempt to recursively combine to a shuffle.
55067 if (all_of(SubVectorOps, [](SDValue SubOp) {
55068 return isTargetShuffle(SubOp.getOpcode());
55069 })) {
55070 SDValue Op(N, 0);
55071 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55072 return Res;
55076 // If this is a broadcast insert into an upper undef, use a larger broadcast.
55077 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
55078 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
55080 // If this is a broadcast load inserted into an upper undef, use a larger
55081 // broadcast load.
55082 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
55083 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
55084 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
55085 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
55086 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
55087 SDValue BcastLd =
55088 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
55089 MemIntr->getMemoryVT(),
55090 MemIntr->getMemOperand());
55091 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
55092 return BcastLd;
55095 // If we're splatting the lower half subvector of a full vector load into the
55096 // upper half, attempt to create a subvector broadcast.
55097 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
55098 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
55099 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
55100 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
55101 if (VecLd && SubLd &&
55102 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
55103 SubVec.getValueSizeInBits() / 8, 0))
55104 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
55105 SubLd, 0, DAG);
55108 return SDValue();
55111 /// If we are extracting a subvector of a vector select and the select condition
55112 /// is composed of concatenated vectors, try to narrow the select width. This
55113 /// is a common pattern for AVX1 integer code because 256-bit selects may be
55114 /// legal, but there is almost no integer math/logic available for 256-bit vectors.
55115 /// This function should only be called with legal types (otherwise, the calls
55116 /// to get simple value types will assert).
55117 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
55118 SDValue Sel = Ext->getOperand(0);
55119 if (Sel.getOpcode() != ISD::VSELECT ||
55120 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
55121 return SDValue();
55123 // Note: We assume simple value types because this should only be called with
55124 // legal operations/types.
55125 // TODO: This can be extended to handle extraction to 256-bits.
55126 MVT VT = Ext->getSimpleValueType(0);
55127 if (!VT.is128BitVector())
55128 return SDValue();
55130 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
55131 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
55132 return SDValue();
55134 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
55135 MVT SelVT = Sel.getSimpleValueType();
55136 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
55137 "Unexpected vector type with legal operations");
55139 unsigned SelElts = SelVT.getVectorNumElements();
55140 unsigned CastedElts = WideVT.getVectorNumElements();
55141 unsigned ExtIdx = Ext->getConstantOperandVal(1);
55142 if (SelElts % CastedElts == 0) {
55143 // The select has the same or more (narrower) elements than the extract
55144 // operand. The extraction index gets scaled by that factor.
55145 ExtIdx *= (SelElts / CastedElts);
55146 } else if (CastedElts % SelElts == 0) {
55147 // The select has less (wider) elements than the extract operand. Make sure
55148 // that the extraction index can be divided evenly.
55149 unsigned IndexDivisor = CastedElts / SelElts;
55150 if (ExtIdx % IndexDivisor != 0)
55151 return SDValue();
55152 ExtIdx /= IndexDivisor;
55153 } else {
55154 llvm_unreachable("Element count of simple vector types are not divisible?");
55157 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
55158 unsigned NarrowElts = SelElts / NarrowingFactor;
55159 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
55160 SDLoc DL(Ext);
55161 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
55162 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
55163 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
55164 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
55165 return DAG.getBitcast(VT, NarrowSel);
55168 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55169 TargetLowering::DAGCombinerInfo &DCI,
55170 const X86Subtarget &Subtarget) {
55171 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
55172 // eventually get combined/lowered into ANDNP) with a concatenated operand,
55173 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
55174 // We let generic combining take over from there to simplify the
55175 // insert/extract and 'not'.
55176 // This pattern emerges during AVX1 legalization. We handle it before lowering
55177 // to avoid complications like splitting constant vector loads.
55179 // Capture the original wide type in the likely case that we need to bitcast
55180 // back to this type.
55181 if (!N->getValueType(0).isSimple())
55182 return SDValue();
55184 MVT VT = N->getSimpleValueType(0);
55185 SDValue InVec = N->getOperand(0);
55186 unsigned IdxVal = N->getConstantOperandVal(1);
55187 SDValue InVecBC = peekThroughBitcasts(InVec);
55188 EVT InVecVT = InVec.getValueType();
55189 unsigned SizeInBits = VT.getSizeInBits();
55190 unsigned InSizeInBits = InVecVT.getSizeInBits();
55191 unsigned NumSubElts = VT.getVectorNumElements();
55192 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55194 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
55195 TLI.isTypeLegal(InVecVT) &&
55196 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
55197 auto isConcatenatedNot = [](SDValue V) {
55198 V = peekThroughBitcasts(V);
55199 if (!isBitwiseNot(V))
55200 return false;
55201 SDValue NotOp = V->getOperand(0);
55202 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
55204 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
55205 isConcatenatedNot(InVecBC.getOperand(1))) {
55206 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
55207 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
55208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
55209 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
55213 if (DCI.isBeforeLegalizeOps())
55214 return SDValue();
55216 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
55217 return V;
55219 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
55220 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55222 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
55223 if (VT.getScalarType() == MVT::i1)
55224 return DAG.getConstant(1, SDLoc(N), VT);
55225 return getOnesVector(VT, DAG, SDLoc(N));
55228 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
55229 return DAG.getBuildVector(VT, SDLoc(N),
55230 InVec->ops().slice(IdxVal, NumSubElts));
55232 // If we are extracting from an insert into a larger vector, replace with a
55233 // smaller insert if we don't access less than the original subvector. Don't
55234 // do this for i1 vectors.
55235 // TODO: Relax the matching indices requirement?
55236 if (VT.getVectorElementType() != MVT::i1 &&
55237 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
55238 IdxVal == InVec.getConstantOperandVal(2) &&
55239 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
55240 SDLoc DL(N);
55241 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
55242 InVec.getOperand(0), N->getOperand(1));
55243 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
55244 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
55245 InVec.getOperand(1),
55246 DAG.getVectorIdxConstant(NewIdxVal, DL));
55249 // If we're extracting an upper subvector from a broadcast, we should just
55250 // extract the lowest subvector instead, which should allow
55251 // SimplifyDemandedVectorElts to do more simplifications.
55252 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
55253 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55254 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
55255 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55257 // If we're extracting a broadcasted subvector, just use the lowest subvector.
55258 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55259 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
55260 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55262 // Attempt to extract from the source of a shuffle vector.
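// e.g. if the input is a shuffle whose (scaled) mask moves whole subvectors,
// extracting subvector N reduces to undef, zero, or a direct extract of the
// matching subvector from one of the shuffle's sources.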
55263 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
55264 SmallVector<int, 32> ShuffleMask;
55265 SmallVector<int, 32> ScaledMask;
55266 SmallVector<SDValue, 2> ShuffleInputs;
55267 unsigned NumSubVecs = InSizeInBits / SizeInBits;
55268 // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
55269 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
55270 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
55271 unsigned SubVecIdx = IdxVal / NumSubElts;
55272 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
55273 return DAG.getUNDEF(VT);
55274 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
55275 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55276 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
55277 if (Src.getValueSizeInBits() == InSizeInBits) {
55278 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
55279 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
55280 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
55281 SDLoc(N), SizeInBits);
55286 // If we're extracting the lowest subvector and we're the only user of the
55287 // source, we may be able to perform this with a smaller vector width.
55288 unsigned InOpcode = InVec.getOpcode();
55289 if (InVec.hasOneUse()) {
55290 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
55291 // v2f64 CVTDQ2PD(v4i32).
55292 if (InOpcode == ISD::SINT_TO_FP &&
55293 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55294 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
55296 // v2f64 CVTUDQ2PD(v4i32).
55297 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
55298 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55299 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
55301 // v2f64 CVTPS2PD(v4f32).
55302 if (InOpcode == ISD::FP_EXTEND &&
55303 InVec.getOperand(0).getValueType() == MVT::v4f32) {
55304 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
55307 if (IdxVal == 0 &&
55308 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
55309 (SizeInBits == 128 || SizeInBits == 256) &&
55310 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
55311 SDLoc DL(N);
55312 SDValue Ext = InVec.getOperand(0);
55313 if (Ext.getValueSizeInBits() > SizeInBits)
55314 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
55315 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
55316 return DAG.getNode(ExtOp, DL, VT, Ext);
55318 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
55319 InVec.getOperand(0).getValueType().is256BitVector() &&
55320 InVec.getOperand(1).getValueType().is256BitVector() &&
55321 InVec.getOperand(2).getValueType().is256BitVector()) {
55322 SDLoc DL(N);
55323 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
55324 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
55325 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
55326 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
55328 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
55329 (VT.is128BitVector() || VT.is256BitVector())) {
55330 SDLoc DL(N);
55331 SDValue InVecSrc = InVec.getOperand(0);
55332 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
55333 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
55334 return DAG.getNode(InOpcode, DL, VT, Ext);
55336 if (InOpcode == X86ISD::MOVDDUP &&
55337 (VT.is128BitVector() || VT.is256BitVector())) {
55338 SDLoc DL(N);
55339 SDValue Ext0 =
55340 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55341 return DAG.getNode(InOpcode, DL, VT, Ext0);
55345 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
55346 // as this is very likely to fold into a shuffle/truncation.
55347 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
55348 InVecVT.getScalarSizeInBits() == 64 &&
55349 InVec.getConstantOperandAPInt(1) == 32) {
55350 SDLoc DL(N);
55351 SDValue Ext =
55352 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55353 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
55356 return SDValue();
55359 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
55360 EVT VT = N->getValueType(0);
55361 SDValue Src = N->getOperand(0);
55362 SDLoc DL(N);
55364 // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
55365 // This occurs frequently in our masked scalar intrinsic code and our
55366 // floating point select lowering with AVX512.
55367 // TODO: SimplifyDemandedBits instead?
55368 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
55369 isOneConstant(Src.getOperand(1)))
55370 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
55372 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
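// e.g. (v1i1 (scalar_to_vector (extract_vector_elt vXi1 Mask, 0))) becomes
// (v1i1 (extract_subvector Mask, 0)).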
55373 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55374 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
55375 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55376 isNullConstant(Src.getOperand(1)))
55377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
55378 Src.getOperand(1));
55380 // Reduce v2i64 to v4i32 if we don't need the upper bits, or they are known to be zero.
55381 // TODO: Move to DAGCombine/SimplifyDemandedBits?
55382 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
55383 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
55384 if (Op.getValueType() != MVT::i64)
55385 return SDValue();
55386 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
55387 if (Op.getOpcode() == Opc &&
55388 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
55389 return Op.getOperand(0);
55390 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
55391 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
55392 if (Ld->getExtensionType() == Ext &&
55393 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
55394 return Op;
55395 if (IsZeroExt) {
55396 KnownBits Known = DAG.computeKnownBits(Op);
55397 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
55398 return Op;
55400 return SDValue();
55403 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
55404 return DAG.getBitcast(
55405 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55406 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
55408 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
55409 return DAG.getBitcast(
55410 VT,
55411 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
55412 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55413 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
55416 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
55417 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
55418 Src.getOperand(0).getValueType() == MVT::x86mmx)
55419 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
55421 // See if we're broadcasting the scalar value, in which case just reuse that
55422 // broadcast. Make sure it uses this very SDValue, not another node result.
55423 if (VT.getScalarType() == Src.getValueType())
55424 for (SDNode *User : Src->uses())
55425 if (User->getOpcode() == X86ISD::VBROADCAST &&
55426 Src == User->getOperand(0)) {
55427 unsigned SizeInBits = VT.getFixedSizeInBits();
55428 unsigned BroadcastSizeInBits =
55429 User->getValueSizeInBits(0).getFixedValue();
55430 if (BroadcastSizeInBits == SizeInBits)
55431 return SDValue(User, 0);
55432 if (BroadcastSizeInBits > SizeInBits)
55433 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
55434 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
55435 // coverage.
55438 return SDValue();
55441 // Simplify PMULDQ and PMULUDQ operations.
55442 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
55443 TargetLowering::DAGCombinerInfo &DCI,
55444 const X86Subtarget &Subtarget) {
55445 SDValue LHS = N->getOperand(0);
55446 SDValue RHS = N->getOperand(1);
55448 // Canonicalize constant to RHS.
55449 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
55450 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
55451 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
55453 // Multiply by zero.
55454 // Don't return RHS as it may contain UNDEFs.
55455 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
55456 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
55458 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
55459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55460 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
55461 return SDValue(N, 0);
55463 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
55464 // convert it to any_extend_invec, due to the LegalOperations check, do the
55465 // conversion directly to a vector shuffle manually. This exposes combine
55466 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
55467 // combineX86ShufflesRecursively on SSE4.1 targets.
55468 // FIXME: This is basically a hack around several other issues related to
55469 // ANY_EXTEND_VECTOR_INREG.
55470 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
55471 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55472 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55473 LHS.getOperand(0).getValueType() == MVT::v4i32) {
55474 SDLoc dl(N);
55475 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
55476 LHS.getOperand(0), { 0, -1, 1, -1 });
55477 LHS = DAG.getBitcast(MVT::v2i64, LHS);
55478 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55480 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
55481 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55482 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55483 RHS.getOperand(0).getValueType() == MVT::v4i32) {
55484 SDLoc dl(N);
55485 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
55486 RHS.getOperand(0), { 0, -1, 1, -1 });
55487 RHS = DAG.getBitcast(MVT::v2i64, RHS);
55488 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55491 return SDValue();
55494 // Simplify VPMADDUBSW/VPMADDWD operations.
55495 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
55496 TargetLowering::DAGCombinerInfo &DCI) {
55497 EVT VT = N->getValueType(0);
55498 SDValue LHS = N->getOperand(0);
55499 SDValue RHS = N->getOperand(1);
55501 // Multiply by zero.
55502 // Don't return LHS/RHS as it may contain UNDEFs.
55503 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
55504 ISD::isBuildVectorAllZeros(RHS.getNode()))
55505 return DAG.getConstant(0, SDLoc(N), VT);
55507 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55508 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55509 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55510 return SDValue(N, 0);
55512 return SDValue();
55515 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
55516 TargetLowering::DAGCombinerInfo &DCI,
55517 const X86Subtarget &Subtarget) {
55518 EVT VT = N->getValueType(0);
55519 SDValue In = N->getOperand(0);
55520 unsigned Opcode = N->getOpcode();
55521 unsigned InOpcode = In.getOpcode();
55522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55523 SDLoc DL(N);
55525 // Try to merge vector loads and extend_inreg to an extload.
55526 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
55527 In.hasOneUse()) {
55528 auto *Ld = cast<LoadSDNode>(In);
55529 if (Ld->isSimple()) {
55530 MVT SVT = In.getSimpleValueType().getVectorElementType();
55531 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
55532 ? ISD::SEXTLOAD
55533 : ISD::ZEXTLOAD;
55534 EVT MemVT = VT.changeVectorElementType(SVT);
55535 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
55536 SDValue Load = DAG.getExtLoad(
55537 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
55538 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
55539 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
55540 return Load;
55545 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
55546 if (Opcode == InOpcode)
55547 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
55549 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
55550 // -> EXTEND_VECTOR_INREG(X).
55551 // TODO: Handle non-zero subvector indices.
55552 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
55553 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
55554 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
55555 In.getValueSizeInBits())
55556 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
55558 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
55559 // TODO: Move to DAGCombine?
55560 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
55561 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
55562 In.getValueSizeInBits() == VT.getSizeInBits()) {
55563 unsigned NumElts = VT.getVectorNumElements();
55564 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
55565 EVT EltVT = In.getOperand(0).getValueType();
55566 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
55567 for (unsigned I = 0; I != NumElts; ++I)
55568 Elts[I * Scale] = In.getOperand(I);
55569 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
55572 // Attempt to combine as a shuffle on SSE41+ targets.
55573 if (Subtarget.hasSSE41()) {
55574 SDValue Op(N, 0);
55575 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
55576 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55577 return Res;
55580 return SDValue();
55583 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
55584 TargetLowering::DAGCombinerInfo &DCI) {
55585 EVT VT = N->getValueType(0);
55587 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
55588 return DAG.getConstant(0, SDLoc(N), VT);
55590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55591 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55592 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55593 return SDValue(N, 0);
55595 return SDValue();
55598 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
55599 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
55600 // produce extra instructions between the conversions due to going to scalar and back.
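// Roughly: f32 -> scalar_to_vector v4f32 -> CVTPS2PH (imm 4) -> CVTPH2PS ->
// extract element 0, keeping the value in vector registers throughout.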
55601 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
55602 const X86Subtarget &Subtarget) {
55603 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
55604 return SDValue();
55606 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
55607 return SDValue();
55609 if (N->getValueType(0) != MVT::f32 ||
55610 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
55611 return SDValue();
55613 SDLoc dl(N);
55614 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
55615 N->getOperand(0).getOperand(0));
55616 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
55617 DAG.getTargetConstant(4, dl, MVT::i32));
55618 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
55619 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
55620 DAG.getIntPtrConstant(0, dl));
55623 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
55624 const X86Subtarget &Subtarget) {
55625 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
55626 return SDValue();
55628 if (Subtarget.hasFP16())
55629 return SDValue();
55631 bool IsStrict = N->isStrictFPOpcode();
55632 EVT VT = N->getValueType(0);
55633 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55634 EVT SrcVT = Src.getValueType();
55636 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
55637 return SDValue();
55639 if (VT.getVectorElementType() != MVT::f32 &&
55640 VT.getVectorElementType() != MVT::f64)
55641 return SDValue();
55643 unsigned NumElts = VT.getVectorNumElements();
55644 if (NumElts == 1 || !isPowerOf2_32(NumElts))
55645 return SDValue();
55647 SDLoc dl(N);
55649 // Convert the input to vXi16.
55650 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
55651 Src = DAG.getBitcast(IntVT, Src);
55653 // Widen to at least 8 input elements.
55654 if (NumElts < 8) {
55655 unsigned NumConcats = 8 / NumElts;
55656 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
55657 : DAG.getConstant(0, dl, IntVT);
55658 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
55659 Ops[0] = Src;
55660 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
55663 // Destination is vXf32 with at least 4 elements.
55664 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
55665 std::max(4U, NumElts));
55666 SDValue Cvt, Chain;
55667 if (IsStrict) {
55668 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
55669 {N->getOperand(0), Src});
55670 Chain = Cvt.getValue(1);
55671 } else {
55672 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
55675 if (NumElts < 4) {
55676 assert(NumElts == 2 && "Unexpected size");
55677 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
55678 DAG.getIntPtrConstant(0, dl));
55681 if (IsStrict) {
55682 // Extend to the original VT if necessary.
55683 if (Cvt.getValueType() != VT) {
55684 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
55685 {Chain, Cvt});
55686 Chain = Cvt.getValue(1);
55688 return DAG.getMergeValues({Cvt, Chain}, dl);
55691 // Extend to the original VT if necessary.
55692 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
55695 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
55696 // from. Limit this to cases where the loads have the same input chain and the
55697 // output chains are unused. This avoids any memory ordering issues.
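// e.g. if another broadcast of the same kind, pointer and chain produces a
// wider result, extract our narrower value from it and reuse its chain.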
55698 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
55699 TargetLowering::DAGCombinerInfo &DCI) {
55700 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
55701 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
55702 "Unknown broadcast load type");
55704 // Only do this if the chain result is unused.
55705 if (N->hasAnyUseOfValue(1))
55706 return SDValue();
55708 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
55710 SDValue Ptr = MemIntrin->getBasePtr();
55711 SDValue Chain = MemIntrin->getChain();
55712 EVT VT = N->getSimpleValueType(0);
55713 EVT MemVT = MemIntrin->getMemoryVT();
55715 // Look at other users of our base pointer and try to find a wider broadcast.
55716 // The input chain and the size of the memory VT must match.
55717 for (SDNode *User : Ptr->uses())
55718 if (User != N && User->getOpcode() == N->getOpcode() &&
55719 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
55720 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
55721 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
55722 MemVT.getSizeInBits() &&
55723 !User->hasAnyUseOfValue(1) &&
55724 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
55725 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
55726 VT.getSizeInBits());
55727 Extract = DAG.getBitcast(VT, Extract);
55728 return DCI.CombineTo(N, Extract, SDValue(User, 1));
55731 return SDValue();
55734 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
55735 const X86Subtarget &Subtarget) {
55736 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
55737 return SDValue();
55739 bool IsStrict = N->isStrictFPOpcode();
55740 EVT VT = N->getValueType(0);
55741 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55742 EVT SrcVT = Src.getValueType();
55744 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
55745 SrcVT.getVectorElementType() != MVT::f32)
55746 return SDValue();
55748 SDLoc dl(N);
55750 SDValue Cvt, Chain;
55751 unsigned NumElts = VT.getVectorNumElements();
55752 if (Subtarget.hasFP16()) {
55753 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
55754 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
55755 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
55756 SDValue Cvt0, Cvt1;
55757 SDValue Op0 = Src.getOperand(0);
55758 SDValue Op1 = Src.getOperand(1);
55759 bool IsOp0Strict = Op0->isStrictFPOpcode();
55760 if (Op0.getOpcode() != Op1.getOpcode() ||
55761 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
55762 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
55763 return SDValue();
55765 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
55766 if (IsStrict) {
55767 assert(IsOp0Strict && "Op0 must be strict node");
55768 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
55769 ? X86ISD::STRICT_CVTSI2P
55770 : X86ISD::STRICT_CVTUI2P;
55771 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
55772 {Op0.getOperand(0), Op0.getOperand(1)});
55773 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
55774 {Op1.getOperand(0), Op1.getOperand(1)});
55775 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
55776 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
55778 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
55779 : X86ISD::CVTUI2P;
55780 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
55781 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
55782 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
55784 return SDValue();
55787 if (NumElts == 1 || !isPowerOf2_32(NumElts))
55788 return SDValue();
55790 // Widen to at least 4 input elements.
55791 if (NumElts < 4)
55792 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
55793 DAG.getConstantFP(0.0, dl, SrcVT));
55795 // Destination is v8i16 with at least 8 elements.
55796 EVT CvtVT =
55797 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
55798 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
55799 if (IsStrict) {
55800 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
55801 {N->getOperand(0), Src, Rnd});
55802 Chain = Cvt.getValue(1);
55803 } else {
55804 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
55807 // Extract down to real number of elements.
55808 if (NumElts < 8) {
55809 EVT IntVT = VT.changeVectorElementTypeToInteger();
55810 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
55811 DAG.getIntPtrConstant(0, dl));
55814 Cvt = DAG.getBitcast(VT, Cvt);
55816 if (IsStrict)
55817 return DAG.getMergeValues({Cvt, Chain}, dl);
55819 return Cvt;
55822 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
55823 SDValue Src = N->getOperand(0);
55825 // Turn MOVDQ2Q+simple_load into an mmx load.
55826 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55827 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
55829 if (LN->isSimple()) {
55830 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
55831 LN->getBasePtr(),
55832 LN->getPointerInfo(),
55833 LN->getOriginalAlign(),
55834 LN->getMemOperand()->getFlags());
55835 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
55836 return NewLd;
55840 return SDValue();
55843 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
55844 TargetLowering::DAGCombinerInfo &DCI) {
55845 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
55846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55847 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
55848 return SDValue(N, 0);
55850 return SDValue();
55853 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
55854 DAGCombinerInfo &DCI) const {
55855 SelectionDAG &DAG = DCI.DAG;
55856 switch (N->getOpcode()) {
55857 default: break;
55858 case ISD::SCALAR_TO_VECTOR:
55859 return combineScalarToVector(N, DAG);
55860 case ISD::EXTRACT_VECTOR_ELT:
55861 case X86ISD::PEXTRW:
55862 case X86ISD::PEXTRB:
55863 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
55864 case ISD::CONCAT_VECTORS:
55865 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
55866 case ISD::INSERT_SUBVECTOR:
55867 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
55868 case ISD::EXTRACT_SUBVECTOR:
55869 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
55870 case ISD::VSELECT:
55871 case ISD::SELECT:
55872 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
55873 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
55874 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
55875 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
55876 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
55877 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
55878 case X86ISD::ADD:
55879 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
55880 case X86ISD::SBB: return combineSBB(N, DAG);
55881 case X86ISD::ADC: return combineADC(N, DAG, DCI);
55882 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
55883 case ISD::SHL: return combineShiftLeft(N, DAG);
55884 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
55885 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
55886 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
55887 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
55888 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
55889 case X86ISD::BEXTR:
55890 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
55891 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
55892 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
55893 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
55894 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
55895 case X86ISD::VEXTRACT_STORE:
55896 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
55897 case ISD::SINT_TO_FP:
55898 case ISD::STRICT_SINT_TO_FP:
55899 return combineSIntToFP(N, DAG, DCI, Subtarget);
55900 case ISD::UINT_TO_FP:
55901 case ISD::STRICT_UINT_TO_FP:
55902 return combineUIntToFP(N, DAG, Subtarget);
55903 case ISD::FADD:
55904 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
55905 case X86ISD::VFCMULC:
55906 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
55907 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
55908 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
55909 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
55910 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
55911 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
55912 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
55913 case X86ISD::FXOR:
55914 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
55915 case X86ISD::FMIN:
55916 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
55917 case ISD::FMINNUM:
55918 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
55919 case X86ISD::CVTSI2P:
55920 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
55921 case X86ISD::CVTP2SI:
55922 case X86ISD::CVTP2UI:
55923 case X86ISD::STRICT_CVTTP2SI:
55924 case X86ISD::CVTTP2SI:
55925 case X86ISD::STRICT_CVTTP2UI:
55926 case X86ISD::CVTTP2UI:
55927 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
55928 case X86ISD::STRICT_CVTPH2PS:
55929 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
55930 case X86ISD::BT: return combineBT(N, DAG, DCI);
55931 case ISD::ANY_EXTEND:
55932 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
55933 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
55934 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
55935 case ISD::ANY_EXTEND_VECTOR_INREG:
55936 case ISD::SIGN_EXTEND_VECTOR_INREG:
55937 case ISD::ZERO_EXTEND_VECTOR_INREG:
55938 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
55939 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
55940 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
55941 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
55942 case X86ISD::PACKSS:
55943 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
55944 case X86ISD::HADD:
55945 case X86ISD::HSUB:
55946 case X86ISD::FHADD:
55947 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
55948 case X86ISD::VSHL:
55949 case X86ISD::VSRA:
55950 case X86ISD::VSRL:
55951 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
55952 case X86ISD::VSHLI:
55953 case X86ISD::VSRAI:
55954 case X86ISD::VSRLI:
55955 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
55956 case ISD::INSERT_VECTOR_ELT:
55957 case X86ISD::PINSRB:
55958 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
55959 case X86ISD::SHUFP: // Handle all target specific shuffles
55960 case X86ISD::INSERTPS:
55961 case X86ISD::EXTRQI:
55962 case X86ISD::INSERTQI:
55963 case X86ISD::VALIGN:
55964 case X86ISD::PALIGNR:
55965 case X86ISD::VSHLDQ:
55966 case X86ISD::VSRLDQ:
55967 case X86ISD::BLENDI:
55968 case X86ISD::UNPCKH:
55969 case X86ISD::UNPCKL:
55970 case X86ISD::MOVHLPS:
55971 case X86ISD::MOVLHPS:
55972 case X86ISD::PSHUFB:
55973 case X86ISD::PSHUFD:
55974 case X86ISD::PSHUFHW:
55975 case X86ISD::PSHUFLW:
55976 case X86ISD::MOVSHDUP:
55977 case X86ISD::MOVSLDUP:
55978 case X86ISD::MOVDDUP:
55979 case X86ISD::MOVSS:
55980 case X86ISD::MOVSD:
55981 case X86ISD::MOVSH:
55982 case X86ISD::VBROADCAST:
55983 case X86ISD::VPPERM:
55984 case X86ISD::VPERMI:
55985 case X86ISD::VPERMV:
55986 case X86ISD::VPERMV3:
55987 case X86ISD::VPERMIL2:
55988 case X86ISD::VPERMILPI:
55989 case X86ISD::VPERMILPV:
55990 case X86ISD::VPERM2X128:
55991 case X86ISD::SHUF128:
55992 case X86ISD::VZEXT_MOVL:
55993 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
55994 case X86ISD::FMADD_RND:
55995 case X86ISD::FMSUB:
55996 case X86ISD::STRICT_FMSUB:
55997 case X86ISD::FMSUB_RND:
55998 case X86ISD::FNMADD:
55999 case X86ISD::STRICT_FNMADD:
56000 case X86ISD::FNMADD_RND:
56001 case X86ISD::FNMSUB:
56002 case X86ISD::STRICT_FNMSUB:
56003 case X86ISD::FNMSUB_RND:
56004 case ISD::FMA:
56005 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56006 case X86ISD::FMADDSUB_RND:
56007 case X86ISD::FMSUBADD_RND:
56008 case X86ISD::FMADDSUB:
56009 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56010 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56011 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
56012 case X86ISD::MGATHER:
56013 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
56014 case ISD::MGATHER:
56015 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56016 case X86ISD::PCMPEQ:
56017 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
56018 case X86ISD::PMULDQ:
56019 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
56020 case X86ISD::VPMADDUBSW:
56021 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
56022 case X86ISD::KSHIFTL:
56023 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
56024 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
56025 case ISD::STRICT_FP_EXTEND:
56026 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
56027 case ISD::STRICT_FP_ROUND:
56028 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
56029 case X86ISD::VBROADCAST_LOAD:
56030 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
56031 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
56032 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
56035 return SDValue();
56038 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
56039 return false;
56042 // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
56043 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
56044 EVT ExtVT) const {
56045 return Subtarget.hasAVX512() || !VT.isVector();
56048 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
56049 if (!isTypeLegal(VT))
56050 return false;
56052 // There are no vXi8 shifts.
56053 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
56054 return false;
56056 // TODO: Almost no 8-bit ops are desirable because they have no actual
56057 // size/speed advantages vs. 32-bit ops, but they do have a major
56058 // potential disadvantage by causing partial register stalls.
56060 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
56061 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
56062 // Also, see the comment in "IsDesirableToPromoteOp", where we additionally
56063 // check for a constant operand to the multiply.
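// For illustration, a 32-bit multiply by a small constant is typically
// re-materialized with LEA rather than IMUL, e.g. (roughly):
//   x * 5  -->  leal (%rax,%rax,4), %eax
// so keeping such operations at 32 bits costs nothing.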
56064 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
56065 return false;
56067 // i16 instruction encodings are longer and some i16 instructions are slow,
56068 // so those are not desirable.
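// For instance (illustrative encodings), 16-bit ALU forms carry the 0x66
// operand-size prefix in 32/64-bit code:
//   addw $1, %ax    ; 66 83 C0 01
//   addl $1, %eax   ;    83 C0 01
// so the 16-bit form is a byte longer for the same work.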
56069 if (VT == MVT::i16) {
56070 switch (Opc) {
56071 default:
56072 break;
56073 case ISD::LOAD:
56074 case ISD::SIGN_EXTEND:
56075 case ISD::ZERO_EXTEND:
56076 case ISD::ANY_EXTEND:
56077 case ISD::SHL:
56078 case ISD::SRA:
56079 case ISD::SRL:
56080 case ISD::SUB:
56081 case ISD::ADD:
56082 case ISD::MUL:
56083 case ISD::AND:
56084 case ISD::OR:
56085 case ISD::XOR:
56086 return false;
56090 // Any legal type not explicitly accounted for above is desirable.
56091 return true;
56094 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
56095 SDValue Value, SDValue Addr,
56096 int JTI,
56097 SelectionDAG &DAG) const {
56098 const Module *M = DAG.getMachineFunction().getMMI().getModule();
56099 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
56100 if (IsCFProtectionSupported) {
56101 // In case control-flow branch protection is enabled, we need to add
56102 // notrack prefix to the indirect branch.
56103 // In order to do that we create NT_BRIND SDNode.
56104 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
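// For example (sketch), with the "cf-protection-branch" module flag set
// (CET indirect branch tracking), the jump-table dispatch is expected to be
// emitted as something like:
//   notrack jmp *%rax
// i.e. an indirect jmp whose target is exempt from ENDBR checking.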
56105 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
56106 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
56109 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
56112 TargetLowering::AndOrSETCCFoldKind
56113 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
56114 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
56115 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
56116 EVT VT = LogicOp->getValueType(0);
56117 EVT OpVT = SETCC0->getOperand(0).getValueType();
56118 if (!VT.isInteger())
56119 return AndOrSETCCFoldKind::None;
56121 if (VT.isVector())
56122 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
56123 (isOperationLegal(ISD::ABS, OpVT)
56124 ? AndOrSETCCFoldKind::ABS
56125 : AndOrSETCCFoldKind::None));
56127 // Don't use `NotAnd`: even though `not` generally has smaller code size than
56128 // `add`, `add` can lower to LEA, which can save moves / spills. Any case where
56129 // `NotAnd` applies, `AddAnd` does as well.
56130 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
56131 // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here.
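// To make the trade-off concrete (illustrative sequences, with X in %edi and
// Y in %esi): the current lowering mentioned above is roughly
//   notl  %edi
//   testl %esi, %edi
// whereas an ANDN-based lowering would fold the whole test into a single
//   andnl %esi, %edi, %eax
// which is why the TODO above suggests revisiting NotAnd in that case.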
56132 return AndOrSETCCFoldKind::AddAnd;
56135 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
56136 EVT VT = Op.getValueType();
56137 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
56138 isa<ConstantSDNode>(Op.getOperand(1));
56140 // i16 is legal, but undesirable since i16 instruction encodings are longer
56141 // and some i16 instructions are slow.
56142 // 8-bit multiply-by-constant can usually be expanded to something cheaper
56143 // using LEA and/or other ALU ops.
56144 if (VT != MVT::i16 && !Is8BitMulByConstant)
56145 return false;
56147 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
56148 if (!Op.hasOneUse())
56149 return false;
56150 SDNode *User = *Op->use_begin();
56151 if (!ISD::isNormalStore(User))
56152 return false;
56153 auto *Ld = cast<LoadSDNode>(Load);
56154 auto *St = cast<StoreSDNode>(User);
56155 return Ld->getBasePtr() == St->getBasePtr();
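// For example (sketch), a 16-bit read-modify-write such as
//   addw $5, (%rdi)        ; (store (add (load p), 5), p)
// can be selected as one memory-operand instruction; promoting the add to
// i32 first would break that folding, which is what this check guards.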
56158 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
56159 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
56160 return false;
56161 if (!Op.hasOneUse())
56162 return false;
56163 SDNode *User = *Op->use_begin();
56164 if (User->getOpcode() != ISD::ATOMIC_STORE)
56165 return false;
56166 auto *Ld = cast<AtomicSDNode>(Load);
56167 auto *St = cast<AtomicSDNode>(User);
56168 return Ld->getBasePtr() == St->getBasePtr();
56171 bool Commute = false;
56172 switch (Op.getOpcode()) {
56173 default: return false;
56174 case ISD::SIGN_EXTEND:
56175 case ISD::ZERO_EXTEND:
56176 case ISD::ANY_EXTEND:
56177 break;
56178 case ISD::SHL:
56179 case ISD::SRA:
56180 case ISD::SRL: {
56181 SDValue N0 = Op.getOperand(0);
56182 // Look out for (store (shl (load), x)).
56183 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
56184 return false;
56185 break;
56187 case ISD::ADD:
56188 case ISD::MUL:
56189 case ISD::AND:
56190 case ISD::OR:
56191 case ISD::XOR:
56192 Commute = true;
56193 [[fallthrough]];
56194 case ISD::SUB: {
56195 SDValue N0 = Op.getOperand(0);
56196 SDValue N1 = Op.getOperand(1);
56197 // Avoid disabling potential load folding opportunities.
56198 if (X86::mayFoldLoad(N1, Subtarget) &&
56199 (!Commute || !isa<ConstantSDNode>(N0) ||
56200 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
56201 return false;
56202 if (X86::mayFoldLoad(N0, Subtarget) &&
56203 ((Commute && !isa<ConstantSDNode>(N1)) ||
56204 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
56205 return false;
56206 if (IsFoldableAtomicRMW(N0, Op) ||
56207 (Commute && IsFoldableAtomicRMW(N1, Op)))
56208 return false;
56212 PVT = MVT::i32;
56213 return true;
56216 //===----------------------------------------------------------------------===//
56217 // X86 Inline Assembly Support
56218 //===----------------------------------------------------------------------===//
56220 // Helper to match a string against pieces separated by whitespace.
56221 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
56222 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
56224 for (StringRef Piece : Pieces) {
56225 if (!S.starts_with(Piece)) // Check if the piece matches.
56226 return false;
56228 S = S.substr(Piece.size());
56229 StringRef::size_type Pos = S.find_first_not_of(" \t");
56230 if (Pos == 0) // We only matched a prefix of a longer token.
56231 return false;
56233 S = S.substr(Pos);
56236 return S.empty();
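// Behavior sketch: matchAsm("bswap $0", {"bswap", "$0"}) matches, while
// matchAsm("bswapper $0", {"bswap", "$0"}) is rejected because "bswap" only
// matches a prefix of the first token.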
56239 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
56241 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
56242 if (llvm::is_contained(AsmPieces, "~{cc}") &&
56243 llvm::is_contained(AsmPieces, "~{flags}") &&
56244 llvm::is_contained(AsmPieces, "~{fpsr}")) {
56246 if (AsmPieces.size() == 3)
56247 return true;
56248 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
56249 return true;
56252 return false;
56255 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
56256 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
56258 const std::string &AsmStr = IA->getAsmString();
56260 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
56261 if (!Ty || Ty->getBitWidth() % 16 != 0)
56262 return false;
56264 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
56265 SmallVector<StringRef, 4> AsmPieces;
56266 SplitString(AsmStr, AsmPieces, ";\n");
56268 switch (AsmPieces.size()) {
56269 default: return false;
56270 case 1:
56271 // FIXME: this should verify that we are targeting a 486 or better. If not,
56272 // we will turn this bswap into something that will be lowered to logical
56273 // ops instead of emitting the bswap asm. For now, we don't support 486 or
56274 // lower so don't worry about this.
56275 // bswap $0
56276 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
56277 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
56278 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
56279 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
56280 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
56281 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
56282 // No need to check constraints; nothing other than the equivalent of
56283 // "=r,0" would be valid here.
56284 return IntrinsicLowering::LowerToByteSwap(CI);
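// As a concrete example (sketch), C code along the lines of
//   unsigned v; __asm__("bswap %0" : "=r"(v) : "0"(v));
// reaches this point as the single piece "bswap $0" with constraints "=r,0"
// and is rewritten into a call to llvm.bswap.i32 instead of emitting the asm.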
56287 // rorw $$8, ${0:w} --> llvm.bswap.i16
56288 if (CI->getType()->isIntegerTy(16) &&
56289 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56290 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
56291 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
56292 AsmPieces.clear();
56293 StringRef ConstraintsStr = IA->getConstraintString();
56294 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56295 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56296 if (clobbersFlagRegisters(AsmPieces))
56297 return IntrinsicLowering::LowerToByteSwap(CI);
56299 break;
56300 case 3:
56301 if (CI->getType()->isIntegerTy(32) &&
56302 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56303 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
56304 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
56305 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
56306 AsmPieces.clear();
56307 StringRef ConstraintsStr = IA->getConstraintString();
56308 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56309 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56310 if (clobbersFlagRegisters(AsmPieces))
56311 return IntrinsicLowering::LowerToByteSwap(CI);
56314 if (CI->getType()->isIntegerTy(64)) {
56315 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
56316 if (Constraints.size() >= 2 &&
56317 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
56318 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
56319 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
56320 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
56321 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
56322 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
56323 return IntrinsicLowering::LowerToByteSwap(CI);
56326 break;
56328 return false;
56331 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
56332 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
56333 .Case("{@cca}", X86::COND_A)
56334 .Case("{@ccae}", X86::COND_AE)
56335 .Case("{@ccb}", X86::COND_B)
56336 .Case("{@ccbe}", X86::COND_BE)
56337 .Case("{@ccc}", X86::COND_B)
56338 .Case("{@cce}", X86::COND_E)
56339 .Case("{@ccz}", X86::COND_E)
56340 .Case("{@ccg}", X86::COND_G)
56341 .Case("{@ccge}", X86::COND_GE)
56342 .Case("{@ccl}", X86::COND_L)
56343 .Case("{@ccle}", X86::COND_LE)
56344 .Case("{@ccna}", X86::COND_BE)
56345 .Case("{@ccnae}", X86::COND_B)
56346 .Case("{@ccnb}", X86::COND_AE)
56347 .Case("{@ccnbe}", X86::COND_A)
56348 .Case("{@ccnc}", X86::COND_AE)
56349 .Case("{@ccne}", X86::COND_NE)
56350 .Case("{@ccnz}", X86::COND_NE)
56351 .Case("{@ccng}", X86::COND_LE)
56352 .Case("{@ccnge}", X86::COND_L)
56353 .Case("{@ccnl}", X86::COND_GE)
56354 .Case("{@ccnle}", X86::COND_G)
56355 .Case("{@ccno}", X86::COND_NO)
56356 .Case("{@ccnp}", X86::COND_NP)
56357 .Case("{@ccns}", X86::COND_NS)
56358 .Case("{@cco}", X86::COND_O)
56359 .Case("{@ccp}", X86::COND_P)
56360 .Case("{@ccs}", X86::COND_S)
56361 .Default(X86::COND_INVALID);
56362 return Cond;
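// These strings come from GCC-style flag-output constraints. For example
// (usage sketch), an output written as "=@ccz" in C inline asm shows up here
// as "{@ccz}" and maps to X86::COND_E, so the asm's ZF result can later be
// materialized with a SETcc.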
56365 /// Given a constraint letter, return the type of constraint for this target.
56366 X86TargetLowering::ConstraintType
56367 X86TargetLowering::getConstraintType(StringRef Constraint) const {
56368 if (Constraint.size() == 1) {
56369 switch (Constraint[0]) {
56370 case 'R':
56371 case 'q':
56372 case 'Q':
56373 case 'f':
56374 case 't':
56375 case 'u':
56376 case 'y':
56377 case 'x':
56378 case 'v':
56379 case 'l':
56380 case 'k': // AVX512 masking registers.
56381 return C_RegisterClass;
56382 case 'a':
56383 case 'b':
56384 case 'c':
56385 case 'd':
56386 case 'S':
56387 case 'D':
56388 case 'A':
56389 return C_Register;
56390 case 'I':
56391 case 'J':
56392 case 'K':
56393 case 'N':
56394 case 'G':
56395 case 'L':
56396 case 'M':
56397 return C_Immediate;
56398 case 'C':
56399 case 'e':
56400 case 'Z':
56401 return C_Other;
56402 default:
56403 break;
56406 else if (Constraint.size() == 2) {
56407 switch (Constraint[0]) {
56408 default:
56409 break;
56410 case 'Y':
56411 switch (Constraint[1]) {
56412 default:
56413 break;
56414 case 'z':
56415 return C_Register;
56416 case 'i':
56417 case 'm':
56418 case 'k':
56419 case 't':
56420 case '2':
56421 return C_RegisterClass;
56424 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
56425 return C_Other;
56426 return TargetLowering::getConstraintType(Constraint);
56429 /// Examine constraint type and operand type and determine a weight value.
56430 /// This object must already have been set up with the operand type
56431 /// and the current alternative constraint selected.
56432 TargetLowering::ConstraintWeight
56433 X86TargetLowering::getSingleConstraintMatchWeight(
56434 AsmOperandInfo &Info, const char *Constraint) const {
56435 ConstraintWeight Wt = CW_Invalid;
56436 Value *CallOperandVal = Info.CallOperandVal;
56437 // If we don't have a value, we can't do a match,
56438 // but allow it at the lowest weight.
56439 if (!CallOperandVal)
56440 return CW_Default;
56441 Type *Ty = CallOperandVal->getType();
56442 // Look at the constraint type.
56443 switch (*Constraint) {
56444 default:
56445 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
56446 [[fallthrough]];
56447 case 'R':
56448 case 'q':
56449 case 'Q':
56450 case 'a':
56451 case 'b':
56452 case 'c':
56453 case 'd':
56454 case 'S':
56455 case 'D':
56456 case 'A':
56457 if (CallOperandVal->getType()->isIntegerTy())
56458 Wt = CW_SpecificReg;
56459 break;
56460 case 'f':
56461 case 't':
56462 case 'u':
56463 if (Ty->isFloatingPointTy())
56464 Wt = CW_SpecificReg;
56465 break;
56466 case 'y':
56467 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
56468 Wt = CW_SpecificReg;
56469 break;
56470 case 'Y':
56471 if (StringRef(Constraint).size() != 2)
56472 break;
56473 switch (Constraint[1]) {
56474 default:
56475 return CW_Invalid;
56476 // XMM0
56477 case 'z':
56478 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56479 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
56480 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
56481 return CW_SpecificReg;
56482 return CW_Invalid;
56483 // Conditional OpMask regs (AVX512)
56484 case 'k':
56485 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56486 return CW_Register;
56487 return CW_Invalid;
56488 // Any MMX reg
56489 case 'm':
56490 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
56491 return Wt;
56492 return CW_Invalid;
56493 // Any SSE reg when ISA >= SSE2, same as 'x'
56494 case 'i':
56495 case 't':
56496 case '2':
56497 if (!Subtarget.hasSSE2())
56498 return CW_Invalid;
56499 break;
56501 break;
56502 case 'v':
56503 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
56504 Wt = CW_Register;
56505 [[fallthrough]];
56506 case 'x':
56507 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56508 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
56509 Wt = CW_Register;
56510 break;
56511 case 'k':
56512 // Enable conditional vector operations using %k<#> registers.
56513 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56514 Wt = CW_Register;
56515 break;
56516 case 'I':
56517 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
56518 if (C->getZExtValue() <= 31)
56519 Wt = CW_Constant;
56520 break;
56521 case 'J':
56522 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56523 if (C->getZExtValue() <= 63)
56524 Wt = CW_Constant;
56525 break;
56526 case 'K':
56527 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56528 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
56529 Wt = CW_Constant;
56530 break;
56531 case 'L':
56532 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56533 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
56534 Wt = CW_Constant;
56535 break;
56536 case 'M':
56537 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56538 if (C->getZExtValue() <= 3)
56539 Wt = CW_Constant;
56540 break;
56541 case 'N':
56542 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56543 if (C->getZExtValue() <= 0xff)
56544 Wt = CW_Constant;
56545 break;
56546 case 'G':
56547 case 'C':
56548 if (isa<ConstantFP>(CallOperandVal))
56549 Wt = CW_Constant;
56550 break;
56551 case 'e':
56552 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56553 if ((C->getSExtValue() >= -0x80000000LL) &&
56554 (C->getSExtValue() <= 0x7fffffffLL))
56555 Wt = CW_Constant;
56556 break;
56557 case 'Z':
56558 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56559 if (C->getZExtValue() <= 0xffffffff)
56560 Wt = CW_Constant;
56561 break;
56563 return Wt;
56566 /// Try to replace an X constraint, which matches anything, with another that
56567 /// has more specific requirements based on the type of the corresponding
56568 /// operand.
56569 const char *X86TargetLowering::
56570 LowerXConstraint(EVT ConstraintVT) const {
56571 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
56572 // 'f' like normal targets.
56573 if (ConstraintVT.isFloatingPoint()) {
56574 if (Subtarget.hasSSE1())
56575 return "x";
56578 return TargetLowering::LowerXConstraint(ConstraintVT);
56581 // Lower @cc targets via setcc.
56582 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
56583 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
56584 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
56585 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
56586 if (Cond == X86::COND_INVALID)
56587 return SDValue();
56588 // Check that return type is valid.
56589 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
56590 OpInfo.ConstraintVT.getSizeInBits() < 8)
56591 report_fatal_error("Glue output operand is of invalid type");
56593 // Get EFLAGS register. Only update chain when copyfrom is glued.
56594 if (Glue.getNode()) {
56595 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
56596 Chain = Glue.getValue(1);
56597 } else
56598 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
56599 // Extract CC code.
56600 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
56601 // Zero-extend the setcc result to the constraint type.
56602 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
56604 return Result;
56607 /// Lower the specified operand into the Ops vector.
56608 /// If it is invalid, don't add anything to Ops.
56609 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
56610 StringRef Constraint,
56611 std::vector<SDValue> &Ops,
56612 SelectionDAG &DAG) const {
56613 SDValue Result;
56615 // Only support length 1 constraints for now.
56616 if (Constraint.size() > 1)
56617 return;
56619 char ConstraintLetter = Constraint[0];
56620 switch (ConstraintLetter) {
56621 default: break;
56622 case 'I':
56623 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56624 if (C->getZExtValue() <= 31) {
56625 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56626 Op.getValueType());
56627 break;
56630 return;
56631 case 'J':
56632 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56633 if (C->getZExtValue() <= 63) {
56634 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56635 Op.getValueType());
56636 break;
56639 return;
56640 case 'K':
56641 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56642 if (isInt<8>(C->getSExtValue())) {
56643 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56644 Op.getValueType());
56645 break;
56648 return;
56649 case 'L':
56650 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56651 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
56652 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
56653 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
56654 Op.getValueType());
56655 break;
56658 return;
56659 case 'M':
56660 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56661 if (C->getZExtValue() <= 3) {
56662 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56663 Op.getValueType());
56664 break;
56667 return;
56668 case 'N':
56669 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56670 if (C->getZExtValue() <= 255) {
56671 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56672 Op.getValueType());
56673 break;
56676 return;
56677 case 'O':
56678 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56679 if (C->getZExtValue() <= 127) {
56680 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56681 Op.getValueType());
56682 break;
56685 return;
56686 case 'e': {
56687 // 32-bit signed value
56688 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56689 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
56690 C->getSExtValue())) {
56691 // Widen to 64 bits here to get it sign extended.
56692 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
56693 break;
56695 // FIXME gcc accepts some relocatable values here too, but only in certain
56696 // memory models; it's complicated.
56698 return;
56700 case 'Z': {
56701 // 32-bit unsigned value
56702 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56703 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
56704 C->getZExtValue())) {
56705 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56706 Op.getValueType());
56707 break;
56710 // FIXME gcc accepts some relocatable values here too, but only in certain
56711 // memory models; it's complicated.
56712 return;
56714 case 'i': {
56715 // Literal immediates are always ok.
56716 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
56717 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
56718 BooleanContent BCont = getBooleanContents(MVT::i64);
56719 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
56720 : ISD::SIGN_EXTEND;
56721 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
56722 : CST->getSExtValue();
56723 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
56724 break;
56727 // In any sort of PIC mode addresses need to be computed at runtime by
56728 // adding in a register or some sort of table lookup. These can't
56729 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
56730 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
56731 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
56732 return;
56734 // If we are in non-pic codegen mode, we allow the address of a global (with
56735 // an optional displacement) to be used with 'i'.
56736 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
56737 // If we require an extra load to get this address, as in PIC mode, we
56738 // can't accept it.
56739 if (isGlobalStubReference(
56740 Subtarget.classifyGlobalReference(GA->getGlobal())))
56741 return;
56742 break;
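// For example (sketch), under GOT-style PIC something like
//   asm volatile ("" :: "i"(&some_global));   // some_global: hypothetical
// cannot be satisfied here, because the global's address is only available
// via a GOT load at runtime, so the operand is rejected rather than lowered.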
56746 if (Result.getNode()) {
56747 Ops.push_back(Result);
56748 return;
56750 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
56753 /// Check if \p RC is a general purpose register class.
56754 /// I.e., GR* or one of their variant.
56755 static bool isGRClass(const TargetRegisterClass &RC) {
56756 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
56757 RC.hasSuperClassEq(&X86::GR16RegClass) ||
56758 RC.hasSuperClassEq(&X86::GR32RegClass) ||
56759 RC.hasSuperClassEq(&X86::GR64RegClass) ||
56760 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
56763 /// Check if \p RC is a vector register class.
56764 /// I.e., FR* / VR* or one of their variant.
56765 static bool isFRClass(const TargetRegisterClass &RC) {
56766 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
56767 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
56768 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
56769 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
56770 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
56771 RC.hasSuperClassEq(&X86::VR512RegClass);
56774 /// Check if \p RC is a mask register class.
56775 /// I.e., VK* or one of their variant.
56776 static bool isVKClass(const TargetRegisterClass &RC) {
56777 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
56778 RC.hasSuperClassEq(&X86::VK2RegClass) ||
56779 RC.hasSuperClassEq(&X86::VK4RegClass) ||
56780 RC.hasSuperClassEq(&X86::VK8RegClass) ||
56781 RC.hasSuperClassEq(&X86::VK16RegClass) ||
56782 RC.hasSuperClassEq(&X86::VK32RegClass) ||
56783 RC.hasSuperClassEq(&X86::VK64RegClass);
56786 std::pair<unsigned, const TargetRegisterClass *>
56787 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
56788 StringRef Constraint,
56789 MVT VT) const {
56790 // First, see if this is a constraint that directly corresponds to an LLVM
56791 // register class.
56792 if (Constraint.size() == 1) {
56793 // GCC Constraint Letters
56794 switch (Constraint[0]) {
56795 default: break;
56796 // 'A' means [ER]AX + [ER]DX.
56797 case 'A':
56798 if (Subtarget.is64Bit())
56799 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
56800 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
56801 "Expecting 64, 32 or 16 bit subtarget");
56802 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
56804 // TODO: Slight differences here in allocation order and leaving
56805 // RIP in the class. Do they matter any more here than they do
56806 // in the normal allocation?
56807 case 'k':
56808 if (Subtarget.hasAVX512()) {
56809 if (VT == MVT::i1)
56810 return std::make_pair(0U, &X86::VK1RegClass);
56811 if (VT == MVT::i8)
56812 return std::make_pair(0U, &X86::VK8RegClass);
56813 if (VT == MVT::i16)
56814 return std::make_pair(0U, &X86::VK16RegClass);
56816 if (Subtarget.hasBWI()) {
56817 if (VT == MVT::i32)
56818 return std::make_pair(0U, &X86::VK32RegClass);
56819 if (VT == MVT::i64)
56820 return std::make_pair(0U, &X86::VK64RegClass);
56822 break;
56823 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
56824 if (Subtarget.is64Bit()) {
56825 if (VT == MVT::i8 || VT == MVT::i1)
56826 return std::make_pair(0U, &X86::GR8RegClass);
56827 if (VT == MVT::i16)
56828 return std::make_pair(0U, &X86::GR16RegClass);
56829 if (VT == MVT::i32 || VT == MVT::f32)
56830 return std::make_pair(0U, &X86::GR32RegClass);
56831 if (VT != MVT::f80 && !VT.isVector())
56832 return std::make_pair(0U, &X86::GR64RegClass);
56833 break;
56835 [[fallthrough]];
56836 // 32-bit fallthrough
56837 case 'Q': // Q_REGS
56838 if (VT == MVT::i8 || VT == MVT::i1)
56839 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
56840 if (VT == MVT::i16)
56841 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
56842 if (VT == MVT::i32 || VT == MVT::f32 ||
56843 (!VT.isVector() && !Subtarget.is64Bit()))
56844 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
56845 if (VT != MVT::f80 && !VT.isVector())
56846 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
56847 break;
56848 case 'r': // GENERAL_REGS
56849 case 'l': // INDEX_REGS
56850 if (VT == MVT::i8 || VT == MVT::i1)
56851 return std::make_pair(0U, &X86::GR8RegClass);
56852 if (VT == MVT::i16)
56853 return std::make_pair(0U, &X86::GR16RegClass);
56854 if (VT == MVT::i32 || VT == MVT::f32 ||
56855 (!VT.isVector() && !Subtarget.is64Bit()))
56856 return std::make_pair(0U, &X86::GR32RegClass);
56857 if (VT != MVT::f80 && !VT.isVector())
56858 return std::make_pair(0U, &X86::GR64RegClass);
56859 break;
56860 case 'R': // LEGACY_REGS
56861 if (VT == MVT::i8 || VT == MVT::i1)
56862 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
56863 if (VT == MVT::i16)
56864 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
56865 if (VT == MVT::i32 || VT == MVT::f32 ||
56866 (!VT.isVector() && !Subtarget.is64Bit()))
56867 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
56868 if (VT != MVT::f80 && !VT.isVector())
56869 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
56870 break;
56871 case 'f': // FP Stack registers.
56872 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
56873 // value to the correct fpstack register class.
56874 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
56875 return std::make_pair(0U, &X86::RFP32RegClass);
56876 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
56877 return std::make_pair(0U, &X86::RFP64RegClass);
56878 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
56879 return std::make_pair(0U, &X86::RFP80RegClass);
56880 break;
56881 case 'y': // MMX_REGS if MMX allowed.
56882 if (!Subtarget.hasMMX()) break;
56883 return std::make_pair(0U, &X86::VR64RegClass);
56884 case 'v':
56885 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
56886 if (!Subtarget.hasSSE1()) break;
56887 bool VConstraint = (Constraint[0] == 'v');
56889 switch (VT.SimpleTy) {
56890 default: break;
56891 // Scalar SSE types.
56892 case MVT::f16:
56893 if (VConstraint && Subtarget.hasFP16())
56894 return std::make_pair(0U, &X86::FR16XRegClass);
56895 break;
56896 case MVT::f32:
56897 case MVT::i32:
56898 if (VConstraint && Subtarget.hasVLX())
56899 return std::make_pair(0U, &X86::FR32XRegClass);
56900 return std::make_pair(0U, &X86::FR32RegClass);
56901 case MVT::f64:
56902 case MVT::i64:
56903 if (VConstraint && Subtarget.hasVLX())
56904 return std::make_pair(0U, &X86::FR64XRegClass);
56905 return std::make_pair(0U, &X86::FR64RegClass);
56906 case MVT::i128:
56907 if (Subtarget.is64Bit()) {
56908 if (VConstraint && Subtarget.hasVLX())
56909 return std::make_pair(0U, &X86::VR128XRegClass);
56910 return std::make_pair(0U, &X86::VR128RegClass);
56912 break;
56913 // Vector types and fp128.
56914 case MVT::v8f16:
56915 if (!Subtarget.hasFP16())
56916 break;
56917 if (VConstraint)
56918 return std::make_pair(0U, &X86::VR128XRegClass);
56919 return std::make_pair(0U, &X86::VR128RegClass);
56920 case MVT::v8bf16:
56921 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
56922 break;
56923 if (VConstraint)
56924 return std::make_pair(0U, &X86::VR128XRegClass);
56925 return std::make_pair(0U, &X86::VR128RegClass);
56926 case MVT::f128:
56927 case MVT::v16i8:
56928 case MVT::v8i16:
56929 case MVT::v4i32:
56930 case MVT::v2i64:
56931 case MVT::v4f32:
56932 case MVT::v2f64:
56933 if (VConstraint && Subtarget.hasVLX())
56934 return std::make_pair(0U, &X86::VR128XRegClass);
56935 return std::make_pair(0U, &X86::VR128RegClass);
56936 // AVX types.
56937 case MVT::v16f16:
56938 if (!Subtarget.hasFP16())
56939 break;
56940 if (VConstraint)
56941 return std::make_pair(0U, &X86::VR256XRegClass);
56942 return std::make_pair(0U, &X86::VR256RegClass);
56943 case MVT::v16bf16:
56944 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
56945 break;
56946 if (VConstraint)
56947 return std::make_pair(0U, &X86::VR256XRegClass);
56948 return std::make_pair(0U, &X86::VR256RegClass);
56949 case MVT::v32i8:
56950 case MVT::v16i16:
56951 case MVT::v8i32:
56952 case MVT::v4i64:
56953 case MVT::v8f32:
56954 case MVT::v4f64:
56955 if (VConstraint && Subtarget.hasVLX())
56956 return std::make_pair(0U, &X86::VR256XRegClass);
56957 if (Subtarget.hasAVX())
56958 return std::make_pair(0U, &X86::VR256RegClass);
56959 break;
56960 case MVT::v32f16:
56961 if (!Subtarget.hasFP16())
56962 break;
56963 if (VConstraint)
56964 return std::make_pair(0U, &X86::VR512RegClass);
56965 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56966 case MVT::v32bf16:
56967 if (!Subtarget.hasBF16())
56968 break;
56969 if (VConstraint)
56970 return std::make_pair(0U, &X86::VR512RegClass);
56971 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56972 case MVT::v64i8:
56973 case MVT::v32i16:
56974 case MVT::v8f64:
56975 case MVT::v16f32:
56976 case MVT::v16i32:
56977 case MVT::v8i64:
56978 if (!Subtarget.hasAVX512()) break;
56979 if (VConstraint)
56980 return std::make_pair(0U, &X86::VR512RegClass);
56981 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56983 break;
56985 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
56986 switch (Constraint[1]) {
56987 default:
56988 break;
56989 case 'i':
56990 case 't':
56991 case '2':
56992 return getRegForInlineAsmConstraint(TRI, "x", VT);
56993 case 'm':
56994 if (!Subtarget.hasMMX()) break;
56995 return std::make_pair(0U, &X86::VR64RegClass);
56996 case 'z':
56997 if (!Subtarget.hasSSE1()) break;
56998 switch (VT.SimpleTy) {
56999 default: break;
57000 // Scalar SSE types.
57001 case MVT::f16:
57002 if (!Subtarget.hasFP16())
57003 break;
57004 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
57005 case MVT::f32:
57006 case MVT::i32:
57007 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
57008 case MVT::f64:
57009 case MVT::i64:
57010 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
57011 case MVT::v8f16:
57012 if (!Subtarget.hasFP16())
57013 break;
57014 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57015 case MVT::v8bf16:
57016 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57017 break;
57018 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57019 case MVT::f128:
57020 case MVT::v16i8:
57021 case MVT::v8i16:
57022 case MVT::v4i32:
57023 case MVT::v2i64:
57024 case MVT::v4f32:
57025 case MVT::v2f64:
57026 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57027 // AVX types.
57028 case MVT::v16f16:
57029 if (!Subtarget.hasFP16())
57030 break;
57031 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57032 case MVT::v16bf16:
57033 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57034 break;
57035 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57036 case MVT::v32i8:
57037 case MVT::v16i16:
57038 case MVT::v8i32:
57039 case MVT::v4i64:
57040 case MVT::v8f32:
57041 case MVT::v4f64:
57042 if (Subtarget.hasAVX())
57043 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57044 break;
57045 case MVT::v32f16:
57046 if (!Subtarget.hasFP16())
57047 break;
57048 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57049 case MVT::v32bf16:
57050 if (!Subtarget.hasBF16())
57051 break;
57052 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57053 case MVT::v64i8:
57054 case MVT::v32i16:
57055 case MVT::v8f64:
57056 case MVT::v16f32:
57057 case MVT::v16i32:
57058 case MVT::v8i64:
57059 if (Subtarget.hasAVX512())
57060 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57061 break;
57063 break;
57064 case 'k':
57065 // This register class doesn't allocate k0 for masked vector operations.
57066 if (Subtarget.hasAVX512()) {
57067 if (VT == MVT::i1)
57068 return std::make_pair(0U, &X86::VK1WMRegClass);
57069 if (VT == MVT::i8)
57070 return std::make_pair(0U, &X86::VK8WMRegClass);
57071 if (VT == MVT::i16)
57072 return std::make_pair(0U, &X86::VK16WMRegClass);
57074 if (Subtarget.hasBWI()) {
57075 if (VT == MVT::i32)
57076 return std::make_pair(0U, &X86::VK32WMRegClass);
57077 if (VT == MVT::i64)
57078 return std::make_pair(0U, &X86::VK64WMRegClass);
57080 break;
57084 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57085 return std::make_pair(0U, &X86::GR32RegClass);
57087 // Use the default implementation in TargetLowering to convert the register
57088 // constraint into a member of a register class.
57089 std::pair<Register, const TargetRegisterClass*> Res;
57090 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
57092 // Not found as a standard register?
57093 if (!Res.second) {
57094 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
57095 // to/from f80.
57096 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
57097 // Map st(0) -> st(7) -> ST0
57098 if (Constraint.size() == 7 && Constraint[0] == '{' &&
57099 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
57100 Constraint[3] == '(' &&
57101 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
57102 Constraint[5] == ')' && Constraint[6] == '}') {
57103 // st(7) is not allocatable and thus not a member of RFP80. Return
57104 // singleton class in cases where we have a reference to it.
57105 if (Constraint[4] == '7')
57106 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
57107 return std::make_pair(X86::FP0 + Constraint[4] - '0',
57108 &X86::RFP80RegClass);
57111 // GCC allows "st(0)" to be called just plain "st".
57112 if (StringRef("{st}").equals_insensitive(Constraint))
57113 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
57116 // flags -> EFLAGS
57117 if (StringRef("{flags}").equals_insensitive(Constraint))
57118 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
57120 // dirflag -> DF
57121 // Only allow for clobber.
57122 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
57123 VT == MVT::Other)
57124 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
57126 // fpsr -> FPSW
57127 if (StringRef("{fpsr}").equals_insensitive(Constraint))
57128 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
57130 return Res;
57133 // Make sure it isn't a register that requires 64-bit mode.
57134 if (!Subtarget.is64Bit() &&
57135 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
57136 TRI->getEncodingValue(Res.first) >= 8) {
57137 // Register requires REX prefix, but we're in 32-bit mode.
57138 return std::make_pair(0, nullptr);
57141 // Make sure it isn't a register that requires AVX512.
57142 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
57143 TRI->getEncodingValue(Res.first) & 0x10) {
57144 // Register requires EVEX prefix.
57145 return std::make_pair(0, nullptr);
57148 // Otherwise, check to see if this is a register class of the wrong value
57149 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
57150 // turn into {ax},{dx}.
57151 // MVT::Other is used to specify clobber names.
57152 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
57153 return Res; // Correct type already, nothing to do.
57155 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
57156 // return "eax". This should even work for things like getting 64-bit integer
57157 // registers when given an f64 type.
57158 const TargetRegisterClass *Class = Res.second;
57159 // The generic code will match the first register class that contains the
57160 // given register. Thus, based on the ordering of the tablegened file,
57161 // the "plain" GR classes might not come first.
57162 // Therefore, use a helper method.
57163 if (isGRClass(*Class)) {
57164 unsigned Size = VT.getSizeInBits();
57165 if (Size == 1) Size = 8;
57166 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
57167 return std::make_pair(0, nullptr);
57168 Register DestReg = getX86SubSuperRegister(Res.first, Size);
57169 if (DestReg.isValid()) {
57170 bool is64Bit = Subtarget.is64Bit();
57171 const TargetRegisterClass *RC =
57172 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
57173 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
57174 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
57175 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
57176 if (Size == 64 && !is64Bit) {
57177 // Model GCC's behavior here and select a fixed pair of 32-bit
57178 // registers.
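// For example (sketch), an i64 operand constrained to "{eax}" (or "{rax}")
// in 32-bit mode comes back as EAX in the GR32_AD class, the same class used
// for the 'A' ([ER]AX + [ER]DX) constraint handled earlier, mirroring the
// register pair GCC would pick.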
57179 switch (DestReg) {
57180 case X86::RAX:
57181 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57182 case X86::RDX:
57183 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
57184 case X86::RCX:
57185 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
57186 case X86::RBX:
57187 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
57188 case X86::RSI:
57189 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
57190 case X86::RDI:
57191 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
57192 case X86::RBP:
57193 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
57194 default:
57195 return std::make_pair(0, nullptr);
57198 if (RC && RC->contains(DestReg))
57199 return std::make_pair(DestReg, RC);
57200 return Res;
57202 // No register found/type mismatch.
57203 return std::make_pair(0, nullptr);
57204 } else if (isFRClass(*Class)) {
57205 // Handle references to XMM physical registers that got mapped into the
57206 // wrong class. This can happen with constraints like {xmm0} where the
57207 // target independent register mapper will just pick the first match it can
57208 // find, ignoring the required type.
57210 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
57211 if (VT == MVT::f16)
57212 Res.second = &X86::FR16XRegClass;
57213 else if (VT == MVT::f32 || VT == MVT::i32)
57214 Res.second = &X86::FR32XRegClass;
57215 else if (VT == MVT::f64 || VT == MVT::i64)
57216 Res.second = &X86::FR64XRegClass;
57217 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
57218 Res.second = &X86::VR128XRegClass;
57219 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
57220 Res.second = &X86::VR256XRegClass;
57221 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
57222 Res.second = &X86::VR512RegClass;
57223 else {
57224 // Type mismatch and not a clobber: return an error.
57225 Res.first = 0;
57226 Res.second = nullptr;
57228 } else if (isVKClass(*Class)) {
57229 if (VT == MVT::i1)
57230 Res.second = &X86::VK1RegClass;
57231 else if (VT == MVT::i8)
57232 Res.second = &X86::VK8RegClass;
57233 else if (VT == MVT::i16)
57234 Res.second = &X86::VK16RegClass;
57235 else if (VT == MVT::i32)
57236 Res.second = &X86::VK32RegClass;
57237 else if (VT == MVT::i64)
57238 Res.second = &X86::VK64RegClass;
57239 else {
57240 // Type mismatch and not a clobber: return an error.
57241 Res.first = 0;
57242 Res.second = nullptr;
57246 return Res;
57249 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
57250 // Integer division on x86 is expensive. However, when aggressively optimizing
57251 // for code size, we prefer to use a div instruction, as it is usually smaller
57252 // than the alternative sequence.
57253 // The exception to this is vector division. Since x86 doesn't have vector
57254 // integer division, leaving the division as-is is a loss even in terms of
57255 // size, because it will have to be scalarized, while the alternative code
57256 // sequence can be performed in vector form.
57257 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
57258 return OptSize && !VT.isVector();
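// For example, at minsize a scalar 'x / 10' is left as a real division
// (a short idiv plus setup) instead of the longer multiply-by-magic-constant
// expansion, while a vector division is still expanded since it would
// otherwise have to be scalarized.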
57261 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
57262 if (!Subtarget.is64Bit())
57263 return;
57265 // Update IsSplitCSR in X86MachineFunctionInfo.
57266 X86MachineFunctionInfo *AFI =
57267 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
57268 AFI->setIsSplitCSR(true);
57271 void X86TargetLowering::insertCopiesSplitCSR(
57272 MachineBasicBlock *Entry,
57273 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
57274 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
57275 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
57276 if (!IStart)
57277 return;
57279 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
57280 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
57281 MachineBasicBlock::iterator MBBI = Entry->begin();
57282 for (const MCPhysReg *I = IStart; *I; ++I) {
57283 const TargetRegisterClass *RC = nullptr;
57284 if (X86::GR64RegClass.contains(*I))
57285 RC = &X86::GR64RegClass;
57286 else
57287 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
57289 Register NewVR = MRI->createVirtualRegister(RC);
57290 // Create copy from CSR to a virtual register.
57291 // FIXME: this currently does not emit CFI pseudo-instructions, it works
57292 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
57293 // nounwind. If we want to generalize this later, we may need to emit
57294 // CFI pseudo-instructions.
57295 assert(
57296 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
57297 "Function should be nounwind in insertCopiesSplitCSR!");
57298 Entry->addLiveIn(*I);
57299 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
57300 .addReg(*I);
57302 // Insert the copy-back instructions right before the terminator.
57303 for (auto *Exit : Exits)
57304 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
57305 TII->get(TargetOpcode::COPY), *I)
57306 .addReg(NewVR);
57310 bool X86TargetLowering::supportSwiftError() const {
57311 return Subtarget.is64Bit();
57314 MachineInstr *
57315 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
57316 MachineBasicBlock::instr_iterator &MBBI,
57317 const TargetInstrInfo *TII) const {
57318 assert(MBBI->isCall() && MBBI->getCFIType() &&
57319 "Invalid call instruction for a KCFI check");
57321 MachineFunction &MF = *MBB.getParent();
57322 // If the call target is a memory operand, unfold it and use R11 for the
57323 // call, so KCFI_CHECK won't have to recompute the address.
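// For example (sketch), a memory-operand call such as
//   callq *8(%rdi)
// is unfolded here into roughly
//   movq 8(%rdi), %r11
//   callq *%r11
// so the KCFI_CHECK emitted below can validate the target held in %r11.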
57324 switch (MBBI->getOpcode()) {
57325 case X86::CALL64m:
57326 case X86::CALL64m_NT:
57327 case X86::TAILJMPm64:
57328 case X86::TAILJMPm64_REX: {
57329 MachineBasicBlock::instr_iterator OrigCall = MBBI;
57330 SmallVector<MachineInstr *, 2> NewMIs;
57331 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
57332 /*UnfoldStore=*/false, NewMIs))
57333 report_fatal_error("Failed to unfold memory operand for a KCFI check");
57334 for (auto *NewMI : NewMIs)
57335 MBBI = MBB.insert(OrigCall, NewMI);
57336 assert(MBBI->isCall() &&
57337 "Unexpected instruction after memory operand unfolding");
57338 if (OrigCall->shouldUpdateCallSiteInfo())
57339 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
57340 MBBI->setCFIType(MF, OrigCall->getCFIType());
57341 OrigCall->eraseFromParent();
57342 break;
57344 default:
57345 break;
57348 MachineOperand &Target = MBBI->getOperand(0);
57349 Register TargetReg;
57350 switch (MBBI->getOpcode()) {
57351 case X86::CALL64r:
57352 case X86::CALL64r_NT:
57353 case X86::TAILJMPr64:
57354 case X86::TAILJMPr64_REX:
57355 assert(Target.isReg() && "Unexpected target operand for an indirect call");
57356 Target.setIsRenamable(false);
57357 TargetReg = Target.getReg();
57358 break;
57359 case X86::CALL64pcrel32:
57360 case X86::TAILJMPd64:
57361 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
57362 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
57363 // 64-bit indirect thunk calls.
57364 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
57365 "Unexpected register for an indirect thunk call");
57366 TargetReg = X86::R11;
57367 break;
57368 default:
57369 llvm_unreachable("Unexpected CFI call opcode");
57370 break;
57373 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
57374 .addReg(TargetReg)
57375 .addImm(MBBI->getCFIType())
57376 .getInstr();
57379 /// Returns true if stack probing through a function call is requested.
57380 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
57381 return !getStackProbeSymbolName(MF).empty();
57384 /// Returns true if stack probing through inline assembly is requested.
57385 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
57387 // No inline stack probe for Windows; they have their own mechanism.
57388 if (Subtarget.isOSWindows() ||
57389 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57390 return false;
57392 // If the function specifically requests inline stack probes, emit them.
57393 if (MF.getFunction().hasFnAttribute("probe-stack"))
57394 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
57395 "inline-asm";
57397 return false;
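// For example (sketch), a function carrying the IR attribute
//   attributes #0 = { "probe-stack"="inline-asm" }
// answers true here and gets inline probes, while a value such as
// "probe-stack"="__chkstk" (or no attribute at all) falls through to the
// call-based probing path below.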
57400 /// Returns the name of the symbol used to emit stack probes or the empty
57401 /// string if not applicable.
57402 StringRef
57403 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
57404 // Inline stack probes disable the stack probe call.
57405 if (hasInlineStackProbe(MF))
57406 return "";
57408 // If the function specifically requests stack probes, emit them.
57409 if (MF.getFunction().hasFnAttribute("probe-stack"))
57410 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
57412 // Generally, if we aren't on Windows, the platform ABI does not include
57413 // support for stack probes, so don't emit them.
57414 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
57415 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57416 return "";
57418 // We need a stack probe to conform to the Windows ABI. Choose the right
57419 // symbol.
57420 if (Subtarget.is64Bit())
57421 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
57422 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
57425 unsigned
57426 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
57427 // The default stack probe size is 4096 if the function has no "stack-probe-size"
57428 // attribute.
57429 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
57430 4096);
57433 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
57434 if (ML->isInnermost() &&
57435 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
57436 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
57437 return TargetLowering::getPrefLoopAlignment();
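// For example, setting ExperimentalPrefInnermostLoopAlignment to 5 on the
// llc command line makes innermost loops request Align(1 << 5), i.e. 32-byte
// alignment, while outer loops keep the target's default preference.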