1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions,
10 // and properties of the instructions which are needed for code generation,
11 // machine code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
24 let isCodeGenOnly = 1 in {
25 let isCommutable = 1 in {
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
42 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
43 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
44 SDPatternOperator OpNode, RegisterClass RC,
45 ValueType VT, string asm, Operand memopr,
46 ComplexPattern mem_cpat, Domain d,
47 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48 let hasSideEffects = 0 in {
49 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
51 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
56 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
58 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60 [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
61 Sched<[sched.Folded, sched.ReadAfterFold]>;
65 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
66 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
67 RegisterClass RC, ValueType vt,
68 X86MemOperand x86memop, PatFrag mem_frag,
69 Domain d, X86FoldableSchedWrite sched,
71 let isCommutable = 1 in
72 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
74 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
79 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
81 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
85 Sched<[sched.Folded, sched.ReadAfterFold]>;
88 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
89 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90 string OpcodeStr, X86MemOperand x86memop,
91 X86FoldableSchedWrite sched,
92 list<dag> pat_rr, list<dag> pat_rm,
94 let isCommutable = 1, hasSideEffects = 0 in
95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
101 let hasSideEffects = 0, mayLoad = 1 in
102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
107 Sched<[sched.Folded, sched.ReadAfterFold]>;
111 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112 // This is expanded by ExpandPostRAPseudos.
113 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114 isPseudo = 1, SchedRW = [WriteZero] in {
115 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
116 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
117 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
118 [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
119 def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
120 [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123 //===----------------------------------------------------------------------===//
124 // AVX & SSE - Zero/One Vectors
125 //===----------------------------------------------------------------------===//
127 // Alias instruction that maps zero vector to pxor / xorp* for sse.
128 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
129 // swizzled by ExecutionDomainFix to pxor.
130 // We set canFoldAsLoad because this can be converted to a constant-pool
131 // load of an all-zeros value if folding it would be beneficial.
132 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
133 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
134 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
135 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
138 let Predicates = [NoAVX512] in {
139 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
140 def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
141 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
142 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
143 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
148 // and doesn't need it because on sandy bridge the register is set to zero
149 // at the rename stage without using any execution unit, so SET0PSY
150 // and SET0PDY can be used for vector int instructions without penalty
151 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
152 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
153 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
154 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
157 let Predicates = [NoAVX512] in {
158 def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
159 def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
160 def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
161 def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
162 def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
165 // We set canFoldAsLoad because this can be converted to a constant-pool
166 // load of an all-ones value if folding it would be beneficial.
167 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
168 isPseudo = 1, SchedRW = [WriteZero] in {
169 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
170 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
171 let Predicates = [HasAVX1Only, OptForMinSize] in {
172 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
173 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
175 let Predicates = [HasAVX2] in
176 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
177 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
180 //===----------------------------------------------------------------------===//
181 // SSE 1 & 2 - Move FP Scalar Instructions
183 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
184 // register copies because it's a partial register update; Register-to-register
185 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
186 // that the insert be implementable in terms of a copy, and just mentioned, we
187 // don't use movss/movsd for copies.
188 //===----------------------------------------------------------------------===//
190 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
191 X86MemOperand x86memop, string base_opc,
192 string asm_opr, Domain d, string Name> {
193 let isCommutable = 1 in
194 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
195 (ins VR128:$src1, VR128:$src2),
196 !strconcat(base_opc, asm_opr),
197 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
198 Sched<[SchedWriteFShuffle.XMM]>;
200 // For the disassembler
201 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
202 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
203 (ins VR128:$src1, VR128:$src2),
204 !strconcat(base_opc, asm_opr), []>,
205 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
208 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
209 X86MemOperand x86memop, string OpcodeStr,
210 Domain d, string Name, Predicate pred> {
212 let Predicates = [UseAVX, OptForSize] in
213 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
214 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
216 VEX_4V, VEX_LIG, VEX_WIG;
218 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
219 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
220 [(store RC:$src, addr:$dst)], d>,
221 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
223 let Constraints = "$src1 = $dst" in {
224 let Predicates = [pred, NoSSE41_Or_OptForSize] in
225 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
226 "\t{$src2, $dst|$dst, $src2}", d, Name>;
229 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
231 [(store RC:$src, addr:$dst)], d>,
232 Sched<[WriteFStore]>;
234 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
235 (!cast<Instruction>("V"#NAME#"rr_REV")
236 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
237 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
238 (!cast<Instruction>(NAME#"rr_REV")
239 VR128:$dst, VR128:$src2), 0>;
242 // Loading from memory automatically zeroing upper bits.
243 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
244 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
246 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
248 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
249 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
250 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
251 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
252 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
255 // _alt version uses FR32/FR64 register class.
256 let isCodeGenOnly = 1 in {
257 def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
258 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
259 [(set RC:$dst, (mem_pat addr:$src))], d>,
260 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
261 def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
262 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
263 [(set RC:$dst, (mem_pat addr:$src))], d>,
268 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
269 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
270 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
271 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
273 let canFoldAsLoad = 1, isReMaterializable = 1 in {
274 defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
275 SSEPackedSingle>, XS;
276 defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
277 SSEPackedDouble>, XD;
281 let Predicates = [UseAVX] in {
282 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
283 (VMOVSSrm addr:$src)>;
284 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
285 (VMOVSDrm addr:$src)>;
287 // Represent the same patterns above but in the form they appear for
289 def : Pat<(v8f32 (X86vzload32 addr:$src)),
290 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
291 def : Pat<(v4f64 (X86vzload64 addr:$src)),
292 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
295 let Predicates = [UseAVX, OptForSize] in {
296 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
297 // MOVSS to the lower bits.
298 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
299 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
300 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
301 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
303 // Move low f32 and clear high bits.
304 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
305 (SUBREG_TO_REG (i32 0),
306 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
307 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
308 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
309 (SUBREG_TO_REG (i32 0),
310 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
311 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
314 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
315 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
316 // MOVSS to the lower bits.
317 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
318 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
319 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
320 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
323 let Predicates = [UseSSE2] in
324 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
325 (MOVSDrm addr:$src)>;
327 let Predicates = [UseSSE1] in
328 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
329 (MOVSSrm addr:$src)>;
331 //===----------------------------------------------------------------------===//
332 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
333 //===----------------------------------------------------------------------===//
335 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
336 X86MemOperand x86memop, PatFrag ld_frag,
337 string asm, Domain d,
338 X86SchedWriteMoveLS sched> {
339 let hasSideEffects = 0, isMoveReg = 1 in
340 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
341 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
343 let canFoldAsLoad = 1, isReMaterializable = 1 in
344 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
345 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
346 [(set RC:$dst, (ld_frag addr:$src))], d>,
350 let Predicates = [HasAVX, NoVLX] in {
351 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
352 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
354 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
355 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
357 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
358 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
360 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
361 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
364 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
365 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
366 PS, VEX, VEX_L, VEX_WIG;
367 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
368 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
369 PD, VEX, VEX_L, VEX_WIG;
370 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
371 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
372 PS, VEX, VEX_L, VEX_WIG;
373 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
374 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
375 PD, VEX, VEX_L, VEX_WIG;
378 let Predicates = [UseSSE1] in {
379 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
380 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
382 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
383 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
386 let Predicates = [UseSSE2] in {
387 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
388 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
390 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
391 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
395 let Predicates = [HasAVX, NoVLX] in {
396 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
397 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
398 "movaps\t{$src, $dst|$dst, $src}",
399 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
401 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
402 "movapd\t{$src, $dst|$dst, $src}",
403 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
405 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
406 "movups\t{$src, $dst|$dst, $src}",
407 [(store (v4f32 VR128:$src), addr:$dst)]>,
409 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
410 "movupd\t{$src, $dst|$dst, $src}",
411 [(store (v2f64 VR128:$src), addr:$dst)]>,
415 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
416 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
417 "movaps\t{$src, $dst|$dst, $src}",
418 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
420 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
421 "movapd\t{$src, $dst|$dst, $src}",
422 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
424 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
425 "movups\t{$src, $dst|$dst, $src}",
426 [(store (v8f32 VR256:$src), addr:$dst)]>,
428 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
429 "movupd\t{$src, $dst|$dst, $src}",
430 [(store (v4f64 VR256:$src), addr:$dst)]>,
436 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
438 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
439 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
441 "movaps\t{$src, $dst|$dst, $src}", []>,
442 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
443 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
445 "movapd\t{$src, $dst|$dst, $src}", []>,
446 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
447 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
449 "movups\t{$src, $dst|$dst, $src}", []>,
450 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
451 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
453 "movupd\t{$src, $dst|$dst, $src}", []>,
454 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
457 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
458 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
460 "movaps\t{$src, $dst|$dst, $src}", []>,
461 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
462 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
464 "movapd\t{$src, $dst|$dst, $src}", []>,
465 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
466 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
468 "movups\t{$src, $dst|$dst, $src}", []>,
469 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
470 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
472 "movupd\t{$src, $dst|$dst, $src}", []>,
473 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
477 // Reversed version with ".s" suffix for GAS compatibility.
478 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
479 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
480 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
481 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
482 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
483 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
484 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
485 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
486 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
487 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
488 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
489 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
490 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
491 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
492 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
493 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
495 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
496 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
497 "movaps\t{$src, $dst|$dst, $src}",
498 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
499 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
500 "movapd\t{$src, $dst|$dst, $src}",
501 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
502 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
503 "movups\t{$src, $dst|$dst, $src}",
504 [(store (v4f32 VR128:$src), addr:$dst)]>;
505 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
506 "movupd\t{$src, $dst|$dst, $src}",
507 [(store (v2f64 VR128:$src), addr:$dst)]>;
511 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
512 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
513 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
514 "movaps\t{$src, $dst|$dst, $src}", []>,
515 FoldGenData<"MOVAPSrr">;
516 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
517 "movapd\t{$src, $dst|$dst, $src}", []>,
518 FoldGenData<"MOVAPDrr">;
519 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520 "movups\t{$src, $dst|$dst, $src}", []>,
521 FoldGenData<"MOVUPSrr">;
522 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
523 "movupd\t{$src, $dst|$dst, $src}", []>,
524 FoldGenData<"MOVUPDrr">;
527 // Reversed version with ".s" suffix for GAS compatibility.
528 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
529 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
530 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
531 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
532 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
533 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
534 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
535 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
537 let Predicates = [HasAVX, NoVLX] in {
538 // 256-bit load/store need to use floating point load/store in case we don't
539 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
540 // available and changing the domain is beneficial.
541 def : Pat<(alignedloadv4i64 addr:$src),
542 (VMOVAPSYrm addr:$src)>;
543 def : Pat<(alignedloadv8i32 addr:$src),
544 (VMOVAPSYrm addr:$src)>;
545 def : Pat<(alignedloadv16i16 addr:$src),
546 (VMOVAPSYrm addr:$src)>;
547 def : Pat<(alignedloadv32i8 addr:$src),
548 (VMOVAPSYrm addr:$src)>;
549 def : Pat<(loadv4i64 addr:$src),
550 (VMOVUPSYrm addr:$src)>;
551 def : Pat<(loadv8i32 addr:$src),
552 (VMOVUPSYrm addr:$src)>;
553 def : Pat<(loadv16i16 addr:$src),
554 (VMOVUPSYrm addr:$src)>;
555 def : Pat<(loadv32i8 addr:$src),
556 (VMOVUPSYrm addr:$src)>;
558 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
559 (VMOVAPSYmr addr:$dst, VR256:$src)>;
560 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
561 (VMOVAPSYmr addr:$dst, VR256:$src)>;
562 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
563 (VMOVAPSYmr addr:$dst, VR256:$src)>;
564 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
565 (VMOVAPSYmr addr:$dst, VR256:$src)>;
566 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
567 (VMOVUPSYmr addr:$dst, VR256:$src)>;
568 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
569 (VMOVUPSYmr addr:$dst, VR256:$src)>;
570 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
571 (VMOVUPSYmr addr:$dst, VR256:$src)>;
572 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
573 (VMOVUPSYmr addr:$dst, VR256:$src)>;
576 // Use movaps / movups for SSE integer load / store (one byte shorter).
577 // The instructions selected below are then converted to MOVDQA/MOVDQU
578 // during the SSE domain pass.
579 let Predicates = [UseSSE1] in {
580 def : Pat<(alignedloadv2i64 addr:$src),
581 (MOVAPSrm addr:$src)>;
582 def : Pat<(alignedloadv4i32 addr:$src),
583 (MOVAPSrm addr:$src)>;
584 def : Pat<(alignedloadv8i16 addr:$src),
585 (MOVAPSrm addr:$src)>;
586 def : Pat<(alignedloadv16i8 addr:$src),
587 (MOVAPSrm addr:$src)>;
588 def : Pat<(loadv2i64 addr:$src),
589 (MOVUPSrm addr:$src)>;
590 def : Pat<(loadv4i32 addr:$src),
591 (MOVUPSrm addr:$src)>;
592 def : Pat<(loadv8i16 addr:$src),
593 (MOVUPSrm addr:$src)>;
594 def : Pat<(loadv16i8 addr:$src),
595 (MOVUPSrm addr:$src)>;
597 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
598 (MOVAPSmr addr:$dst, VR128:$src)>;
599 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
600 (MOVAPSmr addr:$dst, VR128:$src)>;
601 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
602 (MOVAPSmr addr:$dst, VR128:$src)>;
603 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
604 (MOVAPSmr addr:$dst, VR128:$src)>;
605 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
606 (MOVUPSmr addr:$dst, VR128:$src)>;
607 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
608 (MOVUPSmr addr:$dst, VR128:$src)>;
609 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
610 (MOVUPSmr addr:$dst, VR128:$src)>;
611 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
612 (MOVUPSmr addr:$dst, VR128:$src)>;
615 //===----------------------------------------------------------------------===//
616 // SSE 1 & 2 - Move Low packed FP Instructions
617 //===----------------------------------------------------------------------===//
619 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
620 string base_opc, string asm_opr> {
621 // No pattern as they need be special cased between high and low.
622 let hasSideEffects = 0, mayLoad = 1 in
623 def PSrm : PI<opc, MRMSrcMem,
624 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
625 !strconcat(base_opc, "s", asm_opr),
626 [], SSEPackedSingle>, PS,
627 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
629 def PDrm : PI<opc, MRMSrcMem,
630 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
631 !strconcat(base_opc, "d", asm_opr),
632 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
633 (scalar_to_vector (loadf64 addr:$src2)))))],
634 SSEPackedDouble>, PD,
635 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
638 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
640 let Predicates = [UseAVX] in
641 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
642 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
645 let Constraints = "$src1 = $dst" in
646 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
647 "\t{$src2, $dst|$dst, $src2}">;
650 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
652 let SchedRW = [WriteFStore] in {
653 let Predicates = [UseAVX] in {
654 let mayStore = 1, hasSideEffects = 0 in
655 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
656 "movlps\t{$src, $dst|$dst, $src}",
659 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
660 "movlpd\t{$src, $dst|$dst, $src}",
661 [(store (f64 (extractelt (v2f64 VR128:$src),
662 (iPTR 0))), addr:$dst)]>,
665 let mayStore = 1, hasSideEffects = 0 in
666 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
667 "movlps\t{$src, $dst|$dst, $src}",
669 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
670 "movlpd\t{$src, $dst|$dst, $src}",
671 [(store (f64 (extractelt (v2f64 VR128:$src),
672 (iPTR 0))), addr:$dst)]>;
675 let Predicates = [UseSSE1] in {
676 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
677 // end up with a movsd or blend instead of shufp.
678 // No need for aligned load, we're only loading 64-bits.
679 def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
681 (MOVLPSrm VR128:$src1, addr:$src2)>;
682 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
683 (MOVLPSrm VR128:$src1, addr:$src2)>;
685 def : Pat<(v4f32 (X86vzload64 addr:$src)),
686 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
687 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
688 (MOVLPSmr addr:$dst, VR128:$src)>;
691 //===----------------------------------------------------------------------===//
692 // SSE 1 & 2 - Move Hi packed FP Instructions
693 //===----------------------------------------------------------------------===//
695 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
697 let SchedRW = [WriteFStore] in {
698 // v2f64 extract element 1 is always custom lowered to unpack high to low
699 // and extract element 0 so the non-store version isn't too horrible.
700 let Predicates = [UseAVX] in {
701 let mayStore = 1, hasSideEffects = 0 in
702 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703 "movhps\t{$src, $dst|$dst, $src}",
705 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
706 "movhpd\t{$src, $dst|$dst, $src}",
707 [(store (f64 (extractelt
708 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
709 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
711 let mayStore = 1, hasSideEffects = 0 in
712 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713 "movhps\t{$src, $dst|$dst, $src}",
715 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
716 "movhpd\t{$src, $dst|$dst, $src}",
717 [(store (f64 (extractelt
718 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
719 (iPTR 0))), addr:$dst)]>;
722 let Predicates = [UseAVX] in {
723 // Also handle an i64 load because that may get selected as a faster way to
725 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
726 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
727 (VMOVHPDrm VR128:$src1, addr:$src2)>;
728 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
729 (VMOVHPDrm VR128:$src1, addr:$src2)>;
731 def : Pat<(store (f64 (extractelt
732 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
733 (iPTR 0))), addr:$dst),
734 (VMOVHPDmr addr:$dst, VR128:$src)>;
737 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
738 (VMOVLPDrm VR128:$src1, addr:$src2)>;
741 let Predicates = [UseSSE1] in {
742 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
743 // end up with a movsd or blend instead of shufp.
744 // No need for aligned load, we're only loading 64-bits.
745 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
746 (MOVHPSrm VR128:$src1, addr:$src2)>;
747 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
748 (MOVHPSrm VR128:$src1, addr:$src2)>;
750 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
752 (MOVHPSmr addr:$dst, VR128:$src)>;
755 let Predicates = [UseSSE2] in {
758 // Also handle an i64 load because that may get selected as a faster way to
760 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
761 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
762 (MOVHPDrm VR128:$src1, addr:$src2)>;
763 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
764 (MOVHPDrm VR128:$src1, addr:$src2)>;
766 def : Pat<(store (f64 (extractelt
767 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
768 (iPTR 0))), addr:$dst),
769 (MOVHPDmr addr:$dst, VR128:$src)>;
772 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
773 (MOVLPDrm VR128:$src1, addr:$src2)>;
776 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
777 // Use MOVLPD to load into the low bits from a full vector unless we can use
779 def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
780 (MOVLPDrm VR128:$src1, addr:$src2)>;
783 //===----------------------------------------------------------------------===//
784 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
785 //===----------------------------------------------------------------------===//
787 let Predicates = [UseAVX] in {
788 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
789 (ins VR128:$src1, VR128:$src2),
790 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
792 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
793 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
794 let isCommutable = 1 in
795 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
796 (ins VR128:$src1, VR128:$src2),
797 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
799 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
800 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
803 let Constraints = "$src1 = $dst" in {
804 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
805 (ins VR128:$src1, VR128:$src2),
806 "movlhps\t{$src2, $dst|$dst, $src2}",
808 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
809 Sched<[SchedWriteFShuffle.XMM]>;
810 let isCommutable = 1 in
811 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
812 (ins VR128:$src1, VR128:$src2),
813 "movhlps\t{$src2, $dst|$dst, $src2}",
815 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
816 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
819 //===----------------------------------------------------------------------===//
820 // SSE 1 & 2 - Conversion Instructions
821 //===----------------------------------------------------------------------===//
823 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
824 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
825 string asm, string mem, X86FoldableSchedWrite sched,
826 SchedRead Int2Fpu = ReadDefault> {
827 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
828 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
829 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
830 Sched<[sched, Int2Fpu]>;
831 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
832 mem#"\t{$src, $dst|$dst, $src}",
833 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
834 Sched<[sched.Folded]>;
837 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
838 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
839 string asm, Domain d, X86FoldableSchedWrite sched> {
840 let hasSideEffects = 0 in {
841 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
842 [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
845 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
846 [(set RC:$dst, (DstTy (sint_to_fp
847 (SrcTy (ld_frag addr:$src)))))], d>,
848 Sched<[sched.Folded]>;
852 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
853 X86MemOperand x86memop, string asm, string mem,
854 X86FoldableSchedWrite sched> {
855 let hasSideEffects = 0, Predicates = [UseAVX] in {
856 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
857 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
858 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
860 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
861 (ins DstRC:$src1, x86memop:$src),
862 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
863 Sched<[sched.Folded, sched.ReadAfterFold]>;
864 } // hasSideEffects = 0
867 let isCodeGenOnly = 1, Predicates = [UseAVX] in {
868 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
869 "cvttss2si", "cvttss2si",
872 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
873 "cvttss2si", "cvttss2si",
875 XS, VEX, VEX_W, VEX_LIG;
876 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
877 "cvttsd2si", "cvttsd2si",
880 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
881 "cvttsd2si", "cvttsd2si",
883 XD, VEX, VEX_W, VEX_LIG;
886 // The assembler can recognize rr 64-bit instructions by seeing a rxx
887 // register, but the same isn't true when only using memory operands,
888 // provide other assembly "l" and "q" forms to address this explicitly
889 // where appropriate to do so.
890 let isCodeGenOnly = 1 in {
891 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
892 WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
893 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
894 WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
895 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
896 WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
897 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
898 WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
899 } // isCodeGenOnly = 1
901 let Predicates = [UseAVX] in {
902 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
903 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
904 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
905 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
906 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
907 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
908 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
909 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
911 def : Pat<(f32 (sint_to_fp GR32:$src)),
912 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
913 def : Pat<(f32 (sint_to_fp GR64:$src)),
914 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
915 def : Pat<(f64 (sint_to_fp GR32:$src)),
916 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
917 def : Pat<(f64 (sint_to_fp GR64:$src)),
918 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
921 let isCodeGenOnly = 1 in {
922 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
923 "cvttss2si", "cvttss2si",
925 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
926 "cvttss2si", "cvttss2si",
927 WriteCvtSS2I>, XS, REX_W;
928 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
929 "cvttsd2si", "cvttsd2si",
931 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
932 "cvttsd2si", "cvttsd2si",
933 WriteCvtSD2I>, XD, REX_W;
934 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
935 "cvtsi2ss", "cvtsi2ss{l}",
936 WriteCvtI2SS, ReadInt2Fpu>, XS;
937 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
938 "cvtsi2ss", "cvtsi2ss{q}",
939 WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
940 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
941 "cvtsi2sd", "cvtsi2sd{l}",
942 WriteCvtI2SD, ReadInt2Fpu>, XD;
943 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
944 "cvtsi2sd", "cvtsi2sd{q}",
945 WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
946 } // isCodeGenOnly = 1
948 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
949 // and/or XMM operand(s).
951 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
952 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
953 Operand memop, ComplexPattern mem_cpat, string asm,
954 X86FoldableSchedWrite sched> {
955 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
956 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
957 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
959 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
960 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
961 [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
962 Sched<[sched.Folded]>;
965 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
966 RegisterClass DstRC, X86MemOperand x86memop,
967 string asm, string mem, X86FoldableSchedWrite sched,
969 let hasSideEffects = 0 in {
970 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
972 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
973 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
974 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
976 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
977 (ins DstRC:$src1, x86memop:$src2),
979 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
980 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
981 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
985 let Predicates = [UseAVX] in {
986 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
987 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
988 WriteCvtSD2I>, XD, VEX, VEX_LIG;
989 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
990 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
991 WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
993 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
994 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
995 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
996 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
999 let Predicates = [UseAVX] in {
1000 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1001 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
1002 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1003 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
1004 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1005 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
1006 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1007 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
1009 let Constraints = "$src1 = $dst" in {
1010 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1011 i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
1012 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1013 i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
1014 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1015 i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
1016 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1017 i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
1020 def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1021 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1022 def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1023 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1024 def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1025 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1026 def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1027 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1029 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1030 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1031 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1032 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1034 def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1035 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1036 def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1037 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1038 def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1039 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1040 def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1041 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1043 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1044 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1045 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1046 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1050 // Aliases for intrinsics
1051 let Predicates = [UseAVX] in {
1052 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1053 ssmem, sse_load_f32, "cvttss2si",
1054 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1055 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1056 X86cvtts2Int, ssmem, sse_load_f32,
1057 "cvttss2si", WriteCvtSS2I>,
1058 XS, VEX, VEX_LIG, VEX_W;
1059 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1060 sdmem, sse_load_f64, "cvttsd2si",
1061 WriteCvtSS2I>, XD, VEX, VEX_LIG;
1062 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1063 X86cvtts2Int, sdmem, sse_load_f64,
1064 "cvttsd2si", WriteCvtSS2I>,
1065 XD, VEX, VEX_LIG, VEX_W;
1067 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1068 ssmem, sse_load_f32, "cvttss2si",
1070 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1071 X86cvtts2Int, ssmem, sse_load_f32,
1072 "cvttss2si", WriteCvtSS2I>, XS, REX_W;
1073 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1074 sdmem, sse_load_f64, "cvttsd2si",
1076 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1077 X86cvtts2Int, sdmem, sse_load_f64,
1078 "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
1080 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1081 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1082 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1083 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1084 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1085 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1086 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1087 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1088 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1089 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1090 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1091 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1092 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1093 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1094 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1095 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1097 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1098 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1099 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1100 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1101 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1102 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1103 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1104 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1105 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1106 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1107 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1108 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1109 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1110 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1111 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1112 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1114 let Predicates = [UseAVX] in {
1115 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1116 ssmem, sse_load_f32, "cvtss2si",
1117 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1118 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1119 ssmem, sse_load_f32, "cvtss2si",
1120 WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
1122 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1123 ssmem, sse_load_f32, "cvtss2si",
1125 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1126 ssmem, sse_load_f32, "cvtss2si",
1127 WriteCvtSS2I>, XS, REX_W;
1129 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1130 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1131 SSEPackedSingle, WriteCvtI2PS>,
1132 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1133 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1134 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1135 SSEPackedSingle, WriteCvtI2PSY>,
1136 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1138 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1139 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1140 SSEPackedSingle, WriteCvtI2PS>,
1141 PS, Requires<[UseSSE2]>;
1144 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1145 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1146 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1147 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1148 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1149 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1150 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1151 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1152 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1153 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1154 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1155 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1156 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1157 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1158 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1159 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1162 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1163 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1164 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1165 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1166 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1167 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1168 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1169 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1170 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1171 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1172 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1173 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1174 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1175 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1176 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1177 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1181 // Convert scalar double to scalar single
1182 let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
1183 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1184 (ins FR32:$src1, FR64:$src2),
1185 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1186 VEX_4V, VEX_LIG, VEX_WIG,
1187 Sched<[WriteCvtSD2SS]>;
1189 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1190 (ins FR32:$src1, f64mem:$src2),
1191 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1192 XD, VEX_4V, VEX_LIG, VEX_WIG,
1193 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1196 def : Pat<(f32 (fpround FR64:$src)),
1197 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1200 let isCodeGenOnly = 1 in {
1201 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1202 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1203 [(set FR32:$dst, (fpround FR64:$src))]>,
1204 Sched<[WriteCvtSD2SS]>;
1205 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1206 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1207 [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
1208 XD, Requires<[UseSSE2, OptForSize]>,
1209 Sched<[WriteCvtSD2SS.Folded]>;
1212 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1213 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1214 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1216 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1217 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1218 Sched<[WriteCvtSD2SS]>;
1219 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1220 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1221 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1223 (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
1224 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1225 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1226 let Constraints = "$src1 = $dst" in {
1227 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1228 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1229 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1231 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1232 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1233 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1234 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1235 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1237 (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
1238 XD, Requires<[UseSSE2]>,
1239 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1242 // Convert scalar single to scalar double
1243 // SSE2 instructions with XS prefix
1244 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1245 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1246 (ins FR64:$src1, FR32:$src2),
1247 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1248 XS, VEX_4V, VEX_LIG, VEX_WIG,
1249 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
1251 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1252 (ins FR64:$src1, f32mem:$src2),
1253 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1254 XS, VEX_4V, VEX_LIG, VEX_WIG,
1255 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1256 Requires<[UseAVX, OptForSize]>;
1257 } // isCodeGenOnly = 1, hasSideEffects = 0
1259 def : Pat<(f64 (fpextend FR32:$src)),
1260 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1261 def : Pat<(fpextend (loadf32 addr:$src)),
1262 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1264 let isCodeGenOnly = 1 in {
1265 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1266 "cvtss2sd\t{$src, $dst|$dst, $src}",
1267 [(set FR64:$dst, (fpextend FR32:$src))]>,
1268 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
1269 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1270 "cvtss2sd\t{$src, $dst|$dst, $src}",
1271 [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
1272 XS, Requires<[UseSSE2, OptForSize]>,
1273 Sched<[WriteCvtSS2SD.Folded]>;
1274 } // isCodeGenOnly = 1
1276 let hasSideEffects = 0 in {
1277 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1278 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1279 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1280 []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1281 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1283 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1284 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1285 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1286 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1287 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1288 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1289 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1290 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1291 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1292 []>, XS, Requires<[UseSSE2]>,
1293 Sched<[WriteCvtSS2SD]>;
1295 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1296 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1297 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1298 []>, XS, Requires<[UseSSE2]>,
1299 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1301 } // hasSideEffects = 0
1303 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1304 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1305 // vmovs{s,d} instructions
1306 let Predicates = [UseAVX] in {
1307 def : Pat<(v4f32 (X86Movss
1309 (v4f32 (scalar_to_vector
1310 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1311 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1313 def : Pat<(v2f64 (X86Movsd
1315 (v2f64 (scalar_to_vector
1316 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1317 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1319 def : Pat<(v4f32 (X86Movss
1321 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1322 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1324 def : Pat<(v4f32 (X86Movss
1326 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1327 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1329 def : Pat<(v4f32 (X86Movss
1331 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1332 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1334 def : Pat<(v4f32 (X86Movss
1336 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1337 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1339 def : Pat<(v2f64 (X86Movsd
1341 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1342 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1344 def : Pat<(v2f64 (X86Movsd
1346 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1347 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1349 def : Pat<(v2f64 (X86Movsd
1351 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1352 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1354 def : Pat<(v2f64 (X86Movsd
1356 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1357 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1358 } // Predicates = [UseAVX]
1360 let Predicates = [UseSSE2] in {
1361 def : Pat<(v4f32 (X86Movss
1363 (v4f32 (scalar_to_vector
1364 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1365 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1367 def : Pat<(v2f64 (X86Movsd
1369 (v2f64 (scalar_to_vector
1370 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1371 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1373 def : Pat<(v2f64 (X86Movsd
1375 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1376 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1378 def : Pat<(v2f64 (X86Movsd
1380 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1381 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1383 def : Pat<(v2f64 (X86Movsd
1385 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1386 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1388 def : Pat<(v2f64 (X86Movsd
1390 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1391 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1392 } // Predicates = [UseSSE2]
1394 let Predicates = [UseSSE1] in {
1395 def : Pat<(v4f32 (X86Movss
1397 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1398 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1400 def : Pat<(v4f32 (X86Movss
1402 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1403 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1405 def : Pat<(v4f32 (X86Movss
1407 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1408 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1410 def : Pat<(v4f32 (X86Movss
1412 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1413 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1414 } // Predicates = [UseSSE1]
1416 let Predicates = [HasAVX, NoVLX] in {
1417 // Convert packed single/double fp to doubleword
1418 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1419 "cvtps2dq\t{$src, $dst|$dst, $src}",
1420 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1421 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1422 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1423 "cvtps2dq\t{$src, $dst|$dst, $src}",
1425 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1426 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1427 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1428 "cvtps2dq\t{$src, $dst|$dst, $src}",
1430 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1431 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1432 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1433 "cvtps2dq\t{$src, $dst|$dst, $src}",
1435 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1436 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1438 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1439 "cvtps2dq\t{$src, $dst|$dst, $src}",
1440 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1441 Sched<[WriteCvtPS2I]>;
1442 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1443 "cvtps2dq\t{$src, $dst|$dst, $src}",
1445 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1446 Sched<[WriteCvtPS2ILd]>;
1449 // Convert Packed Double FP to Packed DW Integers
1450 let Predicates = [HasAVX, NoVLX] in {
1451 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1452 // register, but the same isn't true when using memory operands instead.
1453 // Provide other assembly rr and rm forms to address this explicitly.
1454 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1455 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1457 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1458 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1461 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1462 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1464 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1465 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1468 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1469 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1471 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1472 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1473 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1474 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1476 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1477 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1480 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1481 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1482 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1483 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1485 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1486 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1488 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1489 Sched<[WriteCvtPD2ILd]>;
1490 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1491 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1493 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1494 Sched<[WriteCvtPD2I]>;
1496 // Convert with truncation packed single/double fp to doubleword
1497 // SSE2 packed instructions with XS prefix
1498 let Predicates = [HasAVX, NoVLX] in {
1499 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1500 "cvttps2dq\t{$src, $dst|$dst, $src}",
1502 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1503 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1504 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1505 "cvttps2dq\t{$src, $dst|$dst, $src}",
1507 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
1508 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1509 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1510 "cvttps2dq\t{$src, $dst|$dst, $src}",
1512 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
1513 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1514 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1515 "cvttps2dq\t{$src, $dst|$dst, $src}",
1517 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
1519 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1522 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1523 "cvttps2dq\t{$src, $dst|$dst, $src}",
1525 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1526 Sched<[WriteCvtPS2I]>;
1527 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1528 "cvttps2dq\t{$src, $dst|$dst, $src}",
1530 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
1531 Sched<[WriteCvtPS2ILd]>;
1533 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1534 // register, but the same isn't true when using memory operands instead.
1535 // Provide other assembly rr and rm forms to address this explicitly.
1536 let Predicates = [HasAVX, NoVLX] in {
1538 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1539 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1541 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1542 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1543 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1544 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1546 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
1547 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1550 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1551 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1553 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
1554 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1555 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1556 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1558 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
1559 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1560 } // Predicates = [HasAVX, NoVLX]
1562 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1563 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1564 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1565 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1567 let Predicates = [HasAVX, NoVLX] in {
1568 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
1569 (VCVTTPD2DQYrr VR256:$src)>;
1570 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
1571 (VCVTTPD2DQYrm addr:$src)>;
1574 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1575 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1577 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1578 Sched<[WriteCvtPD2I]>;
1579 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1580 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1582 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
1583 Sched<[WriteCvtPD2ILd]>;
1585 // Convert packed single to packed double
1586 let Predicates = [HasAVX, NoVLX] in {
1587 // SSE2 instructions without OpSize prefix
1588 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1589 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1590 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1591 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1592 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1593 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1594 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1595 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1596 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1597 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1598 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
1599 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1600 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1601 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1602 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1603 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1606 let Predicates = [UseSSE2] in {
1607 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1608 "cvtps2pd\t{$src, $dst|$dst, $src}",
1609 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1610 PS, Sched<[WriteCvtPS2PD]>;
1611 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1612 "cvtps2pd\t{$src, $dst|$dst, $src}",
1613 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1614 PS, Sched<[WriteCvtPS2PD.Folded]>;
1617 // Convert Packed DW Integers to Packed Double FP
1618 let Predicates = [HasAVX, NoVLX] in {
1619 let hasSideEffects = 0, mayLoad = 1 in
1620 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1621 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1623 (v2f64 (X86VSintToFP
1625 (v2i64 (scalar_to_vector
1626 (loadi64 addr:$src)))))))]>,
1627 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1628 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1629 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1631 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1632 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1633 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1634 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1636 (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
1637 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1639 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1640 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1642 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
1643 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1646 let hasSideEffects = 0, mayLoad = 1 in
1647 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1648 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1650 (v2f64 (X86VSintToFP
1652 (v2i64 (scalar_to_vector
1653 (loadi64 addr:$src)))))))]>,
1654 Sched<[WriteCvtI2PDLd]>;
1655 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1656 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1658 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1659 Sched<[WriteCvtI2PD]>;
1661 // AVX register conversion intrinsics
1662 let Predicates = [HasAVX, NoVLX] in {
1663 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1664 (VCVTDQ2PDrm addr:$src)>;
1665 } // Predicates = [HasAVX, NoVLX]
1667 // SSE2 register conversion intrinsics
1668 let Predicates = [UseSSE2] in {
1669 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1670 (CVTDQ2PDrm addr:$src)>;
1671 } // Predicates = [UseSSE2]
1673 // Convert packed double to packed single
1674 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1675 // register, but the same isn't true when using memory operands instead.
1676 // Provide other assembly rr and rm forms to address this explicitly.
1677 let Predicates = [HasAVX, NoVLX] in {
1679 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1680 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1681 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1682 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1683 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1684 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1685 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
1686 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1688 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1689 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1690 [(set VR128:$dst, (X86vfpround VR256:$src))]>,
1691 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1692 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1693 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1694 [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
1695 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1696 } // Predicates = [HasAVX, NoVLX]
1698 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1699 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1700 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1701 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1703 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1704 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1705 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1706 Sched<[WriteCvtPD2PS]>;
1707 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1708 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1709 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
1710 Sched<[WriteCvtPD2PS.Folded]>;
1712 let Predicates = [HasAVX, NoVLX] in {
1713 def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
1714 (VCVTPD2PSYrr VR256:$src)>;
1715 def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
1716 (VCVTPD2PSYrm addr:$src)>;
1719 //===----------------------------------------------------------------------===//
1720 // SSE 1 & 2 - Compare Instructions
1721 //===----------------------------------------------------------------------===//
1723 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
1724 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1725 SDNode OpNode, ValueType VT,
1726 PatFrag ld_frag, string asm,
1727 X86FoldableSchedWrite sched> {
1728 let isCommutable = 1 in
1729 def rr : SIi8<0xC2, MRMSrcReg,
1730 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1731 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
1733 def rm : SIi8<0xC2, MRMSrcMem,
1734 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1735 [(set RC:$dst, (OpNode (VT RC:$src1),
1736 (ld_frag addr:$src2), timm:$cc))]>,
1737 Sched<[sched.Folded, sched.ReadAfterFold]>;
1740 let isCodeGenOnly = 1 in {
1741 let ExeDomain = SSEPackedSingle in
1742 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1743 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1744 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1745 let ExeDomain = SSEPackedDouble in
1746 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1747 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1748 SchedWriteFCmpSizes.PD.Scl>,
1749 XD, VEX_4V, VEX_LIG, VEX_WIG;
1751 let Constraints = "$src1 = $dst" in {
1752 let ExeDomain = SSEPackedSingle in
1753 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1754 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1755 SchedWriteFCmpSizes.PS.Scl>, XS;
1756 let ExeDomain = SSEPackedDouble in
1757 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1758 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1759 SchedWriteFCmpSizes.PD.Scl>, XD;
1763 multiclass sse12_cmp_scalar_int<Operand memop,
1764 Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1765 ComplexPattern mem_cpat> {
1766 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1767 (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
1768 [(set VR128:$dst, (Int VR128:$src1,
1769 VR128:$src, timm:$cc))]>,
1772 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1773 (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
1774 [(set VR128:$dst, (Int VR128:$src1,
1775 mem_cpat:$src, timm:$cc))]>,
1776 Sched<[sched.Folded, sched.ReadAfterFold]>;
1779 // Aliases to match intrinsics which expect XMM operand(s).
1780 let ExeDomain = SSEPackedSingle in
1781 defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1782 "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1783 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1784 XS, VEX_4V, VEX_LIG, VEX_WIG;
1785 let ExeDomain = SSEPackedDouble in
1786 defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1787 "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1788 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1789 XD, VEX_4V, VEX_LIG, VEX_WIG;
1790 let Constraints = "$src1 = $dst" in {
1791 let ExeDomain = SSEPackedSingle in
1792 defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1793 "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
1794 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1795 let ExeDomain = SSEPackedDouble in
1796 defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1797 "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
1798 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1802 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
1803 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1804 ValueType vt, X86MemOperand x86memop,
1805 PatFrag ld_frag, string OpcodeStr,
1806 X86FoldableSchedWrite sched> {
1807 let hasSideEffects = 0 in {
1808 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1809 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1810 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1813 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1814 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1815 [(set EFLAGS, (OpNode (vt RC:$src1),
1816 (ld_frag addr:$src2)))]>,
1817 Sched<[sched.Folded, sched.ReadAfterFold]>;
1821 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
1822 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1823 ValueType vt, Operand memop,
1824 ComplexPattern mem_cpat, string OpcodeStr,
1825 X86FoldableSchedWrite sched> {
1826 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1827 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1828 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1831 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1832 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1833 [(set EFLAGS, (OpNode (vt RC:$src1),
1835 Sched<[sched.Folded, sched.ReadAfterFold]>;
1838 let Defs = [EFLAGS] in {
1839 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1840 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1841 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1842 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1843 let Pattern = []<dag> in {
1844 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1845 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1846 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1847 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1850 let isCodeGenOnly = 1 in {
1851 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1852 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1853 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1854 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1856 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1857 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1858 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1859 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1861 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1862 "ucomiss", WriteFCom>, PS;
1863 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1864 "ucomisd", WriteFCom>, PD;
1866 let Pattern = []<dag> in {
1867 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1868 "comiss", WriteFCom>, PS;
1869 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1870 "comisd", WriteFCom>, PD;
1873 let isCodeGenOnly = 1 in {
1874 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1875 sse_load_f32, "ucomiss", WriteFCom>, PS;
1876 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1877 sse_load_f64, "ucomisd", WriteFCom>, PD;
1879 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1880 sse_load_f32, "comiss", WriteFCom>, PS;
1881 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1882 sse_load_f64, "comisd", WriteFCom>, PD;
1884 } // Defs = [EFLAGS]
1886 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
1887 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1888 ValueType VT, string asm,
1889 X86FoldableSchedWrite sched,
1890 Domain d, PatFrag ld_frag> {
1891 let isCommutable = 1 in
1892 def rri : PIi8<0xC2, MRMSrcReg,
1893 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1894 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1896 def rmi : PIi8<0xC2, MRMSrcMem,
1897 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1899 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1900 Sched<[sched.Folded, sched.ReadAfterFold]>;
1903 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1904 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1905 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1906 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1907 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1908 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1909 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1910 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1911 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1912 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1913 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1914 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
1915 let Constraints = "$src1 = $dst" in {
1916 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1917 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1918 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1919 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1920 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1921 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1924 def CommutableCMPCC : PatLeaf<(timm), [{
1925 uint64_t Imm = N->getZExtValue() & 0x7;
1926 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1929 // Patterns to select compares with loads in first operand.
1930 let Predicates = [HasAVX] in {
1931 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
1932 CommutableCMPCC:$cc)),
1933 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1935 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
1936 CommutableCMPCC:$cc)),
1937 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1939 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
1940 CommutableCMPCC:$cc)),
1941 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1943 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
1944 CommutableCMPCC:$cc)),
1945 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1947 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1948 CommutableCMPCC:$cc)),
1949 (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1951 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1952 CommutableCMPCC:$cc)),
1953 (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1956 let Predicates = [UseSSE2] in {
1957 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
1958 CommutableCMPCC:$cc)),
1959 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1961 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1962 CommutableCMPCC:$cc)),
1963 (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1966 let Predicates = [UseSSE1] in {
1967 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
1968 CommutableCMPCC:$cc)),
1969 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1971 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1972 CommutableCMPCC:$cc)),
1973 (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1976 //===----------------------------------------------------------------------===//
1977 // SSE 1 & 2 - Shuffle Instructions
1978 //===----------------------------------------------------------------------===//
1980 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
1981 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
1982 ValueType vt, string asm, PatFrag mem_frag,
1983 X86FoldableSchedWrite sched, Domain d,
1984 bit IsCommutable = 0> {
1985 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
1986 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
1987 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
1988 (i8 timm:$src3))))], d>,
1989 Sched<[sched.Folded, sched.ReadAfterFold]>;
1990 let isCommutable = IsCommutable in
1991 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
1992 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
1993 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
1994 (i8 timm:$src3))))], d>,
1998 let Predicates = [HasAVX, NoVLX] in {
1999 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2000 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2001 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2002 PS, VEX_4V, VEX_WIG;
2003 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2004 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2005 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2006 PS, VEX_4V, VEX_L, VEX_WIG;
2007 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2008 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2009 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2010 PD, VEX_4V, VEX_WIG;
2011 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2012 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2013 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2014 PD, VEX_4V, VEX_L, VEX_WIG;
2016 let Constraints = "$src1 = $dst" in {
2017 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2018 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2019 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2020 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2021 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2022 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2025 //===----------------------------------------------------------------------===//
2026 // SSE 1 & 2 - Unpack FP Instructions
2027 //===----------------------------------------------------------------------===//
2029 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2030 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2031 PatFrag mem_frag, RegisterClass RC,
2032 X86MemOperand x86memop, string asm,
2033 X86FoldableSchedWrite sched, Domain d,
2034 bit IsCommutable = 0> {
2035 let isCommutable = IsCommutable in
2036 def rr : PI<opc, MRMSrcReg,
2037 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2039 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2041 def rm : PI<opc, MRMSrcMem,
2042 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2044 (vt (OpNode RC:$src1,
2045 (mem_frag addr:$src2))))], d>,
2046 Sched<[sched.Folded, sched.ReadAfterFold]>;
2049 let Predicates = [HasAVX, NoVLX] in {
2050 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2051 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2052 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2053 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2054 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2055 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2056 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2057 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2058 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2059 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2060 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2061 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2063 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2064 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2065 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2066 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2067 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2068 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2069 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2070 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2071 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2072 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2073 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2074 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2075 }// Predicates = [HasAVX, NoVLX]
2077 let Constraints = "$src1 = $dst" in {
2078 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2079 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2080 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2081 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2082 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2083 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2084 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2085 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2086 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2087 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2088 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2089 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2090 } // Constraints = "$src1 = $dst"
2092 let Predicates = [HasAVX1Only] in {
2093 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2094 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2095 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2096 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2097 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2098 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2099 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2100 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2102 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2103 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2104 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2105 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2106 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2107 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2108 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2109 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2112 let Predicates = [UseSSE2] in {
2113 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2114 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2115 (v2f64 (simple_load addr:$src2)))),
2116 (MOVHPDrm VR128:$src1, addr:$src2)>;
2119 //===----------------------------------------------------------------------===//
2120 // SSE 1 & 2 - Extract Floating-Point Sign mask
2121 //===----------------------------------------------------------------------===//
2123 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2124 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2125 string asm, Domain d> {
2126 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2127 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2128 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2129 Sched<[WriteFMOVMSK]>;
2132 let Predicates = [HasAVX] in {
2133 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2134 SSEPackedSingle>, PS, VEX, VEX_WIG;
2135 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2136 SSEPackedDouble>, PD, VEX, VEX_WIG;
2137 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2138 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2139 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2140 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2142 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2143 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2144 (VMOVMSKPSrr VR128:$src)>;
2145 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2146 (VMOVMSKPDrr VR128:$src)>;
2147 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2148 (VMOVMSKPSYrr VR256:$src)>;
2149 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2150 (VMOVMSKPDYrr VR256:$src)>;
2153 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2154 SSEPackedSingle>, PS;
2155 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2156 SSEPackedDouble>, PD;
2158 let Predicates = [UseSSE2] in {
2159 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2160 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2161 (MOVMSKPSrr VR128:$src)>;
2162 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2163 (MOVMSKPDrr VR128:$src)>;
2166 //===---------------------------------------------------------------------===//
2167 // SSE2 - Packed Integer Logical Instructions
2168 //===---------------------------------------------------------------------===//
2170 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2172 /// PDI_binop_rm - Simple SSE2 binary operator.
2173 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2174 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2175 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2176 bit IsCommutable, bit Is2Addr> {
2177 let isCommutable = IsCommutable in
2178 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2179 (ins RC:$src1, RC:$src2),
2181 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2182 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2183 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2185 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2186 (ins RC:$src1, x86memop:$src2),
2188 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2189 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2190 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2191 Sched<[sched.Folded, sched.ReadAfterFold]>;
2193 } // ExeDomain = SSEPackedInt
2195 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2196 ValueType OpVT128, ValueType OpVT256,
2197 X86SchedWriteWidths sched, bit IsCommutable,
2199 let Predicates = [HasAVX, prd] in
2200 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2201 VR128, load, i128mem, sched.XMM,
2202 IsCommutable, 0>, VEX_4V, VEX_WIG;
2204 let Constraints = "$src1 = $dst" in
2205 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2206 memop, i128mem, sched.XMM, IsCommutable, 1>;
2208 let Predicates = [HasAVX2, prd] in
2209 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2210 OpVT256, VR256, load, i256mem, sched.YMM,
2211 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2214 // These are ordered here for pattern ordering requirements with the fp versions
2216 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2217 SchedWriteVecLogic, 1, NoVLX>;
2218 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2219 SchedWriteVecLogic, 1, NoVLX>;
2220 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2221 SchedWriteVecLogic, 1, NoVLX>;
2222 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2223 SchedWriteVecLogic, 0, NoVLX>;
2225 //===----------------------------------------------------------------------===//
2226 // SSE 1 & 2 - Logical Instructions
2227 //===----------------------------------------------------------------------===//
2229 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2231 /// There are no patterns here because isel prefers integer versions for SSE2
2232 /// and later. There are SSE1 v4f32 patterns later.
2233 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2234 SDNode OpNode, X86SchedWriteWidths sched> {
2235 let Predicates = [HasAVX, NoVLX] in {
2236 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2237 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2238 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2240 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2241 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2242 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2244 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2245 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2246 [], [], 0>, PS, VEX_4V, VEX_WIG;
2248 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2249 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2250 [], [], 0>, PD, VEX_4V, VEX_WIG;
2253 let Constraints = "$src1 = $dst" in {
2254 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2255 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2258 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2259 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2264 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2265 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2266 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2267 let isCommutable = 0 in
2268 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
2270 let Predicates = [HasAVX2, NoVLX] in {
2271 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2272 (VPANDYrr VR256:$src1, VR256:$src2)>;
2273 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2274 (VPANDYrr VR256:$src1, VR256:$src2)>;
2275 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2276 (VPANDYrr VR256:$src1, VR256:$src2)>;
2278 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2279 (VPORYrr VR256:$src1, VR256:$src2)>;
2280 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2281 (VPORYrr VR256:$src1, VR256:$src2)>;
2282 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2283 (VPORYrr VR256:$src1, VR256:$src2)>;
2285 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2286 (VPXORYrr VR256:$src1, VR256:$src2)>;
2287 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2288 (VPXORYrr VR256:$src1, VR256:$src2)>;
2289 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2290 (VPXORYrr VR256:$src1, VR256:$src2)>;
2292 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2293 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2294 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2295 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2296 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2297 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2299 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2300 (VPANDYrm VR256:$src1, addr:$src2)>;
2301 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2302 (VPANDYrm VR256:$src1, addr:$src2)>;
2303 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2304 (VPANDYrm VR256:$src1, addr:$src2)>;
2306 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2307 (VPORYrm VR256:$src1, addr:$src2)>;
2308 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2309 (VPORYrm VR256:$src1, addr:$src2)>;
2310 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2311 (VPORYrm VR256:$src1, addr:$src2)>;
2313 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2314 (VPXORYrm VR256:$src1, addr:$src2)>;
2315 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2316 (VPXORYrm VR256:$src1, addr:$src2)>;
2317 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2318 (VPXORYrm VR256:$src1, addr:$src2)>;
2320 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2321 (VPANDNYrm VR256:$src1, addr:$src2)>;
2322 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2323 (VPANDNYrm VR256:$src1, addr:$src2)>;
2324 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2325 (VPANDNYrm VR256:$src1, addr:$src2)>;
2328 // If only AVX1 is supported, we need to handle integer operations with
2329 // floating point instructions since the integer versions aren't available.
2330 let Predicates = [HasAVX1Only] in {
2331 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2332 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2333 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2334 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2335 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2336 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2337 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2338 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2340 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2341 (VORPSYrr VR256:$src1, VR256:$src2)>;
2342 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2343 (VORPSYrr VR256:$src1, VR256:$src2)>;
2344 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2345 (VORPSYrr VR256:$src1, VR256:$src2)>;
2346 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2347 (VORPSYrr VR256:$src1, VR256:$src2)>;
2349 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2350 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2351 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2352 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2353 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2354 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2355 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2356 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2358 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2359 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2360 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2361 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2362 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2363 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2364 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2365 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2367 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2368 (VANDPSYrm VR256:$src1, addr:$src2)>;
2369 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2370 (VANDPSYrm VR256:$src1, addr:$src2)>;
2371 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2372 (VANDPSYrm VR256:$src1, addr:$src2)>;
2373 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2374 (VANDPSYrm VR256:$src1, addr:$src2)>;
2376 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2377 (VORPSYrm VR256:$src1, addr:$src2)>;
2378 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2379 (VORPSYrm VR256:$src1, addr:$src2)>;
2380 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2381 (VORPSYrm VR256:$src1, addr:$src2)>;
2382 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2383 (VORPSYrm VR256:$src1, addr:$src2)>;
2385 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2386 (VXORPSYrm VR256:$src1, addr:$src2)>;
2387 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2388 (VXORPSYrm VR256:$src1, addr:$src2)>;
2389 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2390 (VXORPSYrm VR256:$src1, addr:$src2)>;
2391 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2392 (VXORPSYrm VR256:$src1, addr:$src2)>;
2394 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2395 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2396 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2397 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2398 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2399 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2400 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2401 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2404 let Predicates = [HasAVX, NoVLX] in {
2405 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2406 (VPANDrr VR128:$src1, VR128:$src2)>;
2407 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2408 (VPANDrr VR128:$src1, VR128:$src2)>;
2409 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2410 (VPANDrr VR128:$src1, VR128:$src2)>;
2412 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2413 (VPORrr VR128:$src1, VR128:$src2)>;
2414 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2415 (VPORrr VR128:$src1, VR128:$src2)>;
2416 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2417 (VPORrr VR128:$src1, VR128:$src2)>;
2419 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2420 (VPXORrr VR128:$src1, VR128:$src2)>;
2421 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2422 (VPXORrr VR128:$src1, VR128:$src2)>;
2423 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2424 (VPXORrr VR128:$src1, VR128:$src2)>;
2426 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2427 (VPANDNrr VR128:$src1, VR128:$src2)>;
2428 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2429 (VPANDNrr VR128:$src1, VR128:$src2)>;
2430 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2431 (VPANDNrr VR128:$src1, VR128:$src2)>;
2433 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2434 (VPANDrm VR128:$src1, addr:$src2)>;
2435 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2436 (VPANDrm VR128:$src1, addr:$src2)>;
2437 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2438 (VPANDrm VR128:$src1, addr:$src2)>;
2440 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2441 (VPORrm VR128:$src1, addr:$src2)>;
2442 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2443 (VPORrm VR128:$src1, addr:$src2)>;
2444 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2445 (VPORrm VR128:$src1, addr:$src2)>;
2447 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2448 (VPXORrm VR128:$src1, addr:$src2)>;
2449 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2450 (VPXORrm VR128:$src1, addr:$src2)>;
2451 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2452 (VPXORrm VR128:$src1, addr:$src2)>;
2454 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2455 (VPANDNrm VR128:$src1, addr:$src2)>;
2456 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2457 (VPANDNrm VR128:$src1, addr:$src2)>;
2458 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2459 (VPANDNrm VR128:$src1, addr:$src2)>;
2462 let Predicates = [UseSSE2] in {
2463 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2464 (PANDrr VR128:$src1, VR128:$src2)>;
2465 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2466 (PANDrr VR128:$src1, VR128:$src2)>;
2467 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2468 (PANDrr VR128:$src1, VR128:$src2)>;
2470 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2471 (PORrr VR128:$src1, VR128:$src2)>;
2472 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2473 (PORrr VR128:$src1, VR128:$src2)>;
2474 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2475 (PORrr VR128:$src1, VR128:$src2)>;
2477 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2478 (PXORrr VR128:$src1, VR128:$src2)>;
2479 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2480 (PXORrr VR128:$src1, VR128:$src2)>;
2481 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2482 (PXORrr VR128:$src1, VR128:$src2)>;
2484 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2485 (PANDNrr VR128:$src1, VR128:$src2)>;
2486 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2487 (PANDNrr VR128:$src1, VR128:$src2)>;
2488 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2489 (PANDNrr VR128:$src1, VR128:$src2)>;
2491 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2492 (PANDrm VR128:$src1, addr:$src2)>;
2493 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2494 (PANDrm VR128:$src1, addr:$src2)>;
2495 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2496 (PANDrm VR128:$src1, addr:$src2)>;
2498 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2499 (PORrm VR128:$src1, addr:$src2)>;
2500 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2501 (PORrm VR128:$src1, addr:$src2)>;
2502 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2503 (PORrm VR128:$src1, addr:$src2)>;
2505 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2506 (PXORrm VR128:$src1, addr:$src2)>;
2507 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2508 (PXORrm VR128:$src1, addr:$src2)>;
2509 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2510 (PXORrm VR128:$src1, addr:$src2)>;
2512 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2513 (PANDNrm VR128:$src1, addr:$src2)>;
2514 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2515 (PANDNrm VR128:$src1, addr:$src2)>;
2516 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2517 (PANDNrm VR128:$src1, addr:$src2)>;
2520 // Patterns for packed operations when we don't have integer type available.
2521 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2522 (ANDPSrr VR128:$src1, VR128:$src2)>;
2523 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2524 (ORPSrr VR128:$src1, VR128:$src2)>;
2525 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2526 (XORPSrr VR128:$src1, VR128:$src2)>;
2527 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2528 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2530 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2531 (ANDPSrm VR128:$src1, addr:$src2)>;
2532 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2533 (ORPSrm VR128:$src1, addr:$src2)>;
2534 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2535 (XORPSrm VR128:$src1, addr:$src2)>;
2536 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2537 (ANDNPSrm VR128:$src1, addr:$src2)>;
2539 //===----------------------------------------------------------------------===//
2540 // SSE 1 & 2 - Arithmetic Instructions
2541 //===----------------------------------------------------------------------===//
2543 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2546 /// In addition, we also have a special variant of the scalar form here to
2547 /// represent the associated intrinsic operation. This form is unlike the
2548 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2549 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2551 /// These three forms can each be reg+reg or reg+mem.
2554 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2556 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2557 SDNode OpNode, X86SchedWriteSizes sched> {
2558 let Predicates = [HasAVX, NoVLX] in {
2559 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2560 VR128, v4f32, f128mem, loadv4f32,
2561 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2562 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2563 VR128, v2f64, f128mem, loadv2f64,
2564 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2566 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2567 OpNode, VR256, v8f32, f256mem, loadv8f32,
2568 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2569 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2570 OpNode, VR256, v4f64, f256mem, loadv4f64,
2571 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2574 let Constraints = "$src1 = $dst" in {
2575 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2576 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2578 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2579 v2f64, f128mem, memopv2f64, SSEPackedDouble,
2584 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2585 X86SchedWriteSizes sched> {
2586 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2587 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2588 XS, VEX_4V, VEX_LIG, VEX_WIG;
2589 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2590 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2591 XD, VEX_4V, VEX_LIG, VEX_WIG;
2593 let Constraints = "$src1 = $dst" in {
2594 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2595 OpNode, FR32, f32mem, SSEPackedSingle,
2597 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2598 OpNode, FR64, f64mem, SSEPackedDouble,
2603 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2604 SDPatternOperator OpNode,
2605 X86SchedWriteSizes sched> {
2606 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2607 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2608 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2609 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2610 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2611 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2613 let Constraints = "$src1 = $dst" in {
2614 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2615 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2616 SSEPackedSingle, sched.PS.Scl>, XS;
2617 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2618 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2619 SSEPackedDouble, sched.PD.Scl>, XD;
2623 // Binary Arithmetic instructions
2624 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
2625 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
2626 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2627 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
2628 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
2629 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2630 let isCommutable = 0 in {
2631 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2632 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2633 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2634 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2635 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2636 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2637 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2638 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2639 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2640 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2641 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2642 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2645 let isCodeGenOnly = 1 in {
2646 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2647 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2648 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2649 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2652 // Patterns used to select SSE scalar fp arithmetic instructions from
2655 // (1) a scalar fp operation followed by a blend
2657 // The effect is that the backend no longer emits unnecessary vector
2658 // insert instructions immediately after SSE scalar fp instructions
2659 // like addss or mulss.
2661 // For example, given the following code:
2662 // __m128 foo(__m128 A, __m128 B) {
2667 // Previously we generated:
2668 // addss %xmm0, %xmm1
2669 // movss %xmm1, %xmm0
2672 // addss %xmm1, %xmm0
2674 // (2) a vector packed single/double fp operation followed by a vector insert
2676 // The effect is that the backend converts the packed fp instruction
2677 // followed by a vector insert into a single SSE scalar fp instruction.
2679 // For example, given the following code:
2680 // __m128 foo(__m128 A, __m128 B) {
2681 // __m128 C = A + B;
2682 // return (__m128) {c[0], a[1], a[2], a[3]};
2685 // Previously we generated:
2686 // addps %xmm0, %xmm1
2687 // movss %xmm1, %xmm0
2690 // addss %xmm1, %xmm0
2692 // TODO: Some canonicalization in lowering would simplify the number of
2693 // patterns we have to try to match.
2694 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2695 ValueType VT, ValueType EltTy,
2696 RegisterClass RC, PatFrag ld_frag,
2697 Predicate BasePredicate> {
2698 let Predicates = [BasePredicate] in {
2699 // extracted scalar math op with insert via movss/movsd
2700 def : Pat<(VT (Move (VT VR128:$dst),
2701 (VT (scalar_to_vector
2702 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2704 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2705 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2706 def : Pat<(VT (Move (VT VR128:$dst),
2707 (VT (scalar_to_vector
2708 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2709 (ld_frag addr:$src)))))),
2710 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2713 // Repeat for AVX versions of the instructions.
2714 let Predicates = [UseAVX] in {
2715 // extracted scalar math op with insert via movss/movsd
2716 def : Pat<(VT (Move (VT VR128:$dst),
2717 (VT (scalar_to_vector
2718 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2720 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2721 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2722 def : Pat<(VT (Move (VT VR128:$dst),
2723 (VT (scalar_to_vector
2724 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2725 (ld_frag addr:$src)))))),
2726 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2730 defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2731 defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2732 defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2733 defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2735 defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2736 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2737 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2738 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2741 /// In addition, we also have a special variant of the scalar form here to
2742 /// represent the associated intrinsic operation. This form is unlike the
2743 /// plain scalar form, in that it takes an entire vector (instead of a
2744 /// scalar) and leaves the top elements undefined.
2746 /// And, we have a special variant form for a full-vector intrinsic form.
2748 /// sse_fp_unop_s - SSE1 unops in scalar form
2749 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2750 /// the HW instructions are 2 operand / destructive.
2751 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2752 ValueType ScalarVT, X86MemOperand x86memop,
2753 Operand intmemop, SDNode OpNode, Domain d,
2754 X86FoldableSchedWrite sched, Predicate target> {
2755 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2756 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2757 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2758 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2761 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2762 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2763 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2764 Sched<[sched.Folded]>,
2765 Requires<[target, OptForSize]>;
2768 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2769 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2770 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2773 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2774 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2775 Sched<[sched.Folded, sched.ReadAfterFold]>;
2780 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2781 ComplexPattern int_cpat, Intrinsic Intr,
2782 Predicate target, string Suffix> {
2783 let Predicates = [target] in {
2784 // These are unary operations, but they are modeled as having 2 source operands
2785 // because the high elements of the destination are unchanged in SSE.
2786 def : Pat<(Intr VR128:$src),
2787 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2789 // We don't want to fold scalar loads into these instructions unless
2790 // optimizing for size. This is because the folded instruction will have a
2791 // partial register update, while the unfolded sequence will not, e.g.
2793 // rcpss %xmm0, %xmm0
2794 // which has a clobber before the rcp, vs.
2796 let Predicates = [target, OptForSize] in {
2797 def : Pat<(Intr int_cpat:$src2),
2798 (!cast<Instruction>(NAME#m_Int)
2799 (vt (IMPLICIT_DEF)), addr:$src2)>;
2803 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
2804 Intrinsic Intr, Predicate target> {
2805 let Predicates = [target] in {
2806 def : Pat<(Intr VR128:$src),
2807 (!cast<Instruction>(NAME#r_Int) VR128:$src,
2810 let Predicates = [target, OptForSize] in {
2811 def : Pat<(Intr int_cpat:$src2),
2812 (!cast<Instruction>(NAME#m_Int)
2813 (vt (IMPLICIT_DEF)), addr:$src2)>;
2817 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2818 ValueType ScalarVT, X86MemOperand x86memop,
2819 Operand intmemop, SDNode OpNode, Domain d,
2820 X86FoldableSchedWrite sched, Predicate target> {
2821 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2822 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2823 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2824 [], d>, Sched<[sched]>;
2826 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2827 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2828 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2830 let hasSideEffects = 0, ExeDomain = d in {
2831 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2832 (ins VR128:$src1, VR128:$src2),
2833 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2834 []>, Sched<[sched]>;
2836 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2837 (ins VR128:$src1, intmemop:$src2),
2838 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2839 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2842 // We don't want to fold scalar loads into these instructions unless
2843 // optimizing for size. This is because the folded instruction will have a
2844 // partial register update, while the unfolded sequence will not, e.g.
2845 // vmovss mem, %xmm0
2846 // vrcpss %xmm0, %xmm0, %xmm0
2847 // which has a clobber before the rcp, vs.
2848 // vrcpss mem, %xmm0, %xmm0
2849 // TODO: In theory, we could fold the load, and avoid the stall caused by
2850 // the partial register store, either in BreakFalseDeps or with smarter RA.
2851 let Predicates = [target] in {
2852 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
2853 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2855 let Predicates = [target, OptForSize] in {
2856 def : Pat<(ScalarVT (OpNode (load addr:$src))),
2857 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2862 /// sse1_fp_unop_p - SSE1 unops in packed form.
2863 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
2864 X86SchedWriteWidths sched, list<Predicate> prds> {
2865 let Predicates = prds in {
2866 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2867 !strconcat("v", OpcodeStr,
2868 "ps\t{$src, $dst|$dst, $src}"),
2869 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2870 VEX, Sched<[sched.XMM]>, VEX_WIG;
2871 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2872 !strconcat("v", OpcodeStr,
2873 "ps\t{$src, $dst|$dst, $src}"),
2874 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2875 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2876 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2877 !strconcat("v", OpcodeStr,
2878 "ps\t{$src, $dst|$dst, $src}"),
2879 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2880 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2881 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2882 !strconcat("v", OpcodeStr,
2883 "ps\t{$src, $dst|$dst, $src}"),
2884 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2885 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2888 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2889 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2890 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2892 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2893 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2894 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2895 Sched<[sched.XMM.Folded]>;
2898 /// sse2_fp_unop_p - SSE2 unops in vector forms.
2899 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2900 SDNode OpNode, X86SchedWriteWidths sched> {
2901 let Predicates = [HasAVX, NoVLX] in {
2902 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2903 !strconcat("v", OpcodeStr,
2904 "pd\t{$src, $dst|$dst, $src}"),
2905 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2906 VEX, Sched<[sched.XMM]>, VEX_WIG;
2907 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2908 !strconcat("v", OpcodeStr,
2909 "pd\t{$src, $dst|$dst, $src}"),
2910 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2911 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2912 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2913 !strconcat("v", OpcodeStr,
2914 "pd\t{$src, $dst|$dst, $src}"),
2915 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2916 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2917 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2918 !strconcat("v", OpcodeStr,
2919 "pd\t{$src, $dst|$dst, $src}"),
2920 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2921 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2924 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2925 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2926 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2928 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2929 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2930 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2931 Sched<[sched.XMM.Folded]>;
2934 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
2935 X86SchedWriteWidths sched, Predicate AVXTarget> {
2936 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2937 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2939 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2940 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2942 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
2945 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2946 X86SchedWriteWidths sched, Predicate AVXTarget> {
2947 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
2948 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
2949 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
2950 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
2951 XS, VEX_4V, VEX_LIG, VEX_WIG;
2954 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2955 X86SchedWriteWidths sched, Predicate AVXTarget> {
2956 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
2957 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
2958 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
2959 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2960 XD, VEX_4V, VEX_LIG, VEX_WIG;
2964 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
2965 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
2966 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
2967 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
2969 // Reciprocal approximations. Note that these typically require refinement
2970 // in order to obtain suitable precision.
2971 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
2972 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
2973 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
2974 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
2975 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
2976 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
2978 // There is no f64 version of the reciprocal approximation instructions.
2980 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
2981 ValueType VT, Predicate BasePredicate> {
2982 let Predicates = [BasePredicate] in {
2983 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
2984 (OpNode (extractelt VT:$src, 0))))),
2985 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
2988 // Repeat for AVX versions of the instructions.
2989 let Predicates = [UseAVX] in {
2990 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
2991 (OpNode (extractelt VT:$src, 0))))),
2992 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
2996 defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
2997 defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
2999 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3000 SDNode Move, ValueType VT,
3001 Predicate BasePredicate> {
3002 let Predicates = [BasePredicate] in {
3003 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3004 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3007 // Repeat for AVX versions of the instructions.
3008 let Predicates = [HasAVX] in {
3009 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3010 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3014 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3016 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3020 //===----------------------------------------------------------------------===//
3021 // SSE 1 & 2 - Non-temporal stores
3022 //===----------------------------------------------------------------------===//
3024 let AddedComplexity = 400 in { // Prefer non-temporal versions
3025 let Predicates = [HasAVX, NoVLX] in {
3026 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3027 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3028 (ins f128mem:$dst, VR128:$src),
3029 "movntps\t{$src, $dst|$dst, $src}",
3030 [(alignednontemporalstore (v4f32 VR128:$src),
3031 addr:$dst)]>, VEX, VEX_WIG;
3032 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3033 (ins f128mem:$dst, VR128:$src),
3034 "movntpd\t{$src, $dst|$dst, $src}",
3035 [(alignednontemporalstore (v2f64 VR128:$src),
3036 addr:$dst)]>, VEX, VEX_WIG;
3039 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3040 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3041 (ins f256mem:$dst, VR256:$src),
3042 "movntps\t{$src, $dst|$dst, $src}",
3043 [(alignednontemporalstore (v8f32 VR256:$src),
3044 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3045 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3046 (ins f256mem:$dst, VR256:$src),
3047 "movntpd\t{$src, $dst|$dst, $src}",
3048 [(alignednontemporalstore (v4f64 VR256:$src),
3049 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3052 let ExeDomain = SSEPackedInt in {
3053 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3054 (ins i128mem:$dst, VR128:$src),
3055 "movntdq\t{$src, $dst|$dst, $src}",
3056 [(alignednontemporalstore (v2i64 VR128:$src),
3057 addr:$dst)]>, VEX, VEX_WIG,
3058 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3059 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3060 (ins i256mem:$dst, VR256:$src),
3061 "movntdq\t{$src, $dst|$dst, $src}",
3062 [(alignednontemporalstore (v4i64 VR256:$src),
3063 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3064 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3068 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3069 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3070 "movntps\t{$src, $dst|$dst, $src}",
3071 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3072 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3073 "movntpd\t{$src, $dst|$dst, $src}",
3074 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3077 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3078 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3079 "movntdq\t{$src, $dst|$dst, $src}",
3080 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3082 let SchedRW = [WriteStoreNT] in {
3083 // There is no AVX form for instructions below this point
3084 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3085 "movnti{l}\t{$src, $dst|$dst, $src}",
3086 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3087 PS, Requires<[HasSSE2]>;
3088 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3089 "movnti{q}\t{$src, $dst|$dst, $src}",
3090 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3091 PS, Requires<[HasSSE2]>;
3092 } // SchedRW = [WriteStoreNT]
3094 let Predicates = [HasAVX, NoVLX] in {
3095 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3096 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3097 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3098 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3099 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3100 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3102 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3103 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3104 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3105 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3106 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3107 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3110 let Predicates = [UseSSE2] in {
3111 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3112 (MOVNTDQmr addr:$dst, VR128:$src)>;
3113 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3114 (MOVNTDQmr addr:$dst, VR128:$src)>;
3115 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3116 (MOVNTDQmr addr:$dst, VR128:$src)>;
3119 } // AddedComplexity
3121 //===----------------------------------------------------------------------===//
3122 // SSE 1 & 2 - Prefetch and memory fence
3123 //===----------------------------------------------------------------------===//
3125 // Prefetch intrinsic.
3126 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3127 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3128 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3129 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3130 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3131 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3132 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3133 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3134 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3137 // FIXME: How should flush instruction be modeled?
3138 let SchedRW = [WriteLoad] in {
3140 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3141 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3142 PS, Requires<[HasSSE2]>;
3145 let SchedRW = [WriteNop] in {
3146 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3147 // was introduced with SSE2, it's backward compatible.
3148 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3149 "pause", [(int_x86_sse2_pause)]>, OBXS;
3152 let SchedRW = [WriteFence] in {
3153 // Load, store, and memory fence
3154 // TODO: As with mfence, we may want to ease the availablity of sfence/lfence
3155 // to include any 64-bit target.
3156 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3157 PS, Requires<[HasSSE1]>;
3158 def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3159 PS, Requires<[HasSSE2]>;
3160 def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3161 PS, Requires<[HasMFence]>;
3164 def : Pat<(X86MFence), (MFENCE)>;
3166 //===----------------------------------------------------------------------===//
3167 // SSE 1 & 2 - Load/Store XCSR register
3168 //===----------------------------------------------------------------------===//
3170 let mayLoad=1, hasSideEffects=1 in
3171 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3172 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3173 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3174 let mayStore=1, hasSideEffects=1 in
3175 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3176 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3177 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3179 let mayLoad=1, hasSideEffects=1 in
3180 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3181 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3182 TB, Sched<[WriteLDMXCSR]>;
3183 let mayStore=1, hasSideEffects=1 in
3184 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3185 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3186 TB, Sched<[WriteSTMXCSR]>;
3188 //===---------------------------------------------------------------------===//
3189 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3190 //===---------------------------------------------------------------------===//
3192 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3194 let hasSideEffects = 0 in {
3195 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3196 "movdqa\t{$src, $dst|$dst, $src}", []>,
3197 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3198 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3199 "movdqu\t{$src, $dst|$dst, $src}", []>,
3200 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3201 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3202 "movdqa\t{$src, $dst|$dst, $src}", []>,
3203 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3204 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3205 "movdqu\t{$src, $dst|$dst, $src}", []>,
3206 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3210 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3211 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3212 "movdqa\t{$src, $dst|$dst, $src}", []>,
3213 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3214 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3215 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3216 "movdqa\t{$src, $dst|$dst, $src}", []>,
3217 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3218 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3219 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3220 "movdqu\t{$src, $dst|$dst, $src}", []>,
3221 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3222 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3223 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3224 "movdqu\t{$src, $dst|$dst, $src}", []>,
3225 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3226 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3229 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3230 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3231 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3232 "movdqa\t{$src, $dst|$dst, $src}",
3233 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3234 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3235 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3236 "movdqa\t{$src, $dst|$dst, $src}", []>,
3237 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3238 VEX, VEX_L, VEX_WIG;
3239 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3240 "vmovdqu\t{$src, $dst|$dst, $src}",
3241 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3242 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3244 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3245 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3246 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3247 XS, VEX, VEX_L, VEX_WIG;
3250 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3251 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3252 (ins i128mem:$dst, VR128:$src),
3253 "movdqa\t{$src, $dst|$dst, $src}",
3254 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3255 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3256 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3257 (ins i256mem:$dst, VR256:$src),
3258 "movdqa\t{$src, $dst|$dst, $src}", []>,
3259 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3260 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3261 "vmovdqu\t{$src, $dst|$dst, $src}",
3262 [(store (v2i64 VR128:$src), addr:$dst)]>,
3263 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3264 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3265 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3266 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3269 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3270 let hasSideEffects = 0 in {
3271 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3272 "movdqa\t{$src, $dst|$dst, $src}", []>;
3274 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3275 "movdqu\t{$src, $dst|$dst, $src}", []>,
3276 XS, Requires<[UseSSE2]>;
3280 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3281 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3282 "movdqa\t{$src, $dst|$dst, $src}", []>,
3283 FoldGenData<"MOVDQArr">;
3285 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3286 "movdqu\t{$src, $dst|$dst, $src}", []>,
3287 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3291 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3292 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3293 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3294 "movdqa\t{$src, $dst|$dst, $src}",
3295 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3296 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3297 "movdqu\t{$src, $dst|$dst, $src}",
3298 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3299 XS, Requires<[UseSSE2]>;
3302 let mayStore = 1, hasSideEffects = 0,
3303 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3304 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3305 "movdqa\t{$src, $dst|$dst, $src}",
3306 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3307 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3308 "movdqu\t{$src, $dst|$dst, $src}",
3309 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3310 XS, Requires<[UseSSE2]>;
3313 } // ExeDomain = SSEPackedInt
3315 // Reversed version with ".s" suffix for GAS compatibility.
3316 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3317 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3318 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3319 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3320 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3321 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3322 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3323 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3325 // Reversed version with ".s" suffix for GAS compatibility.
3326 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3327 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3328 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3329 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3331 let Predicates = [HasAVX, NoVLX] in {
3332 // Additional patterns for other integer sizes.
3333 def : Pat<(alignedloadv4i32 addr:$src),
3334 (VMOVDQArm addr:$src)>;
3335 def : Pat<(alignedloadv8i16 addr:$src),
3336 (VMOVDQArm addr:$src)>;
3337 def : Pat<(alignedloadv16i8 addr:$src),
3338 (VMOVDQArm addr:$src)>;
3339 def : Pat<(loadv4i32 addr:$src),
3340 (VMOVDQUrm addr:$src)>;
3341 def : Pat<(loadv8i16 addr:$src),
3342 (VMOVDQUrm addr:$src)>;
3343 def : Pat<(loadv16i8 addr:$src),
3344 (VMOVDQUrm addr:$src)>;
3346 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3347 (VMOVDQAmr addr:$dst, VR128:$src)>;
3348 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3349 (VMOVDQAmr addr:$dst, VR128:$src)>;
3350 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3351 (VMOVDQAmr addr:$dst, VR128:$src)>;
3352 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3353 (VMOVDQUmr addr:$dst, VR128:$src)>;
3354 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3355 (VMOVDQUmr addr:$dst, VR128:$src)>;
3356 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3357 (VMOVDQUmr addr:$dst, VR128:$src)>;
3360 //===---------------------------------------------------------------------===//
3361 // SSE2 - Packed Integer Arithmetic Instructions
3362 //===---------------------------------------------------------------------===//
3364 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3366 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
3367 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3368 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3369 PatFrag memop_frag, X86MemOperand x86memop,
3370 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3371 let isCommutable = 1 in
3372 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3373 (ins RC:$src1, RC:$src2),
3375 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3376 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3377 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3379 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3380 (ins RC:$src1, x86memop:$src2),
3382 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3383 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3384 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3385 (memop_frag addr:$src2))))]>,
3386 Sched<[sched.Folded, sched.ReadAfterFold]>;
3388 } // ExeDomain = SSEPackedInt
3390 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3391 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3392 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3393 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3394 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3395 SchedWriteVecALU, 1, NoVLX>;
3396 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3397 SchedWriteVecALU, 1, NoVLX>;
3398 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3399 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3400 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3401 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3402 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3403 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3404 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3405 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3406 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3407 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3408 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3409 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3410 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3411 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3412 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3413 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3414 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3415 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3416 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3417 SchedWriteVecALU, 0, NoVLX>;
3418 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3419 SchedWriteVecALU, 0, NoVLX>;
3420 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3421 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3422 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3423 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3424 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3425 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3426 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3427 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3428 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3429 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3430 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3431 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3432 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3433 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3434 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3435 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3436 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3437 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3438 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3439 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3440 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3441 SchedWriteVecIMul, 1, NoVLX>;
3443 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3444 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3445 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3448 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3449 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3450 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3451 0>, VEX_4V, VEX_L, VEX_WIG;
3452 let Constraints = "$src1 = $dst" in
3453 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3454 memop, i128mem, SchedWriteVecIMul.XMM>;
3456 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3457 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3458 load, i128mem, SchedWritePSADBW.XMM, 0>,
3460 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3461 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3462 load, i256mem, SchedWritePSADBW.YMM, 0>,
3463 VEX_4V, VEX_L, VEX_WIG;
3464 let Constraints = "$src1 = $dst" in
3465 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3466 memop, i128mem, SchedWritePSADBW.XMM>;
3468 //===---------------------------------------------------------------------===//
3469 // SSE2 - Packed Integer Logical Instructions
3470 //===---------------------------------------------------------------------===//
3472 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3473 string OpcodeStr, SDNode OpNode,
3474 SDNode OpNode2, RegisterClass RC,
3475 X86FoldableSchedWrite sched,
3476 X86FoldableSchedWrite schedImm,
3477 ValueType DstVT, ValueType SrcVT,
3478 PatFrag ld_frag, bit Is2Addr = 1> {
3479 // src2 is always 128-bit
3480 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3481 (ins RC:$src1, VR128:$src2),
3483 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3484 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3485 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3487 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3488 (ins RC:$src1, i128mem:$src2),
3490 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3491 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3492 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3493 (SrcVT (ld_frag addr:$src2)))))]>,
3494 Sched<[sched.Folded, sched.ReadAfterFold]>;
3495 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3496 (ins RC:$src1, u8imm:$src2),
3498 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3499 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3500 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3504 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3505 string OpcodeStr, SDNode OpNode,
3506 SDNode OpNode2, ValueType DstVT128,
3507 ValueType DstVT256, ValueType SrcVT,
3508 X86SchedWriteWidths sched,
3509 X86SchedWriteWidths schedImm, Predicate prd> {
3510 let Predicates = [HasAVX, prd] in
3511 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3512 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3513 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3514 let Predicates = [HasAVX2, prd] in
3515 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3516 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3517 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3519 let Constraints = "$src1 = $dst" in
3520 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3521 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3525 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3526 SDNode OpNode, RegisterClass RC, ValueType VT,
3527 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3528 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3530 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3531 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3532 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3536 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3537 SDNode OpNode, X86SchedWriteWidths sched> {
3538 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3539 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3540 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3541 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3542 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3543 VR256, v32i8, sched.YMM, 0>,
3544 VEX_4V, VEX_L, VEX_WIG;
3545 let Constraints = "$src1 = $dst" in
3546 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3550 let ExeDomain = SSEPackedInt in {
3551 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3552 v8i16, v16i16, v8i16, SchedWriteVecShift,
3553 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3554 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3555 v4i32, v8i32, v4i32, SchedWriteVecShift,
3556 SchedWriteVecShiftImm, NoVLX>;
3557 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3558 v2i64, v4i64, v2i64, SchedWriteVecShift,
3559 SchedWriteVecShiftImm, NoVLX>;
3561 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3562 v8i16, v16i16, v8i16, SchedWriteVecShift,
3563 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3564 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3565 v4i32, v8i32, v4i32, SchedWriteVecShift,
3566 SchedWriteVecShiftImm, NoVLX>;
3567 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3568 v2i64, v4i64, v2i64, SchedWriteVecShift,
3569 SchedWriteVecShiftImm, NoVLX>;
3571 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3572 v8i16, v16i16, v8i16, SchedWriteVecShift,
3573 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3574 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3575 v4i32, v8i32, v4i32, SchedWriteVecShift,
3576 SchedWriteVecShiftImm, NoVLX>;
3578 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3580 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3582 } // ExeDomain = SSEPackedInt
3584 //===---------------------------------------------------------------------===//
3585 // SSE2 - Packed Integer Comparison Instructions
3586 //===---------------------------------------------------------------------===//
3588 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3589 SchedWriteVecALU, 1, TruePredicate>;
3590 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3591 SchedWriteVecALU, 1, TruePredicate>;
3592 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3593 SchedWriteVecALU, 1, TruePredicate>;
3594 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3595 SchedWriteVecALU, 0, TruePredicate>;
3596 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3597 SchedWriteVecALU, 0, TruePredicate>;
3598 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3599 SchedWriteVecALU, 0, TruePredicate>;
3601 //===---------------------------------------------------------------------===//
3602 // SSE2 - Packed Integer Shuffle Instructions
3603 //===---------------------------------------------------------------------===//
3605 let ExeDomain = SSEPackedInt in {
3606 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3607 SDNode OpNode, X86SchedWriteWidths sched,
3609 let Predicates = [HasAVX, prd] in {
3610 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3611 (ins VR128:$src1, u8imm:$src2),
3612 !strconcat("v", OpcodeStr,
3613 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3615 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3616 VEX, Sched<[sched.XMM]>, VEX_WIG;
3617 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3618 (ins i128mem:$src1, u8imm:$src2),
3619 !strconcat("v", OpcodeStr,
3620 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3622 (vt128 (OpNode (load addr:$src1),
3623 (i8 timm:$src2))))]>, VEX,
3624 Sched<[sched.XMM.Folded]>, VEX_WIG;
3627 let Predicates = [HasAVX2, prd] in {
3628 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3629 (ins VR256:$src1, u8imm:$src2),
3630 !strconcat("v", OpcodeStr,
3631 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3633 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3634 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3635 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3636 (ins i256mem:$src1, u8imm:$src2),
3637 !strconcat("v", OpcodeStr,
3638 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3640 (vt256 (OpNode (load addr:$src1),
3641 (i8 timm:$src2))))]>, VEX, VEX_L,
3642 Sched<[sched.YMM.Folded]>, VEX_WIG;
3645 let Predicates = [UseSSE2] in {
3646 def ri : Ii8<0x70, MRMSrcReg,
3647 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3648 !strconcat(OpcodeStr,
3649 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3651 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3653 def mi : Ii8<0x70, MRMSrcMem,
3654 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3655 !strconcat(OpcodeStr,
3656 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3658 (vt128 (OpNode (memop addr:$src1),
3659 (i8 timm:$src2))))]>,
3660 Sched<[sched.XMM.Folded]>;
3663 } // ExeDomain = SSEPackedInt
3665 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3666 SchedWriteShuffle, NoVLX>, PD;
3667 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3668 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3669 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3670 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3672 //===---------------------------------------------------------------------===//
3673 // Packed Integer Pack Instructions (SSE & AVX)
3674 //===---------------------------------------------------------------------===//
3676 let ExeDomain = SSEPackedInt in {
3677 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3678 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3679 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3680 PatFrag ld_frag, bit Is2Addr = 1> {
3681 def rr : PDI<opc, MRMSrcReg,
3682 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3684 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3685 !strconcat(OpcodeStr,
3686 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3688 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3690 def rm : PDI<opc, MRMSrcMem,
3691 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3693 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3694 !strconcat(OpcodeStr,
3695 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3697 (OutVT (OpNode (ArgVT RC:$src1),
3698 (ld_frag addr:$src2))))]>,
3699 Sched<[sched.Folded, sched.ReadAfterFold]>;
3702 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3703 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3704 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3705 PatFrag ld_frag, bit Is2Addr = 1> {
3706 def rr : SS48I<opc, MRMSrcReg,
3707 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3709 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3710 !strconcat(OpcodeStr,
3711 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3713 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3715 def rm : SS48I<opc, MRMSrcMem,
3716 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3718 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3719 !strconcat(OpcodeStr,
3720 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3722 (OutVT (OpNode (ArgVT RC:$src1),
3723 (ld_frag addr:$src2))))]>,
3724 Sched<[sched.Folded, sched.ReadAfterFold]>;
3727 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3728 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3729 i128mem, SchedWriteShuffle.XMM, load, 0>,
3731 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3732 i128mem, SchedWriteShuffle.XMM, load, 0>,
3735 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3736 i128mem, SchedWriteShuffle.XMM, load, 0>,
3738 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3739 i128mem, SchedWriteShuffle.XMM, load, 0>,
3743 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3744 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3745 i256mem, SchedWriteShuffle.YMM, load, 0>,
3746 VEX_4V, VEX_L, VEX_WIG;
3747 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3748 i256mem, SchedWriteShuffle.YMM, load, 0>,
3749 VEX_4V, VEX_L, VEX_WIG;
3751 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3752 i256mem, SchedWriteShuffle.YMM, load, 0>,
3753 VEX_4V, VEX_L, VEX_WIG;
3754 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3755 i256mem, SchedWriteShuffle.YMM, load, 0>,
3759 let Constraints = "$src1 = $dst" in {
3760 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3761 i128mem, SchedWriteShuffle.XMM, memop>;
3762 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3763 i128mem, SchedWriteShuffle.XMM, memop>;
3765 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3766 i128mem, SchedWriteShuffle.XMM, memop>;
3768 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3769 i128mem, SchedWriteShuffle.XMM, memop>;
3771 } // ExeDomain = SSEPackedInt
3773 //===---------------------------------------------------------------------===//
3774 // SSE2 - Packed Integer Unpack Instructions
3775 //===---------------------------------------------------------------------===//
3777 let ExeDomain = SSEPackedInt in {
3778 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3779 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3780 X86FoldableSchedWrite sched, PatFrag ld_frag,
3782 def rr : PDI<opc, MRMSrcReg,
3783 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3785 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3786 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3787 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3789 def rm : PDI<opc, MRMSrcMem,
3790 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3792 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3793 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3794 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3795 Sched<[sched.Folded, sched.ReadAfterFold]>;
3798 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3799 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3800 i128mem, SchedWriteShuffle.XMM, load, 0>,
3802 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3803 i128mem, SchedWriteShuffle.XMM, load, 0>,
3805 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3806 i128mem, SchedWriteShuffle.XMM, load, 0>,
3808 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3809 i128mem, SchedWriteShuffle.XMM, load, 0>,
3813 let Predicates = [HasAVX, NoVLX] in {
3814 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3815 i128mem, SchedWriteShuffle.XMM, load, 0>,
3817 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3818 i128mem, SchedWriteShuffle.XMM, load, 0>,
3820 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3821 i128mem, SchedWriteShuffle.XMM, load, 0>,
3823 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3824 i128mem, SchedWriteShuffle.XMM, load, 0>,
3828 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3829 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3830 i256mem, SchedWriteShuffle.YMM, load, 0>,
3831 VEX_4V, VEX_L, VEX_WIG;
3832 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3833 i256mem, SchedWriteShuffle.YMM, load, 0>,
3834 VEX_4V, VEX_L, VEX_WIG;
3835 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3836 i256mem, SchedWriteShuffle.YMM, load, 0>,
3837 VEX_4V, VEX_L, VEX_WIG;
3838 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3839 i256mem, SchedWriteShuffle.YMM, load, 0>,
3840 VEX_4V, VEX_L, VEX_WIG;
3843 let Predicates = [HasAVX2, NoVLX] in {
3844 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3845 i256mem, SchedWriteShuffle.YMM, load, 0>,
3846 VEX_4V, VEX_L, VEX_WIG;
3847 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3848 i256mem, SchedWriteShuffle.YMM, load, 0>,
3849 VEX_4V, VEX_L, VEX_WIG;
3850 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3851 i256mem, SchedWriteShuffle.YMM, load, 0>,
3852 VEX_4V, VEX_L, VEX_WIG;
3853 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3854 i256mem, SchedWriteShuffle.YMM, load, 0>,
3855 VEX_4V, VEX_L, VEX_WIG;
3858 let Constraints = "$src1 = $dst" in {
3859 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3860 i128mem, SchedWriteShuffle.XMM, memop>;
3861 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3862 i128mem, SchedWriteShuffle.XMM, memop>;
3863 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3864 i128mem, SchedWriteShuffle.XMM, memop>;
3865 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3866 i128mem, SchedWriteShuffle.XMM, memop>;
3868 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3869 i128mem, SchedWriteShuffle.XMM, memop>;
3870 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3871 i128mem, SchedWriteShuffle.XMM, memop>;
3872 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3873 i128mem, SchedWriteShuffle.XMM, memop>;
3874 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3875 i128mem, SchedWriteShuffle.XMM, memop>;
3877 } // ExeDomain = SSEPackedInt
3879 //===---------------------------------------------------------------------===//
3880 // SSE2 - Packed Integer Extract and Insert
3881 //===---------------------------------------------------------------------===//
3883 let ExeDomain = SSEPackedInt in {
3884 multiclass sse2_pinsrw<bit Is2Addr = 1> {
3885 def rr : Ii8<0xC4, MRMSrcReg,
3886 (outs VR128:$dst), (ins VR128:$src1,
3887 GR32orGR64:$src2, u8imm:$src3),
3889 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3890 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3892 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
3893 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3894 def rm : Ii8<0xC4, MRMSrcMem,
3895 (outs VR128:$dst), (ins VR128:$src1,
3896 i16mem:$src2, u8imm:$src3),
3898 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3899 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3901 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3903 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
3907 let Predicates = [HasAVX, NoBWI] in
3908 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3909 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3910 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3911 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3913 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
3914 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3915 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3916 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3917 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3919 Sched<[WriteVecExtract]>;
3922 let Predicates = [HasAVX, NoBWI] in
3923 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
3925 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3926 defm PINSRW : sse2_pinsrw, PD;
3928 } // ExeDomain = SSEPackedInt
3930 //===---------------------------------------------------------------------===//
3931 // SSE2 - Packed Mask Creation
3932 //===---------------------------------------------------------------------===//
3934 let ExeDomain = SSEPackedInt in {
3936 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3938 "pmovmskb\t{$src, $dst|$dst, $src}",
3939 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3940 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
3942 let Predicates = [HasAVX2] in {
3943 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3945 "pmovmskb\t{$src, $dst|$dst, $src}",
3946 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3947 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
3950 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3951 "pmovmskb\t{$src, $dst|$dst, $src}",
3952 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3953 Sched<[WriteVecMOVMSK]>;
3955 } // ExeDomain = SSEPackedInt
3957 //===---------------------------------------------------------------------===//
3958 // SSE2 - Conditional Store
3959 //===---------------------------------------------------------------------===//
3961 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3962 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
3963 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
3964 (ins VR128:$src, VR128:$mask),
3965 "maskmovdqu\t{$mask, $src|$src, $mask}",
3966 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
3968 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
3969 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
3970 (ins VR128:$src, VR128:$mask),
3971 "maskmovdqu\t{$mask, $src|$src, $mask}",
3972 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
3975 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
3976 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
3977 "maskmovdqu\t{$mask, $src|$src, $mask}",
3978 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
3979 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
3980 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
3981 "maskmovdqu\t{$mask, $src|$src, $mask}",
3982 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
3984 } // ExeDomain = SSEPackedInt
3986 //===---------------------------------------------------------------------===//
3987 // SSE2 - Move Doubleword/Quadword
3988 //===---------------------------------------------------------------------===//
3990 //===---------------------------------------------------------------------===//
3991 // Move Int Doubleword to Packed Double Int
3993 let ExeDomain = SSEPackedInt in {
3994 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
3995 "movd\t{$src, $dst|$dst, $src}",
3997 (v4i32 (scalar_to_vector GR32:$src)))]>,
3998 VEX, Sched<[WriteVecMoveFromGpr]>;
3999 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4000 "movd\t{$src, $dst|$dst, $src}",
4002 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4003 VEX, Sched<[WriteVecLoad]>;
4004 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4005 "movq\t{$src, $dst|$dst, $src}",
4007 (v2i64 (scalar_to_vector GR64:$src)))]>,
4008 VEX, Sched<[WriteVecMoveFromGpr]>;
4009 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4010 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4011 "movq\t{$src, $dst|$dst, $src}", []>,
4012 VEX, Sched<[WriteVecLoad]>;
4013 let isCodeGenOnly = 1 in
4014 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4015 "movq\t{$src, $dst|$dst, $src}",
4016 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4017 VEX, Sched<[WriteVecMoveFromGpr]>;
4019 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4020 "movd\t{$src, $dst|$dst, $src}",
4022 (v4i32 (scalar_to_vector GR32:$src)))]>,
4023 Sched<[WriteVecMoveFromGpr]>;
4024 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4025 "movd\t{$src, $dst|$dst, $src}",
4027 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4028 Sched<[WriteVecLoad]>;
4029 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4030 "movq\t{$src, $dst|$dst, $src}",
4032 (v2i64 (scalar_to_vector GR64:$src)))]>,
4033 Sched<[WriteVecMoveFromGpr]>;
4034 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4035 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4036 "movq\t{$src, $dst|$dst, $src}", []>,
4037 Sched<[WriteVecLoad]>;
4038 let isCodeGenOnly = 1 in
4039 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4040 "movq\t{$src, $dst|$dst, $src}",
4041 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4042 Sched<[WriteVecMoveFromGpr]>;
4043 } // ExeDomain = SSEPackedInt
4045 //===---------------------------------------------------------------------===//
4046 // Move Int Doubleword to Single Scalar
4048 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4049 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4050 "movd\t{$src, $dst|$dst, $src}",
4051 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4052 VEX, Sched<[WriteVecMoveFromGpr]>;
4054 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4055 "movd\t{$src, $dst|$dst, $src}",
4056 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4057 Sched<[WriteVecMoveFromGpr]>;
4059 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4061 //===---------------------------------------------------------------------===//
4062 // Move Packed Doubleword Int to Packed Double Int
4064 let ExeDomain = SSEPackedInt in {
4065 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4066 "movd\t{$src, $dst|$dst, $src}",
4067 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4069 Sched<[WriteVecMoveToGpr]>;
4070 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4071 (ins i32mem:$dst, VR128:$src),
4072 "movd\t{$src, $dst|$dst, $src}",
4073 [(store (i32 (extractelt (v4i32 VR128:$src),
4074 (iPTR 0))), addr:$dst)]>,
4075 VEX, Sched<[WriteVecStore]>;
4076 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4077 "movd\t{$src, $dst|$dst, $src}",
4078 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4080 Sched<[WriteVecMoveToGpr]>;
4081 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4082 "movd\t{$src, $dst|$dst, $src}",
4083 [(store (i32 (extractelt (v4i32 VR128:$src),
4084 (iPTR 0))), addr:$dst)]>,
4085 Sched<[WriteVecStore]>;
4086 } // ExeDomain = SSEPackedInt
4088 //===---------------------------------------------------------------------===//
4089 // Move Packed Doubleword Int first element to Doubleword Int
4091 let ExeDomain = SSEPackedInt in {
4092 let SchedRW = [WriteVecMoveToGpr] in {
4093 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4094 "movq\t{$src, $dst|$dst, $src}",
4095 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4099 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4100 "movq\t{$src, $dst|$dst, $src}",
4101 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4105 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4106 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4107 (ins i64mem:$dst, VR128:$src),
4108 "movq\t{$src, $dst|$dst, $src}", []>,
4109 VEX, Sched<[WriteVecStore]>;
4110 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4111 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4112 "movq\t{$src, $dst|$dst, $src}", []>,
4113 Sched<[WriteVecStore]>;
4114 } // ExeDomain = SSEPackedInt
4116 //===---------------------------------------------------------------------===//
4117 // Bitcast FR64 <-> GR64
4119 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4120 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4121 "movq\t{$src, $dst|$dst, $src}",
4122 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4123 VEX, Sched<[WriteVecMoveToGpr]>;
4125 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4126 "movq\t{$src, $dst|$dst, $src}",
4127 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4128 Sched<[WriteVecMoveToGpr]>;
4129 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4131 //===---------------------------------------------------------------------===//
4132 // Move Scalar Single to Double Int
4134 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4135 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4136 "movd\t{$src, $dst|$dst, $src}",
4137 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4138 VEX, Sched<[WriteVecMoveToGpr]>;
4139 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4140 "movd\t{$src, $dst|$dst, $src}",
4141 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4142 Sched<[WriteVecMoveToGpr]>;
4143 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4145 let Predicates = [UseAVX] in {
4146 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4147 (VMOVDI2PDIrr GR32:$src)>;
4149 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4150 (VMOV64toPQIrr GR64:$src)>;
4152 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4153 // These instructions also write zeros in the high part of a 256-bit register.
4154 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4155 (VMOVDI2PDIrm addr:$src)>;
4156 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4157 (VMOVDI2PDIrm addr:$src)>;
4158 def : Pat<(v8i32 (X86vzload32 addr:$src)),
4159 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4162 let Predicates = [UseSSE2] in {
4163 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4164 (MOVDI2PDIrr GR32:$src)>;
4166 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4167 (MOV64toPQIrr GR64:$src)>;
4168 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4169 (MOVDI2PDIrm addr:$src)>;
4170 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4171 (MOVDI2PDIrm addr:$src)>;
4174 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4175 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
4177 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4178 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4179 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4180 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4181 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4182 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4183 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4184 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4185 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4187 //===---------------------------------------------------------------------===//
4188 // SSE2 - Move Quadword
4189 //===---------------------------------------------------------------------===//
4191 //===---------------------------------------------------------------------===//
4192 // Move Quadword Int to Packed Quadword Int
4195 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4196 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4197 "vmovq\t{$src, $dst|$dst, $src}",
4199 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4200 VEX, Requires<[UseAVX]>, VEX_WIG;
4201 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4202 "movq\t{$src, $dst|$dst, $src}",
4204 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4205 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4206 } // ExeDomain, SchedRW
4208 //===---------------------------------------------------------------------===//
4209 // Move Packed Quadword Int to Quadword Int
4211 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4212 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4213 "movq\t{$src, $dst|$dst, $src}",
4214 [(store (i64 (extractelt (v2i64 VR128:$src),
4215 (iPTR 0))), addr:$dst)]>,
4217 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4218 "movq\t{$src, $dst|$dst, $src}",
4219 [(store (i64 (extractelt (v2i64 VR128:$src),
4220 (iPTR 0))), addr:$dst)]>;
4221 } // ExeDomain, SchedRW
4223 // For disassembler only
4224 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4225 SchedRW = [SchedWriteVecLogic.XMM] in {
4226 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4227 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4228 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4229 "movq\t{$src, $dst|$dst, $src}", []>;
4232 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4233 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4234 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4235 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4237 let Predicates = [UseAVX] in {
4238 def : Pat<(v2i64 (X86vzload64 addr:$src)),
4239 (VMOVQI2PQIrm addr:$src)>;
4240 def : Pat<(v4i64 (X86vzload64 addr:$src)),
4241 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4243 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4244 (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4247 let Predicates = [UseSSE2] in {
4248 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4250 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4251 (MOVPQI2QImr addr:$dst, VR128:$src)>;
4254 //===---------------------------------------------------------------------===//
4255 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4256 // IA32 document. movq xmm1, xmm2 does clear the high bits.
4258 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4259 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4260 "vmovq\t{$src, $dst|$dst, $src}",
4261 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4262 XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4263 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4264 "movq\t{$src, $dst|$dst, $src}",
4265 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4266 XS, Requires<[UseSSE2]>;
4267 } // ExeDomain, SchedRW
4269 let Predicates = [UseAVX] in {
4270 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4271 (VMOVZPQILo2PQIrr VR128:$src)>;
4273 let Predicates = [UseSSE2] in {
4274 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4275 (MOVZPQILo2PQIrr VR128:$src)>;
4278 let Predicates = [UseAVX] in {
4279 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4280 (SUBREG_TO_REG (i32 0),
4281 (v2f64 (VMOVZPQILo2PQIrr
4282 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4284 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4285 (SUBREG_TO_REG (i32 0),
4286 (v2i64 (VMOVZPQILo2PQIrr
4287 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4291 //===---------------------------------------------------------------------===//
4292 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4293 //===---------------------------------------------------------------------===//
4295 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4296 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4297 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4298 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4299 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4300 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4302 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4303 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4304 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4305 Sched<[sched.Folded]>;
4308 let Predicates = [HasAVX, NoVLX] in {
4309 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4310 v4f32, VR128, loadv4f32, f128mem,
4311 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4312 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4313 v4f32, VR128, loadv4f32, f128mem,
4314 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4315 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4316 v8f32, VR256, loadv8f32, f256mem,
4317 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4318 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4319 v8f32, VR256, loadv8f32, f256mem,
4320 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4322 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4323 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4324 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4325 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4327 let Predicates = [HasAVX, NoVLX] in {
4328 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4329 (VMOVSHDUPrr VR128:$src)>;
4330 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4331 (VMOVSHDUPrm addr:$src)>;
4332 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4333 (VMOVSLDUPrr VR128:$src)>;
4334 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4335 (VMOVSLDUPrm addr:$src)>;
4336 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4337 (VMOVSHDUPYrr VR256:$src)>;
4338 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4339 (VMOVSHDUPYrm addr:$src)>;
4340 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4341 (VMOVSLDUPYrr VR256:$src)>;
4342 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4343 (VMOVSLDUPYrm addr:$src)>;
4346 let Predicates = [UseSSE3] in {
4347 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4348 (MOVSHDUPrr VR128:$src)>;
4349 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4350 (MOVSHDUPrm addr:$src)>;
4351 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4352 (MOVSLDUPrr VR128:$src)>;
4353 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4354 (MOVSLDUPrm addr:$src)>;
4357 //===---------------------------------------------------------------------===//
4358 // SSE3 - Replicate Double FP - MOVDDUP
4359 //===---------------------------------------------------------------------===//
4361 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4362 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4363 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4364 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4366 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4367 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4370 (scalar_to_vector (loadf64 addr:$src)))))]>,
4371 Sched<[sched.XMM.Folded]>;
4374 // FIXME: Merge with above classes when there are patterns for the ymm version
4375 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4376 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4377 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4378 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4380 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4381 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4383 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4384 Sched<[sched.YMM.Folded]>;
4387 let Predicates = [HasAVX, NoVLX] in {
4388 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4390 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4391 VEX, VEX_L, VEX_WIG;
4394 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4397 let Predicates = [HasAVX, NoVLX] in {
4398 def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
4399 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4400 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4401 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4404 let Predicates = [UseSSE3] in {
4405 // No need for aligned memory as this only loads 64-bits.
4406 def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
4407 (MOVDDUPrm addr:$src)>;
4408 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4409 (MOVDDUPrm addr:$src)>;
4412 //===---------------------------------------------------------------------===//
4413 // SSE3 - Move Unaligned Integer
4414 //===---------------------------------------------------------------------===//
4416 let Predicates = [HasAVX] in {
4417 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4418 "vlddqu\t{$src, $dst|$dst, $src}",
4419 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4420 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4421 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4422 "vlddqu\t{$src, $dst|$dst, $src}",
4423 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4424 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4427 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4428 "lddqu\t{$src, $dst|$dst, $src}",
4429 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4430 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4432 //===---------------------------------------------------------------------===//
4433 // SSE3 - Arithmetic
4434 //===---------------------------------------------------------------------===//
4436 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4437 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4438 PatFrag ld_frag, bit Is2Addr = 1> {
4439 def rr : I<0xD0, MRMSrcReg,
4440 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4442 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4443 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4444 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4446 def rm : I<0xD0, MRMSrcMem,
4447 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4449 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4450 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4451 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4452 Sched<[sched.Folded, sched.ReadAfterFold]>;
4455 let Predicates = [HasAVX] in {
4456 let ExeDomain = SSEPackedSingle in {
4457 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4458 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4459 XD, VEX_4V, VEX_WIG;
4460 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4461 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4462 XD, VEX_4V, VEX_L, VEX_WIG;
4464 let ExeDomain = SSEPackedDouble in {
4465 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4466 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4467 PD, VEX_4V, VEX_WIG;
4468 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4469 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4470 PD, VEX_4V, VEX_L, VEX_WIG;
4473 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4474 let ExeDomain = SSEPackedSingle in
4475 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4476 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4477 let ExeDomain = SSEPackedDouble in
4478 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4479 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4482 //===---------------------------------------------------------------------===//
4483 // SSE3 Instructions
4484 //===---------------------------------------------------------------------===//
4487 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4488 X86MemOperand x86memop, SDNode OpNode,
4489 X86FoldableSchedWrite sched, PatFrag ld_frag,
4491 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4493 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4494 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4495 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4498 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4500 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4501 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4502 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4503 Sched<[sched.Folded, sched.ReadAfterFold]>;
4505 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4506 X86MemOperand x86memop, SDNode OpNode,
4507 X86FoldableSchedWrite sched, PatFrag ld_frag,
4509 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4511 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4512 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4513 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4516 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4518 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4519 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4520 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4521 Sched<[sched.Folded, sched.ReadAfterFold]>;
4524 let Predicates = [HasAVX] in {
4525 let ExeDomain = SSEPackedSingle in {
4526 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4527 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4528 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4529 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4530 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4531 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4532 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4533 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4535 let ExeDomain = SSEPackedDouble in {
4536 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4537 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4538 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4539 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4540 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4541 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4542 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4543 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4547 let Constraints = "$src1 = $dst" in {
4548 let ExeDomain = SSEPackedSingle in {
4549 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4550 WriteFHAdd, memopv4f32>;
4551 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4552 WriteFHAdd, memopv4f32>;
4554 let ExeDomain = SSEPackedDouble in {
4555 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4556 WriteFHAdd, memopv2f64>;
4557 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4558 WriteFHAdd, memopv2f64>;
4562 //===---------------------------------------------------------------------===//
4563 // SSSE3 - Packed Absolute Instructions
4564 //===---------------------------------------------------------------------===//
4566 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4567 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4568 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4569 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4571 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4572 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4575 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4577 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4579 (vt (OpNode (ld_frag addr:$src))))]>,
4580 Sched<[sched.XMM.Folded]>;
4583 /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4584 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4585 SDNode OpNode, X86SchedWriteWidths sched> {
4586 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4588 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4589 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4592 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4594 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4596 (vt (OpNode (load addr:$src))))]>,
4597 Sched<[sched.YMM.Folded]>;
4600 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4601 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4602 load>, VEX, VEX_WIG;
4603 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4604 load>, VEX, VEX_WIG;
4606 let Predicates = [HasAVX, NoVLX] in {
4607 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4608 load>, VEX, VEX_WIG;
4610 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4611 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4612 VEX, VEX_L, VEX_WIG;
4613 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4614 VEX, VEX_L, VEX_WIG;
4616 let Predicates = [HasAVX2, NoVLX] in {
4617 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4618 VEX, VEX_L, VEX_WIG;
4621 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4623 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4625 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4628 //===---------------------------------------------------------------------===//
4629 // SSSE3 - Packed Binary Operator Instructions
4630 //===---------------------------------------------------------------------===//
4632 /// SS3I_binop_rm - Simple SSSE3 bin op
4633 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4634 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4635 PatFrag memop_frag, X86MemOperand x86memop,
4636 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4637 let isCommutable = 1 in
4638 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4639 (ins RC:$src1, RC:$src2),
4641 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4642 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4643 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4645 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4646 (ins RC:$src1, x86memop:$src2),
4648 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4649 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4651 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4652 Sched<[sched.Folded, sched.ReadAfterFold]>;
4655 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
4656 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4657 Intrinsic IntId128, X86FoldableSchedWrite sched,
4658 PatFrag ld_frag, bit Is2Addr = 1> {
4659 let isCommutable = 1 in
4660 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4661 (ins VR128:$src1, VR128:$src2),
4663 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4664 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4665 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4667 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4668 (ins VR128:$src1, i128mem:$src2),
4670 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4671 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4673 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4674 Sched<[sched.Folded, sched.ReadAfterFold]>;
4677 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4679 X86FoldableSchedWrite sched> {
4680 let isCommutable = 1 in
4681 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4682 (ins VR256:$src1, VR256:$src2),
4683 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4684 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4686 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4687 (ins VR256:$src1, i256mem:$src2),
4688 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4690 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4691 Sched<[sched.Folded, sched.ReadAfterFold]>;
4694 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4695 let isCommutable = 0 in {
4696 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4697 VR128, load, i128mem,
4698 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4699 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4700 v16i8, VR128, load, i128mem,
4701 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4703 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4704 VR128, load, i128mem,
4705 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4708 let ImmT = NoImm, Predicates = [HasAVX] in {
4709 let isCommutable = 0 in {
4710 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4712 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4713 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4715 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4716 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4718 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4719 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4721 SchedWritePHAdd.XMM, 0>, VEX_4V;
4722 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4723 int_x86_ssse3_psign_b_128,
4724 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4725 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4726 int_x86_ssse3_psign_w_128,
4727 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4728 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4729 int_x86_ssse3_psign_d_128,
4730 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4731 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4732 int_x86_ssse3_phadd_sw_128,
4733 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4734 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4735 int_x86_ssse3_phsub_sw_128,
4736 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4740 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4741 let isCommutable = 0 in {
4742 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4743 VR256, load, i256mem,
4744 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4745 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4746 v32i8, VR256, load, i256mem,
4747 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4749 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4750 VR256, load, i256mem,
4751 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4754 let ImmT = NoImm, Predicates = [HasAVX2] in {
4755 let isCommutable = 0 in {
4756 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4757 VR256, load, i256mem,
4758 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4759 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4761 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4762 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4763 VR256, load, i256mem,
4764 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4765 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4767 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
4768 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4769 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4770 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4771 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4772 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4773 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4774 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4775 int_x86_avx2_phadd_sw,
4776 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4777 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4778 int_x86_avx2_phsub_sw,
4779 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4783 // None of these have i8 immediate fields.
4784 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4785 let isCommutable = 0 in {
4786 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4787 memop, i128mem, SchedWritePHAdd.XMM>;
4788 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4789 memop, i128mem, SchedWritePHAdd.XMM>;
4790 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4791 memop, i128mem, SchedWritePHAdd.XMM>;
4792 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4793 memop, i128mem, SchedWritePHAdd.XMM>;
4794 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4795 SchedWriteVecALU.XMM, memop>;
4796 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4797 SchedWriteVecALU.XMM, memop>;
4798 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4799 SchedWriteVecALU.XMM, memop>;
4800 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4801 memop, i128mem, SchedWriteVarShuffle.XMM>;
4802 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4803 int_x86_ssse3_phadd_sw_128,
4804 SchedWritePHAdd.XMM, memop>;
4805 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4806 int_x86_ssse3_phsub_sw_128,
4807 SchedWritePHAdd.XMM, memop>;
4808 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4809 v16i8, VR128, memop, i128mem,
4810 SchedWriteVecIMul.XMM>;
4812 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4813 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4816 //===---------------------------------------------------------------------===//
4817 // SSSE3 - Packed Align Instruction Patterns
4818 //===---------------------------------------------------------------------===//
4820 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4821 PatFrag memop_frag, X86MemOperand x86memop,
4822 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4823 let hasSideEffects = 0 in {
4824 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4825 (ins RC:$src1, RC:$src2, u8imm:$src3),
4827 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4829 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4830 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4833 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4834 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4836 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4838 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4839 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4840 (memop_frag addr:$src2),
4841 (i8 timm:$src3))))]>,
4842 Sched<[sched.Folded, sched.ReadAfterFold]>;
4846 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4847 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4848 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4849 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4850 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4851 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4852 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4853 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4854 SchedWriteShuffle.XMM>;
4856 //===---------------------------------------------------------------------===//
4857 // SSSE3 - Thread synchronization
4858 //===---------------------------------------------------------------------===//
4860 let SchedRW = [WriteSystem] in {
4861 let Uses = [EAX, ECX, EDX] in
4862 def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4863 TB, Requires<[HasSSE3, Not64BitMode]>;
4864 let Uses = [RAX, ECX, EDX] in
4865 def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4866 TB, Requires<[HasSSE3, In64BitMode]>;
4868 let Uses = [ECX, EAX] in
4869 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
4870 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4873 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4874 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4876 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4877 Requires<[Not64BitMode]>;
4878 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4879 Requires<[In64BitMode]>;
4881 //===----------------------------------------------------------------------===//
4882 // SSE4.1 - Packed Move with Sign/Zero Extend
4883 // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4884 //===----------------------------------------------------------------------===//
4886 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4887 RegisterClass OutRC, RegisterClass InRC,
4888 X86FoldableSchedWrite sched> {
4889 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4890 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4893 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4894 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4895 Sched<[sched.Folded]>;
4898 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4899 X86MemOperand MemOp, X86MemOperand MemYOp,
4901 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4902 SchedWriteShuffle.XMM>;
4903 let Predicates = [HasAVX, prd] in
4904 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4905 VR128, VR128, SchedWriteShuffle.XMM>,
4907 let Predicates = [HasAVX2, prd] in
4908 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4909 VR256, VR128, WriteShuffle256>,
4910 VEX, VEX_L, VEX_WIG;
4913 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4914 X86MemOperand MemYOp, Predicate prd> {
4915 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4916 MemOp, MemYOp, prd>;
4917 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4918 !strconcat("pmovzx", OpcodeStr),
4919 MemOp, MemYOp, prd>;
4922 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4923 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4924 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4926 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4927 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4929 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
4932 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4933 SDNode ExtOp, SDNode InVecOp> {
4934 // Register-Register patterns
4935 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4936 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4937 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4939 let Predicates = [HasAVX2, NoVLX] in {
4940 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4941 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4942 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4943 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4945 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4946 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4947 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4948 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4950 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4951 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4954 // Simple Register-Memory patterns
4955 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4956 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4957 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4959 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
4960 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4963 let Predicates = [HasAVX2, NoVLX] in {
4964 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4965 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4966 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4967 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4969 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4970 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4971 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4972 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4974 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
4975 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4978 // AVX2 Register-Memory patterns
4979 let Predicates = [HasAVX2, NoVLX] in {
4980 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
4981 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4983 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4984 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4985 def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4986 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4988 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
4989 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4991 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
4992 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4993 def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4994 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4996 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4997 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4998 def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
4999 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5003 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5004 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5006 // SSE4.1/AVX patterns.
5007 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5009 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5010 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5011 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5013 let Predicates = [HasAVX, NoVLX] in {
5014 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5015 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5016 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5017 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5019 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5020 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5021 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5022 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5024 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5025 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5027 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5028 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5029 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5031 let Predicates = [HasAVX, NoVLX] in {
5032 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5033 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5034 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5035 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5037 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5038 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5039 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5040 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5042 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5043 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5045 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5046 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5047 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5048 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5049 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5050 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5051 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5052 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5053 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5055 let Predicates = [HasAVX, NoVLX] in {
5056 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5057 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5058 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5059 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5060 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5061 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5063 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5064 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5065 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5066 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5068 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5069 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5070 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5071 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5072 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5073 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5074 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5075 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5077 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5078 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5079 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5080 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5081 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5082 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5084 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5085 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5086 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5087 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5088 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5089 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5090 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5091 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5095 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5096 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5098 let Predicates = [UseSSE41] in {
5099 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5100 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5103 //===----------------------------------------------------------------------===//
5104 // SSE4.1 - Extract Instructions
5105 //===----------------------------------------------------------------------===//
5107 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5108 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5109 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5110 (ins VR128:$src1, u8imm:$src2),
5111 !strconcat(OpcodeStr,
5112 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5113 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5115 Sched<[WriteVecExtract]>;
5116 let hasSideEffects = 0, mayStore = 1 in
5117 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5118 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5119 !strconcat(OpcodeStr,
5120 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5121 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5122 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5125 let Predicates = [HasAVX, NoBWI] in
5126 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5128 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5131 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5132 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5133 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5134 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5135 (ins VR128:$src1, u8imm:$src2),
5136 !strconcat(OpcodeStr,
5137 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5138 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5140 let hasSideEffects = 0, mayStore = 1 in
5141 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5142 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5143 !strconcat(OpcodeStr,
5144 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5145 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5146 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5149 let Predicates = [HasAVX, NoBWI] in
5150 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5152 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5155 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5156 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5157 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5158 (ins VR128:$src1, u8imm:$src2),
5159 !strconcat(OpcodeStr,
5160 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5162 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5163 Sched<[WriteVecExtract]>;
5164 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5165 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5166 !strconcat(OpcodeStr,
5167 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5168 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5169 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5172 let Predicates = [HasAVX, NoDQI] in
5173 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5175 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5177 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5178 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5179 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5180 (ins VR128:$src1, u8imm:$src2),
5181 !strconcat(OpcodeStr,
5182 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5184 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5185 Sched<[WriteVecExtract]>;
5186 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5187 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5188 !strconcat(OpcodeStr,
5189 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5190 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5191 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5194 let Predicates = [HasAVX, NoDQI] in
5195 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5197 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5199 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5201 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5202 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5203 (ins VR128:$src1, u8imm:$src2),
5204 !strconcat(OpcodeStr,
5205 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5206 [(set GR32orGR64:$dst,
5207 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5208 Sched<[WriteVecExtract]>;
5209 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5210 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5211 !strconcat(OpcodeStr,
5212 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5213 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5214 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5217 let ExeDomain = SSEPackedSingle in {
5218 let Predicates = [UseAVX] in
5219 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5220 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5223 //===----------------------------------------------------------------------===//
5224 // SSE4.1 - Insert Instructions
5225 //===----------------------------------------------------------------------===//
5227 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5228 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5229 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5231 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5233 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5235 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5236 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5237 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5238 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5240 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5242 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5244 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5245 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5248 let Predicates = [HasAVX, NoBWI] in
5249 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5250 let Constraints = "$src1 = $dst" in
5251 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
5253 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5254 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5255 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5257 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5259 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5261 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5262 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5263 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5264 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5266 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5268 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5270 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5271 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5274 let Predicates = [HasAVX, NoDQI] in
5275 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5276 let Constraints = "$src1 = $dst" in
5277 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5279 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5280 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5281 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5283 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5285 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5287 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5288 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5289 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5290 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5292 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5294 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5296 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5297 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5300 let Predicates = [HasAVX, NoDQI] in
5301 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5302 let Constraints = "$src1 = $dst" in
5303 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5305 // insertps has a few different modes, there's the first two here below which
5306 // are optimized inserts that won't zero arbitrary elements in the destination
5307 // vector. The next one matches the intrinsic and could zero arbitrary elements
5308 // in the target vector.
5309 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5310 let isCommutable = 1 in
5311 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5312 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5314 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5316 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5318 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5319 Sched<[SchedWriteFShuffle.XMM]>;
5320 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5321 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5323 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5325 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5327 (X86insertps VR128:$src1,
5328 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5330 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5333 let ExeDomain = SSEPackedSingle in {
5334 let Predicates = [UseAVX] in
5335 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5337 let Constraints = "$src1 = $dst" in
5338 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5341 //===----------------------------------------------------------------------===//
5342 // SSE4.1 - Round Instructions
5343 //===----------------------------------------------------------------------===//
5345 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5346 X86MemOperand x86memop, RegisterClass RC,
5347 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5348 X86FoldableSchedWrite sched> {
5349 // Intrinsic operation, reg.
5350 // Vector intrinsic operation, reg
5351 def r : SS4AIi8<opc, MRMSrcReg,
5352 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5353 !strconcat(OpcodeStr,
5354 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5355 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5358 // Vector intrinsic operation, mem
5359 def m : SS4AIi8<opc, MRMSrcMem,
5360 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5361 !strconcat(OpcodeStr,
5362 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5364 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5365 Sched<[sched.Folded]>;
5368 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5369 string OpcodeStr, X86FoldableSchedWrite sched> {
5370 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5371 def SSr : SS4AIi8<opcss, MRMSrcReg,
5372 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5373 !strconcat(OpcodeStr,
5374 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5375 []>, Sched<[sched]>;
5378 def SSm : SS4AIi8<opcss, MRMSrcMem,
5379 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5380 !strconcat(OpcodeStr,
5381 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5382 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5383 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5385 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5386 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5387 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5388 !strconcat(OpcodeStr,
5389 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5390 []>, Sched<[sched]>;
5393 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5394 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5395 !strconcat(OpcodeStr,
5396 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5397 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5398 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5401 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5402 string OpcodeStr, X86FoldableSchedWrite sched> {
5403 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5404 def SSr : SS4AIi8<opcss, MRMSrcReg,
5405 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5406 !strconcat(OpcodeStr,
5407 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5408 []>, Sched<[sched]>;
5411 def SSm : SS4AIi8<opcss, MRMSrcMem,
5412 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5413 !strconcat(OpcodeStr,
5414 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5415 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5416 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5418 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5419 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5420 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5421 !strconcat(OpcodeStr,
5422 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5423 []>, Sched<[sched]>;
5426 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5427 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5428 !strconcat(OpcodeStr,
5429 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5430 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5431 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5434 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5435 string OpcodeStr, X86FoldableSchedWrite sched,
5436 ValueType VT32, ValueType VT64,
5437 SDNode OpNode, bit Is2Addr = 1> {
5438 let ExeDomain = SSEPackedSingle in {
5439 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5440 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5442 !strconcat(OpcodeStr,
5443 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5444 !strconcat(OpcodeStr,
5445 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5446 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5449 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5450 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5452 !strconcat(OpcodeStr,
5453 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5454 !strconcat(OpcodeStr,
5455 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5457 (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
5458 Sched<[sched.Folded, sched.ReadAfterFold]>;
5459 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5461 let ExeDomain = SSEPackedDouble in {
5462 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5463 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5465 !strconcat(OpcodeStr,
5466 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5467 !strconcat(OpcodeStr,
5468 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5469 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5472 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5473 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5475 !strconcat(OpcodeStr,
5476 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5477 !strconcat(OpcodeStr,
5478 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5480 (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
5481 Sched<[sched.Folded, sched.ReadAfterFold]>;
5482 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5485 // FP round - roundss, roundps, roundsd, roundpd
5486 let Predicates = [HasAVX, NoVLX] in {
5487 let ExeDomain = SSEPackedSingle in {
5489 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5490 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
5492 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5493 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
5494 VEX, VEX_L, VEX_WIG;
5497 let ExeDomain = SSEPackedDouble in {
5498 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5499 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
5501 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5502 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
5503 VEX, VEX_L, VEX_WIG;
5506 let Predicates = [UseAVX] in {
5507 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5508 v4f32, v2f64, X86RndScales, 0>,
5509 VEX_4V, VEX_LIG, VEX_WIG;
5510 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5511 VEX_4V, VEX_LIG, VEX_WIG;
5514 let Predicates = [UseAVX] in {
5515 def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
5516 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5517 def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
5518 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5521 let Predicates = [UseAVX, OptForSize] in {
5522 def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
5523 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5524 def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
5525 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5528 let ExeDomain = SSEPackedSingle in
5529 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5530 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
5531 let ExeDomain = SSEPackedDouble in
5532 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5533 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
5535 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5537 let Constraints = "$src1 = $dst" in
5538 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5539 v4f32, v2f64, X86RndScales>;
5541 let Predicates = [UseSSE41] in {
5542 def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
5543 (ROUNDSSr FR32:$src1, timm:$src2)>;
5544 def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
5545 (ROUNDSDr FR64:$src1, timm:$src2)>;
5548 let Predicates = [UseSSE41, OptForSize] in {
5549 def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
5550 (ROUNDSSm addr:$src1, timm:$src2)>;
5551 def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
5552 (ROUNDSDm addr:$src1, timm:$src2)>;
5555 //===----------------------------------------------------------------------===//
5556 // SSE4.1 - Packed Bit Test
5557 //===----------------------------------------------------------------------===//
5559 // ptest instruction we'll lower to this in X86ISelLowering primarily from
5560 // the intel intrinsic that corresponds to this.
5561 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5562 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5563 "vptest\t{$src2, $src1|$src1, $src2}",
5564 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5565 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5566 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5567 "vptest\t{$src2, $src1|$src1, $src2}",
5568 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5569 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5572 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5573 "vptest\t{$src2, $src1|$src1, $src2}",
5574 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5575 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5576 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5577 "vptest\t{$src2, $src1|$src1, $src2}",
5578 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5579 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5580 VEX, VEX_L, VEX_WIG;
5583 let Defs = [EFLAGS] in {
5584 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5585 "ptest\t{$src2, $src1|$src1, $src2}",
5586 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5587 Sched<[SchedWriteVecTest.XMM]>;
5588 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5589 "ptest\t{$src2, $src1|$src1, $src2}",
5590 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5591 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5594 // The bit test instructions below are AVX only
5595 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5596 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5597 X86FoldableSchedWrite sched> {
5598 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5599 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5600 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5601 Sched<[sched]>, VEX;
5602 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5603 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5604 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5605 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5608 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5609 let ExeDomain = SSEPackedSingle in {
5610 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5611 SchedWriteFTest.XMM>;
5612 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5613 SchedWriteFTest.YMM>, VEX_L;
5615 let ExeDomain = SSEPackedDouble in {
5616 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5617 SchedWriteFTest.XMM>;
5618 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5619 SchedWriteFTest.YMM>, VEX_L;
5623 //===----------------------------------------------------------------------===//
5624 // SSE4.1 - Misc Instructions
5625 //===----------------------------------------------------------------------===//
5627 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5628 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5629 "popcnt{w}\t{$src, $dst|$dst, $src}",
5630 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5631 Sched<[WritePOPCNT]>, OpSize16, XS;
5632 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5633 "popcnt{w}\t{$src, $dst|$dst, $src}",
5634 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5635 (implicit EFLAGS)]>,
5636 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5638 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5639 "popcnt{l}\t{$src, $dst|$dst, $src}",
5640 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5641 Sched<[WritePOPCNT]>, OpSize32, XS;
5643 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5644 "popcnt{l}\t{$src, $dst|$dst, $src}",
5645 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5646 (implicit EFLAGS)]>,
5647 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5649 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5650 "popcnt{q}\t{$src, $dst|$dst, $src}",
5651 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5652 Sched<[WritePOPCNT]>, XS;
5653 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5654 "popcnt{q}\t{$src, $dst|$dst, $src}",
5655 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5656 (implicit EFLAGS)]>,
5657 Sched<[WritePOPCNT.Folded]>, XS;
5660 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
5661 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5662 SDNode OpNode, PatFrag ld_frag,
5663 X86FoldableSchedWrite Sched> {
5664 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5666 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5667 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5669 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5671 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5673 (v8i16 (OpNode (ld_frag addr:$src))))]>,
5674 Sched<[Sched.Folded]>;
5677 // PHMIN has the same profile as PSAD, thus we use the same scheduling
5678 // model, although the naming is misleading.
5679 let Predicates = [HasAVX] in
5680 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5682 WritePHMINPOS>, VEX, VEX_WIG;
5683 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5687 /// SS48I_binop_rm - Simple SSE41 binary operator.
5688 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5689 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5690 X86MemOperand x86memop, X86FoldableSchedWrite sched,
5692 let isCommutable = 1 in
5693 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5694 (ins RC:$src1, RC:$src2),
5696 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5697 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5698 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5700 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5701 (ins RC:$src1, x86memop:$src2),
5703 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5704 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5706 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5707 Sched<[sched.Folded, sched.ReadAfterFold]>;
5710 let Predicates = [HasAVX, NoVLX] in {
5711 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5712 load, i128mem, SchedWriteVecALU.XMM, 0>,
5714 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5715 load, i128mem, SchedWriteVecALU.XMM, 0>,
5717 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5718 load, i128mem, SchedWriteVecALU.XMM, 0>,
5720 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5721 load, i128mem, SchedWriteVecALU.XMM, 0>,
5723 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5724 load, i128mem, SchedWriteVecIMul.XMM, 0>,
5727 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5728 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5729 load, i128mem, SchedWriteVecALU.XMM, 0>,
5731 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5732 load, i128mem, SchedWriteVecALU.XMM, 0>,
5734 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5735 load, i128mem, SchedWriteVecALU.XMM, 0>,
5737 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5738 load, i128mem, SchedWriteVecALU.XMM, 0>,
5742 let Predicates = [HasAVX2, NoVLX] in {
5743 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5744 load, i256mem, SchedWriteVecALU.YMM, 0>,
5745 VEX_4V, VEX_L, VEX_WIG;
5746 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5747 load, i256mem, SchedWriteVecALU.YMM, 0>,
5748 VEX_4V, VEX_L, VEX_WIG;
5749 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5750 load, i256mem, SchedWriteVecALU.YMM, 0>,
5751 VEX_4V, VEX_L, VEX_WIG;
5752 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5753 load, i256mem, SchedWriteVecALU.YMM, 0>,
5754 VEX_4V, VEX_L, VEX_WIG;
5755 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5756 load, i256mem, SchedWriteVecIMul.YMM, 0>,
5757 VEX_4V, VEX_L, VEX_WIG;
5759 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5760 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5761 load, i256mem, SchedWriteVecALU.YMM, 0>,
5762 VEX_4V, VEX_L, VEX_WIG;
5763 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5764 load, i256mem, SchedWriteVecALU.YMM, 0>,
5765 VEX_4V, VEX_L, VEX_WIG;
5766 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5767 load, i256mem, SchedWriteVecALU.YMM, 0>,
5768 VEX_4V, VEX_L, VEX_WIG;
5769 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5770 load, i256mem, SchedWriteVecALU.YMM, 0>,
5771 VEX_4V, VEX_L, VEX_WIG;
5774 let Constraints = "$src1 = $dst" in {
5775 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5776 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5777 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5778 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5779 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5780 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5781 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5782 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5783 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5784 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5785 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5786 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5787 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5788 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5789 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5790 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5791 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5792 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5795 let Predicates = [HasAVX, NoVLX] in
5796 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5797 load, i128mem, SchedWritePMULLD.XMM, 0>,
5799 let Predicates = [HasAVX] in
5800 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5801 load, i128mem, SchedWriteVecALU.XMM, 0>,
5804 let Predicates = [HasAVX2, NoVLX] in
5805 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5806 load, i256mem, SchedWritePMULLD.YMM, 0>,
5807 VEX_4V, VEX_L, VEX_WIG;
5808 let Predicates = [HasAVX2] in
5809 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5810 load, i256mem, SchedWriteVecALU.YMM, 0>,
5811 VEX_4V, VEX_L, VEX_WIG;
5813 let Constraints = "$src1 = $dst" in {
5814 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5815 memop, i128mem, SchedWritePMULLD.XMM, 1>;
5816 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5817 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5820 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
5821 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5822 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5823 X86MemOperand x86memop, bit Is2Addr,
5824 X86FoldableSchedWrite sched> {
5825 let isCommutable = 1 in
5826 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5827 (ins RC:$src1, RC:$src2, u8imm:$src3),
5829 !strconcat(OpcodeStr,
5830 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5831 !strconcat(OpcodeStr,
5832 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5833 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5835 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5836 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5838 !strconcat(OpcodeStr,
5839 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5840 !strconcat(OpcodeStr,
5841 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5843 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5844 Sched<[sched.Folded, sched.ReadAfterFold]>;
5847 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
5848 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5849 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5850 X86MemOperand x86memop, bit Is2Addr,
5851 X86FoldableSchedWrite sched> {
5852 let isCommutable = 1 in
5853 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5854 (ins RC:$src1, RC:$src2, u8imm:$src3),
5856 !strconcat(OpcodeStr,
5857 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5858 !strconcat(OpcodeStr,
5859 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5860 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5862 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5863 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5865 !strconcat(OpcodeStr,
5866 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5867 !strconcat(OpcodeStr,
5868 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5870 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5871 Sched<[sched.Folded, sched.ReadAfterFold]>;
5874 def BlendCommuteImm2 : SDNodeXForm<timm, [{
5875 uint8_t Imm = N->getZExtValue() & 0x03;
5876 return getI8Imm(Imm ^ 0x03, SDLoc(N));
5879 def BlendCommuteImm4 : SDNodeXForm<timm, [{
5880 uint8_t Imm = N->getZExtValue() & 0x0f;
5881 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5884 def BlendCommuteImm8 : SDNodeXForm<timm, [{
5885 uint8_t Imm = N->getZExtValue() & 0xff;
5886 return getI8Imm(Imm ^ 0xff, SDLoc(N));
5889 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
5890 def BlendScaleImm4 : SDNodeXForm<timm, [{
5891 uint8_t Imm = N->getZExtValue();
5893 for (unsigned i = 0; i != 4; ++i) {
5895 NewImm |= 0x3 << (i * 2);
5897 return getI8Imm(NewImm, SDLoc(N));
5900 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
5901 def BlendScaleImm2 : SDNodeXForm<timm, [{
5902 uint8_t Imm = N->getZExtValue();
5904 for (unsigned i = 0; i != 2; ++i) {
5906 NewImm |= 0xf << (i * 4);
5908 return getI8Imm(NewImm, SDLoc(N));
5911 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
5912 def BlendScaleImm2to4 : SDNodeXForm<timm, [{
5913 uint8_t Imm = N->getZExtValue();
5915 for (unsigned i = 0; i != 2; ++i) {
5917 NewImm |= 0x3 << (i * 2);
5919 return getI8Imm(NewImm, SDLoc(N));
5922 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
5923 def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
5924 uint8_t Imm = N->getZExtValue();
5926 for (unsigned i = 0; i != 4; ++i) {
5928 NewImm |= 0x3 << (i * 2);
5930 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5933 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
5934 def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
5935 uint8_t Imm = N->getZExtValue();
5937 for (unsigned i = 0; i != 2; ++i) {
5939 NewImm |= 0xf << (i * 4);
5941 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5944 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
5945 def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
5946 uint8_t Imm = N->getZExtValue();
5948 for (unsigned i = 0; i != 2; ++i) {
5950 NewImm |= 0x3 << (i * 2);
5952 return getI8Imm(NewImm ^ 0xf, SDLoc(N));
5955 let Predicates = [HasAVX] in {
5956 let isCommutable = 0 in {
5957 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
5958 VR128, load, i128mem, 0,
5959 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
5962 let ExeDomain = SSEPackedSingle in
5963 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
5964 VR128, load, f128mem, 0,
5965 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
5966 let ExeDomain = SSEPackedDouble in
5967 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
5968 VR128, load, f128mem, 0,
5969 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
5970 let ExeDomain = SSEPackedSingle in
5971 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
5972 VR256, load, i256mem, 0,
5973 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
5976 let Predicates = [HasAVX2] in {
5977 let isCommutable = 0 in {
5978 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
5979 VR256, load, i256mem, 0,
5980 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
5984 let Constraints = "$src1 = $dst" in {
5985 let isCommutable = 0 in {
5986 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
5987 VR128, memop, i128mem, 1,
5988 SchedWriteMPSAD.XMM>;
5991 let ExeDomain = SSEPackedSingle in
5992 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
5993 VR128, memop, f128mem, 1,
5994 SchedWriteDPPS.XMM>;
5995 let ExeDomain = SSEPackedDouble in
5996 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
5997 VR128, memop, f128mem, 1,
5998 SchedWriteDPPD.XMM>;
6001 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
6002 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6003 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6004 X86MemOperand x86memop, bit Is2Addr, Domain d,
6005 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6006 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6007 let isCommutable = 1 in
6008 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6009 (ins RC:$src1, RC:$src2, u8imm:$src3),
6011 !strconcat(OpcodeStr,
6012 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6013 !strconcat(OpcodeStr,
6014 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6015 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6017 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6018 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6020 !strconcat(OpcodeStr,
6021 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6022 !strconcat(OpcodeStr,
6023 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6025 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6026 Sched<[sched.Folded, sched.ReadAfterFold]>;
6029 // Pattern to commute if load is in first source.
6030 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6031 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6032 (commuteXForm timm:$src3))>;
6035 let Predicates = [HasAVX] in {
6036 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6037 VR128, load, f128mem, 0, SSEPackedSingle,
6038 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6040 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6041 VR256, load, f256mem, 0, SSEPackedSingle,
6042 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6043 VEX_4V, VEX_L, VEX_WIG;
6044 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6045 VR128, load, f128mem, 0, SSEPackedDouble,
6046 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6048 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6049 VR256, load, f256mem, 0, SSEPackedDouble,
6050 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6051 VEX_4V, VEX_L, VEX_WIG;
6052 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6053 VR128, load, i128mem, 0, SSEPackedInt,
6054 SchedWriteBlend.XMM, BlendCommuteImm8>,
6058 let Predicates = [HasAVX2] in {
6059 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6060 VR256, load, i256mem, 0, SSEPackedInt,
6061 SchedWriteBlend.YMM, BlendCommuteImm8>,
6062 VEX_4V, VEX_L, VEX_WIG;
6065 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6066 // ExecutionDomainFixPass will cleanup domains later on.
6067 let Predicates = [HasAVX1Only] in {
6068 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6069 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6070 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6071 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6072 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6073 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6075 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6076 // it from becoming movsd via commuting under optsize.
6077 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6078 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6079 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6080 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6081 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6082 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6084 def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6085 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6086 def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6087 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6088 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6089 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6091 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6092 // it from becoming movss via commuting under optsize.
6093 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6094 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6095 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6096 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6097 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6098 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6101 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6102 VR128, memop, f128mem, 1, SSEPackedSingle,
6103 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6104 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6105 VR128, memop, f128mem, 1, SSEPackedDouble,
6106 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6107 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6108 VR128, memop, i128mem, 1, SSEPackedInt,
6109 SchedWriteBlend.XMM, BlendCommuteImm8>;
6111 let Predicates = [UseSSE41] in {
6112 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6113 // it from becoming movss via commuting under optsize.
6114 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6115 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6116 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6117 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6118 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6119 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6121 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6122 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6123 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6124 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6125 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6126 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6129 // For insertion into the zero index (low half) of a 256-bit vector, it is
6130 // more efficient to generate a blend with immediate instead of an insert*128.
6131 let Predicates = [HasAVX] in {
6132 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6133 (VBLENDPDYrri VR256:$src1,
6134 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6135 VR128:$src2, sub_xmm), 0x3)>;
6136 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6137 (VBLENDPSYrri VR256:$src1,
6138 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6139 VR128:$src2, sub_xmm), 0xf)>;
6141 def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6142 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6143 VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6144 def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6145 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6146 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6149 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
6150 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6151 X86MemOperand x86memop, ValueType VT,
6152 PatFrag mem_frag, SDNode OpNode,
6153 X86FoldableSchedWrite sched> {
6154 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6155 (ins RC:$src1, RC:$src2, RC:$src3),
6156 !strconcat(OpcodeStr,
6157 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6158 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6159 SSEPackedInt>, TAPD, VEX_4V,
6162 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6163 (ins RC:$src1, x86memop:$src2, RC:$src3),
6164 !strconcat(OpcodeStr,
6165 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6167 (OpNode RC:$src3, (mem_frag addr:$src2),
6168 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6169 Sched<[sched.Folded, sched.ReadAfterFold,
6171 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6174 sched.ReadAfterFold]>;
6177 let Predicates = [HasAVX] in {
6178 let ExeDomain = SSEPackedDouble in {
6179 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6180 v2f64, loadv2f64, X86Blendv,
6181 SchedWriteFVarBlend.XMM>;
6182 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6183 v4f64, loadv4f64, X86Blendv,
6184 SchedWriteFVarBlend.YMM>, VEX_L;
6185 } // ExeDomain = SSEPackedDouble
6186 let ExeDomain = SSEPackedSingle in {
6187 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6188 v4f32, loadv4f32, X86Blendv,
6189 SchedWriteFVarBlend.XMM>;
6190 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6191 v8f32, loadv8f32, X86Blendv,
6192 SchedWriteFVarBlend.YMM>, VEX_L;
6193 } // ExeDomain = SSEPackedSingle
6194 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6195 v16i8, loadv16i8, X86Blendv,
6196 SchedWriteVarBlend.XMM>;
6199 let Predicates = [HasAVX2] in {
6200 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6201 v32i8, loadv32i8, X86Blendv,
6202 SchedWriteVarBlend.YMM>, VEX_L;
6205 let Predicates = [HasAVX] in {
6206 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6207 (v4i32 VR128:$src2))),
6208 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6209 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6210 (v2i64 VR128:$src2))),
6211 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6212 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6213 (v8i32 VR256:$src2))),
6214 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6215 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6216 (v4i64 VR256:$src2))),
6217 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6220 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6221 // changed to use blends because blends have better throughput on sandybridge
6222 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6223 let Predicates = [HasAVX, OptForSpeed] in {
6224 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6225 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6226 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6227 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6229 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6230 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6231 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6232 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6233 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6234 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6236 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6237 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6238 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6239 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6240 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6241 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6243 // Move low f32 and clear high bits.
6244 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6245 (SUBREG_TO_REG (i32 0),
6246 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6247 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6248 (i8 1))), sub_xmm)>;
6249 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6250 (SUBREG_TO_REG (i32 0),
6251 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6252 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6253 (i8 3))), sub_xmm)>;
6256 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6257 // changed to use blends because blends have better throughput on sandybridge
6258 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6259 let Predicates = [UseSSE41, OptForSpeed] in {
6260 // With SSE41 we can use blends for these patterns.
6261 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6262 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6263 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6264 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6266 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6267 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6268 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6269 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6270 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6271 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6273 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6274 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6275 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6276 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6277 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6278 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6282 /// SS41I_ternary - SSE 4.1 ternary operator
6283 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6284 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6285 PatFrag mem_frag, X86MemOperand x86memop,
6286 SDNode OpNode, X86FoldableSchedWrite sched> {
6287 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6288 (ins VR128:$src1, VR128:$src2),
6289 !strconcat(OpcodeStr,
6290 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6292 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6295 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6296 (ins VR128:$src1, x86memop:$src2),
6297 !strconcat(OpcodeStr,
6298 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6300 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6301 Sched<[sched.Folded, sched.ReadAfterFold]>;
6305 let ExeDomain = SSEPackedDouble in
6306 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6307 X86Blendv, SchedWriteFVarBlend.XMM>;
6308 let ExeDomain = SSEPackedSingle in
6309 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6310 X86Blendv, SchedWriteFVarBlend.XMM>;
6311 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6312 X86Blendv, SchedWriteVarBlend.XMM>;
6314 // Aliases with the implicit xmm0 argument
6315 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6316 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6317 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6318 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6319 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6320 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6321 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6322 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6323 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6324 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6325 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6326 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6328 let Predicates = [UseSSE41] in {
6329 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6330 (v4i32 VR128:$src2))),
6331 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6332 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6333 (v2i64 VR128:$src2))),
6334 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6337 let AddedComplexity = 400 in { // Prefer non-temporal versions
6339 let Predicates = [HasAVX, NoVLX] in
6340 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6341 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6342 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6343 let Predicates = [HasAVX2, NoVLX] in
6344 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6345 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6346 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6347 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6348 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6349 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6351 let Predicates = [HasAVX2, NoVLX] in {
6352 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6353 (VMOVNTDQAYrm addr:$src)>;
6354 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6355 (VMOVNTDQAYrm addr:$src)>;
6356 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6357 (VMOVNTDQAYrm addr:$src)>;
6358 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6359 (VMOVNTDQAYrm addr:$src)>;
6360 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6361 (VMOVNTDQAYrm addr:$src)>;
6362 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6363 (VMOVNTDQAYrm addr:$src)>;
6366 let Predicates = [HasAVX, NoVLX] in {
6367 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6368 (VMOVNTDQArm addr:$src)>;
6369 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6370 (VMOVNTDQArm addr:$src)>;
6371 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6372 (VMOVNTDQArm addr:$src)>;
6373 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6374 (VMOVNTDQArm addr:$src)>;
6375 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6376 (VMOVNTDQArm addr:$src)>;
6377 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6378 (VMOVNTDQArm addr:$src)>;
6381 let Predicates = [UseSSE41] in {
6382 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6383 (MOVNTDQArm addr:$src)>;
6384 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6385 (MOVNTDQArm addr:$src)>;
6386 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6387 (MOVNTDQArm addr:$src)>;
6388 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6389 (MOVNTDQArm addr:$src)>;
6390 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6391 (MOVNTDQArm addr:$src)>;
6392 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6393 (MOVNTDQArm addr:$src)>;
6396 } // AddedComplexity
6398 //===----------------------------------------------------------------------===//
6399 // SSE4.2 - Compare Instructions
6400 //===----------------------------------------------------------------------===//
6402 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
6403 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6404 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6405 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6407 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6408 (ins RC:$src1, RC:$src2),
6410 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6411 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6412 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6414 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6415 (ins RC:$src1, x86memop:$src2),
6417 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6418 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6420 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6421 Sched<[sched.Folded, sched.ReadAfterFold]>;
6424 let Predicates = [HasAVX] in
6425 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6426 load, i128mem, SchedWriteVecALU.XMM, 0>,
6429 let Predicates = [HasAVX2] in
6430 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6431 load, i256mem, SchedWriteVecALU.YMM, 0>,
6432 VEX_4V, VEX_L, VEX_WIG;
6434 let Constraints = "$src1 = $dst" in
6435 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6436 memop, i128mem, SchedWriteVecALU.XMM>;
6438 //===----------------------------------------------------------------------===//
6439 // SSE4.2 - String/text Processing Instructions
6440 //===----------------------------------------------------------------------===//
6442 multiclass pcmpistrm_SS42AI<string asm> {
6443 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6444 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6445 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6446 []>, Sched<[WritePCmpIStrM]>;
6448 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6449 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6450 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6451 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6454 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6455 let Predicates = [HasAVX] in
6456 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6457 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
6460 multiclass SS42AI_pcmpestrm<string asm> {
6461 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6462 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6463 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6464 []>, Sched<[WritePCmpEStrM]>;
6466 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6467 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6468 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6469 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6472 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6473 let Predicates = [HasAVX] in
6474 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6475 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6478 multiclass SS42AI_pcmpistri<string asm> {
6479 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6480 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6481 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6482 []>, Sched<[WritePCmpIStrI]>;
6484 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6485 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6486 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6487 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6490 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6491 let Predicates = [HasAVX] in
6492 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6493 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6496 multiclass SS42AI_pcmpestri<string asm> {
6497 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6498 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6499 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6500 []>, Sched<[WritePCmpEStrI]>;
6502 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6503 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6504 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6505 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6508 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6509 let Predicates = [HasAVX] in
6510 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6511 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6514 //===----------------------------------------------------------------------===//
6515 // SSE4.2 - CRC Instructions
6516 //===----------------------------------------------------------------------===//
6518 // No CRC instructions have AVX equivalents
6520 // crc intrinsic instruction
6521 // This set of instructions are only rm, the only difference is the size
6523 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6524 RegisterClass RCIn, SDPatternOperator Int> :
6525 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6526 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6527 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6528 Sched<[WriteCRC32]>;
6530 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6531 X86MemOperand x86memop, SDPatternOperator Int> :
6532 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6533 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6534 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6535 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6537 let Constraints = "$src1 = $dst" in {
6538 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6539 int_x86_sse42_crc32_32_8>;
6540 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6541 int_x86_sse42_crc32_32_8>;
6542 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6543 int_x86_sse42_crc32_32_16>, OpSize16;
6544 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6545 int_x86_sse42_crc32_32_16>, OpSize16;
6546 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6547 int_x86_sse42_crc32_32_32>, OpSize32;
6548 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6549 int_x86_sse42_crc32_32_32>, OpSize32;
6550 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6551 int_x86_sse42_crc32_64_64>, REX_W;
6552 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6553 int_x86_sse42_crc32_64_64>, REX_W;
6554 let hasSideEffects = 0 in {
6556 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6558 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6563 //===----------------------------------------------------------------------===//
6564 // SHA-NI Instructions
6565 //===----------------------------------------------------------------------===//
6567 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6568 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6569 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6570 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6571 (ins VR128:$src1, VR128:$src2),
6573 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6574 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6576 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6577 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6580 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6581 (ins VR128:$src1, i128mem:$src2),
6583 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6584 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6586 (set VR128:$dst, (IntId VR128:$src1,
6587 (memop addr:$src2), XMM0)),
6588 (set VR128:$dst, (IntId VR128:$src1,
6589 (memop addr:$src2))))]>, T8,
6590 Sched<[sched.Folded, sched.ReadAfterFold]>;
6593 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6594 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6595 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6596 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6598 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6599 (i8 timm:$src3)))]>, TA,
6600 Sched<[SchedWriteVecIMul.XMM]>;
6601 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6602 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6603 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6605 (int_x86_sha1rnds4 VR128:$src1,
6607 (i8 timm:$src3)))]>, TA,
6608 Sched<[SchedWriteVecIMul.XMM.Folded,
6609 SchedWriteVecIMul.XMM.ReadAfterFold]>;
6611 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6612 SchedWriteVecIMul.XMM>;
6613 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6614 SchedWriteVecIMul.XMM>;
6615 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6616 SchedWriteVecIMul.XMM>;
6619 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6620 SchedWriteVecIMul.XMM, 1>;
6622 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6623 SchedWriteVecIMul.XMM>;
6624 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6625 SchedWriteVecIMul.XMM>;
6628 // Aliases with explicit %xmm0
6629 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6630 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6631 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6632 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6634 //===----------------------------------------------------------------------===//
6635 // AES-NI Instructions
6636 //===----------------------------------------------------------------------===//
6638 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6639 Intrinsic IntId, PatFrag ld_frag,
6640 bit Is2Addr = 0, RegisterClass RC = VR128,
6641 X86MemOperand MemOp = i128mem> {
6642 let AsmString = OpcodeStr##
6643 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6644 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6645 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6646 (ins RC:$src1, RC:$src2), "",
6647 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6648 Sched<[WriteAESDecEnc]>;
6649 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6650 (ins RC:$src1, MemOp:$src2), "",
6651 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6652 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6656 // Perform One Round of an AES Encryption/Decryption Flow
6657 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6658 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
6659 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6660 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
6661 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6662 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
6663 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6664 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
6665 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6668 let Predicates = [NoVLX, HasVAES] in {
6669 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
6670 int_x86_aesni_aesenc_256, load, 0, VR256,
6671 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6672 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
6673 int_x86_aesni_aesenclast_256, load, 0, VR256,
6674 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6675 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
6676 int_x86_aesni_aesdec_256, load, 0, VR256,
6677 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6678 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
6679 int_x86_aesni_aesdeclast_256, load, 0, VR256,
6680 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6683 let Constraints = "$src1 = $dst" in {
6684 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
6685 int_x86_aesni_aesenc, memop, 1>;
6686 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
6687 int_x86_aesni_aesenclast, memop, 1>;
6688 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
6689 int_x86_aesni_aesdec, memop, 1>;
6690 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
6691 int_x86_aesni_aesdeclast, memop, 1>;
6694 // Perform the AES InvMixColumn Transformation
6695 let Predicates = [HasAVX, HasAES] in {
6696 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6698 "vaesimc\t{$src1, $dst|$dst, $src1}",
6700 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6702 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6703 (ins i128mem:$src1),
6704 "vaesimc\t{$src1, $dst|$dst, $src1}",
6705 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6706 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6708 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6710 "aesimc\t{$src1, $dst|$dst, $src1}",
6712 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6713 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6714 (ins i128mem:$src1),
6715 "aesimc\t{$src1, $dst|$dst, $src1}",
6716 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6717 Sched<[WriteAESIMC.Folded]>;
6719 // AES Round Key Generation Assist
6720 let Predicates = [HasAVX, HasAES] in {
6721 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6722 (ins VR128:$src1, u8imm:$src2),
6723 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6725 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6726 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6727 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6728 (ins i128mem:$src1, u8imm:$src2),
6729 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6731 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6732 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6734 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6735 (ins VR128:$src1, u8imm:$src2),
6736 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6738 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6739 Sched<[WriteAESKeyGen]>;
6740 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6741 (ins i128mem:$src1, u8imm:$src2),
6742 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6744 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6745 Sched<[WriteAESKeyGen.Folded]>;
6747 //===----------------------------------------------------------------------===//
6748 // PCLMUL Instructions
6749 //===----------------------------------------------------------------------===//
6751 // Immediate transform to help with commuting.
6752 def PCLMULCommuteImm : SDNodeXForm<timm, [{
6753 uint8_t Imm = N->getZExtValue();
6754 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6757 // SSE carry-less Multiplication instructions
6758 let Predicates = [NoAVX, HasPCLMUL] in {
6759 let Constraints = "$src1 = $dst" in {
6760 let isCommutable = 1 in
6761 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6762 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6763 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6765 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6766 Sched<[WriteCLMul]>;
6768 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6769 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6770 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6772 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6774 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6775 } // Constraints = "$src1 = $dst"
6777 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6779 (PCLMULQDQrm VR128:$src1, addr:$src2,
6780 (PCLMULCommuteImm timm:$src3))>;
6781 } // Predicates = [NoAVX, HasPCLMUL]
6784 foreach HI = ["hq","lq"] in
6785 foreach LO = ["hq","lq"] in {
6786 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6787 (PCLMULQDQrr VR128:$dst, VR128:$src,
6788 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6789 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6790 (PCLMULQDQrm VR128:$dst, i128mem:$src,
6791 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6794 // AVX carry-less Multiplication instructions
6795 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6796 PatFrag LdFrag, Intrinsic IntId> {
6797 let isCommutable = 1 in
6798 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6799 (ins RC:$src1, RC:$src2, u8imm:$src3),
6800 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6802 (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6803 Sched<[WriteCLMul]>;
6805 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6806 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6807 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6809 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6810 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6812 // We can commute a load in the first operand by swapping the sources and
6813 // rotating the immediate.
6814 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6815 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6816 (PCLMULCommuteImm timm:$src3))>;
6819 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6820 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6821 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6823 let Predicates = [NoVLX, HasVPCLMULQDQ] in
6824 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6825 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
6827 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6828 X86MemOperand MemOp, string Hi, string Lo> {
6829 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6830 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6831 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6832 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6833 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6834 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6837 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6838 X86MemOperand MemOp> {
6839 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6840 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6841 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6842 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6846 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6847 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6849 //===----------------------------------------------------------------------===//
6850 // SSE4A Instructions
6851 //===----------------------------------------------------------------------===//
6853 let Predicates = [HasSSE4A] in {
6855 let ExeDomain = SSEPackedInt in {
6856 let Constraints = "$src = $dst" in {
6857 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6858 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6859 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6860 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6862 PD, Sched<[SchedWriteVecALU.XMM]>;
6863 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6864 (ins VR128:$src, VR128:$mask),
6865 "extrq\t{$mask, $src|$src, $mask}",
6866 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6868 PD, Sched<[SchedWriteVecALU.XMM]>;
6870 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6871 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6872 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6873 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6874 timm:$len, timm:$idx))]>,
6875 XD, Sched<[SchedWriteVecALU.XMM]>;
6876 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6877 (ins VR128:$src, VR128:$mask),
6878 "insertq\t{$mask, $src|$src, $mask}",
6879 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6881 XD, Sched<[SchedWriteVecALU.XMM]>;
6883 } // ExeDomain = SSEPackedInt
6885 // Non-temporal (unaligned) scalar stores.
6886 let AddedComplexity = 400 in { // Prefer non-temporal versions
6887 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6888 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6889 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6891 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6892 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6895 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6896 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6898 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6899 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6901 } // AddedComplexity
6904 //===----------------------------------------------------------------------===//
6906 //===----------------------------------------------------------------------===//
6908 //===----------------------------------------------------------------------===//
6909 // VBROADCAST - Load from memory and broadcast to all elements of the
6910 // destination operand
6912 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6913 X86MemOperand x86memop, ValueType VT,
6914 PatFrag bcast_frag, SchedWrite Sched> :
6915 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6916 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6917 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6918 Sched<[Sched]>, VEX;
6920 // AVX2 adds register forms
6921 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6922 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6923 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6924 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6925 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6926 Sched<[Sched]>, VEX;
6928 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6929 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6930 f32mem, v4f32, X86VBroadcastld32,
6931 SchedWriteFShuffle.XMM.Folded>;
6932 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6933 f32mem, v8f32, X86VBroadcastld32,
6934 SchedWriteFShuffle.XMM.Folded>, VEX_L;
6936 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6937 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6938 v4f64, X86VBroadcastld64,
6939 SchedWriteFShuffle.XMM.Folded>, VEX_L;
6941 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6942 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6943 v4f32, v4f32, SchedWriteFShuffle.XMM>;
6944 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6945 v8f32, v4f32, WriteFShuffle256>, VEX_L;
6947 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6948 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6949 v4f64, v2f64, WriteFShuffle256>, VEX_L;
6951 //===----------------------------------------------------------------------===//
6952 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
6953 // halves of a 256-bit vector.
6955 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
6956 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
6958 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
6959 Sched<[WriteShuffleLd]>, VEX, VEX_L;
6961 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
6962 ExeDomain = SSEPackedSingle in
6963 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
6965 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
6966 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
6968 let Predicates = [HasAVX, NoVLX] in {
6969 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
6970 (VBROADCASTF128 addr:$src)>;
6971 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
6972 (VBROADCASTF128 addr:$src)>;
6975 // NOTE: We're using FP instructions here, but execution domain fixing can
6976 // convert to integer when profitable.
6977 let Predicates = [HasAVX, NoVLX] in {
6978 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
6979 (VBROADCASTF128 addr:$src)>;
6980 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
6981 (VBROADCASTF128 addr:$src)>;
6982 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
6983 (VBROADCASTF128 addr:$src)>;
6984 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
6985 (VBROADCASTF128 addr:$src)>;
6988 //===----------------------------------------------------------------------===//
6989 // VINSERTF128 - Insert packed floating-point values
6991 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
6992 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
6993 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
6994 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6995 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
6997 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
6998 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
6999 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7000 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7003 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
7004 // with YMM register containing zero.
7005 // FIXME: Avoid producing vxorps to clear the fake inputs.
7006 let Predicates = [HasAVX1Only] in {
7007 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7010 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7011 PatFrag memop_frag> {
7012 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7014 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7015 (INSERT_get_vinsert128_imm VR256:$ins))>;
7016 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7017 (From (memop_frag addr:$src2)),
7019 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7020 (INSERT_get_vinsert128_imm VR256:$ins))>;
7023 let Predicates = [HasAVX, NoVLX] in {
7024 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7025 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7028 let Predicates = [HasAVX1Only] in {
7029 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7030 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7031 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7032 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7035 //===----------------------------------------------------------------------===//
7036 // VEXTRACTF128 - Extract packed floating-point values
7038 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7039 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7040 (ins VR256:$src1, u8imm:$src2),
7041 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7042 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7044 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7045 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7046 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7047 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7050 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7051 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7052 (To (!cast<Instruction>(InstrStr#rr)
7054 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7055 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7056 (iPTR imm))), addr:$dst),
7057 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7058 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7062 let Predicates = [HasAVX, NoVLX] in {
7063 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7064 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7067 let Predicates = [HasAVX1Only] in {
7068 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7069 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7070 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7071 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7074 //===----------------------------------------------------------------------===//
7075 // VMASKMOV - Conditional SIMD Packed Loads and Stores
7077 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7078 Intrinsic IntLd, Intrinsic IntLd256,
7079 Intrinsic IntSt, Intrinsic IntSt256,
7080 X86SchedWriteMaskMove schedX,
7081 X86SchedWriteMaskMove schedY> {
7082 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7083 (ins VR128:$src1, f128mem:$src2),
7084 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7085 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7086 VEX_4V, Sched<[schedX.RM]>;
7087 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7088 (ins VR256:$src1, f256mem:$src2),
7089 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7090 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7091 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7092 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7093 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7094 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7095 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7096 VEX_4V, Sched<[schedX.MR]>;
7097 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7098 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7099 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7100 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7101 VEX_4V, VEX_L, Sched<[schedY.MR]>;
7104 let ExeDomain = SSEPackedSingle in
7105 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7106 int_x86_avx_maskload_ps,
7107 int_x86_avx_maskload_ps_256,
7108 int_x86_avx_maskstore_ps,
7109 int_x86_avx_maskstore_ps_256,
7110 WriteFMaskMove32, WriteFMaskMove32Y>;
7111 let ExeDomain = SSEPackedDouble in
7112 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7113 int_x86_avx_maskload_pd,
7114 int_x86_avx_maskload_pd_256,
7115 int_x86_avx_maskstore_pd,
7116 int_x86_avx_maskstore_pd_256,
7117 WriteFMaskMove64, WriteFMaskMove64Y>;
7119 //===----------------------------------------------------------------------===//
7120 // VPERMIL - Permute Single and Double Floating-Point Values
7123 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7124 RegisterClass RC, X86MemOperand x86memop_f,
7125 X86MemOperand x86memop_i,
7126 ValueType f_vt, ValueType i_vt,
7127 X86FoldableSchedWrite sched,
7128 X86FoldableSchedWrite varsched> {
7129 let Predicates = [HasAVX, NoVLX] in {
7130 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7131 (ins RC:$src1, RC:$src2),
7132 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7133 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7135 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7136 (ins RC:$src1, x86memop_i:$src2),
7137 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7138 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7139 (i_vt (load addr:$src2)))))]>, VEX_4V,
7140 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7142 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7143 (ins RC:$src1, u8imm:$src2),
7144 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7145 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7147 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7148 (ins x86memop_f:$src1, u8imm:$src2),
7149 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7151 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7152 Sched<[sched.Folded]>;
7153 }// Predicates = [HasAVX, NoVLX]
7156 let ExeDomain = SSEPackedSingle in {
7157 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7158 v4f32, v4i32, SchedWriteFShuffle.XMM,
7159 SchedWriteFVarShuffle.XMM>;
7160 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7161 v8f32, v8i32, SchedWriteFShuffle.YMM,
7162 SchedWriteFVarShuffle.YMM>, VEX_L;
7164 let ExeDomain = SSEPackedDouble in {
7165 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7166 v2f64, v2i64, SchedWriteFShuffle.XMM,
7167 SchedWriteFVarShuffle.XMM>;
7168 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7169 v4f64, v4i64, SchedWriteFShuffle.YMM,
7170 SchedWriteFVarShuffle.YMM>, VEX_L;
7173 //===----------------------------------------------------------------------===//
7174 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7177 let ExeDomain = SSEPackedSingle in {
7178 let isCommutable = 1 in
7179 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7180 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7181 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7182 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7183 (i8 timm:$src3))))]>, VEX_4V, VEX_L,
7184 Sched<[WriteFShuffle256]>;
7185 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7186 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7187 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7188 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7189 (i8 timm:$src3)))]>, VEX_4V, VEX_L,
7190 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7193 // Immediate transform to help with commuting.
7194 def Perm2XCommuteImm : SDNodeXForm<timm, [{
7195 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7198 let Predicates = [HasAVX] in {
7199 // Pattern with load in other operand.
7200 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7201 VR256:$src1, (i8 timm:$imm))),
7202 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7205 let Predicates = [HasAVX1Only] in {
7206 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7207 (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
7208 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7209 (loadv4i64 addr:$src2), (i8 timm:$imm))),
7210 (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
7211 // Pattern with load in other operand.
7212 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7213 VR256:$src1, (i8 timm:$imm))),
7214 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7217 //===----------------------------------------------------------------------===//
7218 // VZERO - Zero YMM registers
7219 // Note: These instruction do not affect the YMM16-YMM31.
7222 let SchedRW = [WriteSystem] in {
7223 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7224 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7225 // Zero All YMM registers
7226 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7227 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7228 Requires<[HasAVX]>, VEX_WIG;
7230 // Zero Upper bits of YMM registers
7231 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7232 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7233 Requires<[HasAVX]>, VEX_WIG;
7237 //===----------------------------------------------------------------------===//
7238 // Half precision conversion instructions
7241 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7242 X86FoldableSchedWrite sched> {
7243 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7244 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7245 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7246 T8PD, VEX, Sched<[sched]>;
7247 let hasSideEffects = 0, mayLoad = 1 in
7248 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7249 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7250 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
7251 T8PD, VEX, Sched<[sched.Folded]>;
7254 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7255 SchedWrite RR, SchedWrite MR> {
7256 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7257 (ins RC:$src1, i32u8imm:$src2),
7258 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7259 [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
7260 TAPD, VEX, Sched<[RR]>;
7261 let hasSideEffects = 0, mayStore = 1 in
7262 def mr : Ii8<0x1D, MRMDestMem, (outs),
7263 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7264 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7265 TAPD, VEX, Sched<[MR]>;
7268 let Predicates = [HasF16C, NoVLX] in {
7269 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
7270 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
7271 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7273 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7274 WriteCvtPS2PHYSt>, VEX_L;
7276 // Pattern match vcvtph2ps of a scalar i64 load.
7277 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7278 (VCVTPH2PSrm addr:$src)>;
7279 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
7280 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7281 (VCVTPH2PSrm addr:$src)>;
7283 def : Pat<(store (f64 (extractelt
7284 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
7285 (iPTR 0))), addr:$dst),
7286 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7287 def : Pat<(store (i64 (extractelt
7288 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
7289 (iPTR 0))), addr:$dst),
7290 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7291 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7292 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7295 // Patterns for matching conversions from float to half-float and vice versa.
7296 let Predicates = [HasF16C, NoVLX] in {
7297 // Use MXCSR.RC for rounding instead of explicitly specifying the default
7298 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
7299 // configurations we support (the default). However, falling back to MXCSR is
7300 // more consistent with other instructions, which are always controlled by it.
7301 // It's encoded as 0b100.
7302 def : Pat<(fp_to_f16 FR32:$src),
7303 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
7304 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
7306 def : Pat<(f16_to_fp GR16:$src),
7307 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7308 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
7310 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
7311 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7312 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
7315 //===----------------------------------------------------------------------===//
7316 // AVX2 Instructions
7317 //===----------------------------------------------------------------------===//
7319 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
7320 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7321 ValueType OpVT, X86FoldableSchedWrite sched,
7323 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7324 let isCommutable = 1 in
7325 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7326 (ins RC:$src1, RC:$src2, u8imm:$src3),
7327 !strconcat(OpcodeStr,
7328 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7329 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7330 Sched<[sched]>, VEX_4V;
7331 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7332 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7333 !strconcat(OpcodeStr,
7334 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7336 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7337 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7339 // Pattern to commute if load is in first source.
7340 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7341 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7342 (commuteXForm timm:$src3))>;
7345 let Predicates = [HasAVX2] in {
7346 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7347 SchedWriteBlend.XMM, VR128, i128mem,
7349 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7350 SchedWriteBlend.YMM, VR256, i256mem,
7351 BlendCommuteImm8>, VEX_L;
7353 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7354 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7355 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7356 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7357 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7358 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7360 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7361 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7362 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7363 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7364 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7365 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7368 // For insertion into the zero index (low half) of a 256-bit vector, it is
7369 // more efficient to generate a blend with immediate instead of an insert*128.
7370 // NOTE: We're using FP instructions here, but exeuction domain fixing should
7371 // take care of using integer instructions when profitable.
7372 let Predicates = [HasAVX] in {
7373 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7374 (VBLENDPSYrri VR256:$src1,
7375 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7376 VR128:$src2, sub_xmm), 0xf)>;
7377 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7378 (VBLENDPSYrri VR256:$src1,
7379 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7380 VR128:$src2, sub_xmm), 0xf)>;
7381 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7382 (VBLENDPSYrri VR256:$src1,
7383 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7384 VR128:$src2, sub_xmm), 0xf)>;
7385 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7386 (VBLENDPSYrri VR256:$src1,
7387 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7388 VR128:$src2, sub_xmm), 0xf)>;
7390 def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7391 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7392 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7393 def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7394 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7395 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7396 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7397 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7398 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7399 def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7400 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7401 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7404 //===----------------------------------------------------------------------===//
7405 // VPBROADCAST - Load from memory and broadcast to all elements of the
7406 // destination operand
7408 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7409 X86MemOperand x86memop, PatFrag bcast_frag,
7410 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7411 let Predicates = [HasAVX2, prd] in {
7412 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7413 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7415 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7416 Sched<[SchedWriteShuffle.XMM]>, VEX;
7417 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7418 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7420 (OpVT128 (bcast_frag addr:$src)))]>,
7421 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7422 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7423 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7425 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7426 Sched<[WriteShuffle256]>, VEX, VEX_L;
7427 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7428 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7430 (OpVT256 (bcast_frag addr:$src)))]>,
7431 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7433 // Provide aliases for broadcast from the same register class that
7434 // automatically does the extract.
7435 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7436 (!cast<Instruction>(NAME#"Yrr")
7437 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7441 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7442 v16i8, v32i8, NoVLX_Or_NoBWI>;
7443 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7444 v8i16, v16i16, NoVLX_Or_NoBWI>;
7445 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7446 v4i32, v8i32, NoVLX>;
7447 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7448 v2i64, v4i64, NoVLX>;
7450 let Predicates = [HasAVX2, NoVLX] in {
7451 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
7452 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7453 (VPBROADCASTQrm addr:$src)>;
7454 def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7455 (VPBROADCASTQYrm addr:$src)>;
7457 // FIXME this is to handle aligned extloads from i8/i16.
7458 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
7459 (VPBROADCASTDrm addr:$src)>;
7460 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
7461 (VPBROADCASTDYrm addr:$src)>;
7463 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7464 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7465 // This means we'll encounter truncated i32 loads; match that here.
7466 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7467 (VPBROADCASTWrm addr:$src)>;
7468 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7469 (VPBROADCASTWYrm addr:$src)>;
7470 def : Pat<(v8i16 (X86VBroadcast
7471 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7472 (VPBROADCASTWrm addr:$src)>;
7473 def : Pat<(v8i16 (X86VBroadcast
7474 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7475 (VPBROADCASTWrm addr:$src)>;
7476 def : Pat<(v16i16 (X86VBroadcast
7477 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7478 (VPBROADCASTWYrm addr:$src)>;
7479 def : Pat<(v16i16 (X86VBroadcast
7480 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7481 (VPBROADCASTWYrm addr:$src)>;
7483 // FIXME this is to handle aligned extloads from i8.
7484 def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
7485 (VPBROADCASTWrm addr:$src)>;
7486 def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
7487 (VPBROADCASTWYrm addr:$src)>;
7490 let Predicates = [HasAVX2, NoVLX] in {
7491 // Provide fallback in case the load node that is used in the patterns above
7492 // is used by additional users, which prevents the pattern selection.
7493 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7494 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7495 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7496 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7497 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7498 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7501 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7502 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7503 (VPBROADCASTBrr (VMOVDI2PDIrr
7504 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7505 GR8:$src, sub_8bit))))>;
7506 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7507 (VPBROADCASTBYrr (VMOVDI2PDIrr
7508 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7509 GR8:$src, sub_8bit))))>;
7511 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7512 (VPBROADCASTWrr (VMOVDI2PDIrr
7513 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7514 GR16:$src, sub_16bit))))>;
7515 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7516 (VPBROADCASTWYrr (VMOVDI2PDIrr
7517 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7518 GR16:$src, sub_16bit))))>;
7520 let Predicates = [HasAVX2, NoVLX] in {
7521 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7522 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7523 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7524 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7525 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7526 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7527 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7528 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7531 // AVX1 broadcast patterns
7532 let Predicates = [HasAVX1Only] in {
7533 def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7534 (VBROADCASTSSYrm addr:$src)>;
7535 def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7536 (VBROADCASTSDYrm addr:$src)>;
7537 def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7538 (VBROADCASTSSrm addr:$src)>;
7541 // Provide fallback in case the load node that is used in the patterns above
7542 // is used by additional users, which prevents the pattern selection.
7543 let Predicates = [HasAVX, NoVLX] in {
7544 // 128bit broadcasts:
7545 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7546 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7547 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7548 (VMOVDDUPrm addr:$src)>;
7550 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7551 (VMOVDDUPrr VR128:$src)>;
7552 def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
7553 (VMOVDDUPrm addr:$src)>;
7554 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
7555 (VMOVDDUPrm addr:$src)>;
7558 let Predicates = [HasAVX1Only] in {
7559 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7560 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7561 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7562 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7563 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7564 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7565 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7566 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7567 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7568 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7570 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7571 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7572 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7573 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7574 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7575 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7576 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7577 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7578 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7579 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7581 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7582 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7583 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7584 (VMOVDDUPrm addr:$src)>;
7587 //===----------------------------------------------------------------------===//
7588 // VPERM - Permute instructions
7591 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7592 ValueType OpVT, X86FoldableSchedWrite Sched,
7593 X86MemOperand memOp> {
7594 let Predicates = [HasAVX2, NoVLX] in {
7595 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7596 (ins VR256:$src1, VR256:$src2),
7597 !strconcat(OpcodeStr,
7598 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7600 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7601 Sched<[Sched]>, VEX_4V, VEX_L;
7602 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7603 (ins VR256:$src1, memOp:$src2),
7604 !strconcat(OpcodeStr,
7605 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7607 (OpVT (X86VPermv VR256:$src1,
7608 (load addr:$src2))))]>,
7609 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7613 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7614 let ExeDomain = SSEPackedSingle in
7615 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7617 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7618 ValueType OpVT, X86FoldableSchedWrite Sched,
7619 X86MemOperand memOp> {
7620 let Predicates = [HasAVX2, NoVLX] in {
7621 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7622 (ins VR256:$src1, u8imm:$src2),
7623 !strconcat(OpcodeStr,
7624 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7626 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7627 Sched<[Sched]>, VEX, VEX_L;
7628 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7629 (ins memOp:$src1, u8imm:$src2),
7630 !strconcat(OpcodeStr,
7631 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7633 (OpVT (X86VPermi (mem_frag addr:$src1),
7634 (i8 timm:$src2))))]>,
7635 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7639 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7640 WriteShuffle256, i256mem>, VEX_W;
7641 let ExeDomain = SSEPackedDouble in
7642 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7643 WriteFShuffle256, f256mem>, VEX_W;
7645 //===----------------------------------------------------------------------===//
7646 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
7648 let isCommutable = 1 in
7649 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7650 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7651 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7652 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7653 (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
7655 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7656 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7657 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7658 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7659 (i8 timm:$src3)))]>,
7660 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7662 let Predicates = [HasAVX2] in
7663 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7664 VR256:$src1, (i8 timm:$imm))),
7665 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7668 //===----------------------------------------------------------------------===//
7669 // VINSERTI128 - Insert packed integer values
7671 let hasSideEffects = 0 in {
7672 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7673 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7674 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7675 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7677 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7678 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7679 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7680 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7683 let Predicates = [HasAVX2, NoVLX] in {
7684 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
7685 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
7686 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
7687 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
7690 //===----------------------------------------------------------------------===//
7691 // VEXTRACTI128 - Extract packed integer values
7693 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7694 (ins VR256:$src1, u8imm:$src2),
7695 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7696 Sched<[WriteShuffle256]>, VEX, VEX_L;
7697 let hasSideEffects = 0, mayStore = 1 in
7698 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7699 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7700 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7701 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7703 let Predicates = [HasAVX2, NoVLX] in {
7704 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
7705 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
7706 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7707 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7710 //===----------------------------------------------------------------------===//
7711 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7713 multiclass avx2_pmovmask<string OpcodeStr,
7714 Intrinsic IntLd128, Intrinsic IntLd256,
7715 Intrinsic IntSt128, Intrinsic IntSt256> {
7716 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7717 (ins VR128:$src1, i128mem:$src2),
7718 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7719 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7720 VEX_4V, Sched<[WriteVecMaskedLoad]>;
7721 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7722 (ins VR256:$src1, i256mem:$src2),
7723 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7724 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7725 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
7726 def mr : AVX28I<0x8e, MRMDestMem, (outs),
7727 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7728 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7729 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7730 VEX_4V, Sched<[WriteVecMaskedStore]>;
7731 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7732 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7733 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7734 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7735 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
7738 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7739 int_x86_avx2_maskload_d,
7740 int_x86_avx2_maskload_d_256,
7741 int_x86_avx2_maskstore_d,
7742 int_x86_avx2_maskstore_d_256>;
7743 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7744 int_x86_avx2_maskload_q,
7745 int_x86_avx2_maskload_q_256,
7746 int_x86_avx2_maskstore_q,
7747 int_x86_avx2_maskstore_q_256>, VEX_W;
7749 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7752 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7753 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7755 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7756 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7757 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7758 (VT immAllZerosV))),
7759 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7761 let Predicates = [HasAVX] in {
7762 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7763 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7764 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7765 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7767 let Predicates = [HasAVX1Only] in {
7768 // load/store i32/i64 not supported use ps/pd version
7769 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7770 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7771 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7772 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
7774 let Predicates = [HasAVX2] in {
7775 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7776 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7777 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7778 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7781 //===----------------------------------------------------------------------===//
7782 // SubVector Broadcasts
7783 // Provide fallback in case the load node that is used in the patterns above
7784 // is used by additional users, which prevents the pattern selection.
7786 let Predicates = [HasAVX, NoVLX] in {
7787 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
7788 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7789 (v2f64 VR128:$src), 1)>;
7790 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
7791 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7792 (v4f32 VR128:$src), 1)>;
7795 // NOTE: We're using FP instructions here, but execution domain fixing can
7796 // convert to integer when profitable.
7797 let Predicates = [HasAVX, NoVLX] in {
7798 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
7799 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7800 (v2i64 VR128:$src), 1)>;
7801 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
7802 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7803 (v4i32 VR128:$src), 1)>;
7804 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
7805 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7806 (v8i16 VR128:$src), 1)>;
7807 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
7808 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7809 (v16i8 VR128:$src), 1)>;
7812 //===----------------------------------------------------------------------===//
7813 // Variable Bit Shifts
7815 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7816 ValueType vt128, ValueType vt256> {
7817 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7818 (ins VR128:$src1, VR128:$src2),
7819 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7821 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7822 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
7823 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7824 (ins VR128:$src1, i128mem:$src2),
7825 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7827 (vt128 (OpNode VR128:$src1,
7828 (vt128 (load addr:$src2)))))]>,
7829 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7830 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
7831 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7832 (ins VR256:$src1, VR256:$src2),
7833 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7835 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7836 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
7837 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7838 (ins VR256:$src1, i256mem:$src2),
7839 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7841 (vt256 (OpNode VR256:$src1,
7842 (vt256 (load addr:$src2)))))]>,
7843 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7844 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
7847 let Predicates = [HasAVX2, NoVLX] in {
7848 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
7849 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
7850 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
7851 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
7852 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
7855 //===----------------------------------------------------------------------===//
7856 // VGATHER - GATHER Operations
7858 // FIXME: Improve scheduling of gather instructions.
7859 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
7860 ValueType VTy, PatFrag GatherNode128,
7861 PatFrag GatherNode256, RegisterClass RC256,
7862 X86MemOperand memop128, X86MemOperand memop256,
7863 ValueType MTx = VTx, ValueType MTy = VTy> {
7864 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
7865 (ins VR128:$src1, memop128:$src2, VR128:$mask),
7866 !strconcat(OpcodeStr,
7867 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7868 [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
7869 (GatherNode128 VR128:$src1, VR128:$mask,
7870 vectoraddr:$src2))]>,
7871 VEX, Sched<[WriteLoad]>;
7872 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
7873 (ins RC256:$src1, memop256:$src2, RC256:$mask),
7874 !strconcat(OpcodeStr,
7875 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7876 [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
7877 (GatherNode256 RC256:$src1, RC256:$mask,
7878 vectoraddr:$src2))]>,
7879 VEX, VEX_L, Sched<[WriteLoad]>;
7882 let Predicates = [HasAVX2] in {
7883 let mayLoad = 1, hasSideEffects = 0, Constraints
7884 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
7886 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
7887 mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
7888 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
7889 mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
7890 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
7891 mgatherv8i32, VR256, vx128mem, vy256mem>;
7892 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
7893 mgatherv4i64, VR128, vx64mem, vy128mem>;
7895 let ExeDomain = SSEPackedDouble in {
7896 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
7897 mgatherv4i32, VR256, vx128mem, vx256mem,
7898 v2i64, v4i64>, VEX_W;
7899 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
7900 mgatherv4i64, VR256, vx128mem, vy256mem,
7901 v2i64, v4i64>, VEX_W;
7904 let ExeDomain = SSEPackedSingle in {
7905 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
7906 mgatherv8i32, VR256, vx128mem, vy256mem,
7908 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
7909 mgatherv4i64, VR128, vx64mem, vy128mem,
7915 //===----------------------------------------------------------------------===//
7916 // GFNI instructions
7917 //===----------------------------------------------------------------------===//
7919 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
7920 RegisterClass RC, PatFrag MemOpFrag,
7921 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7922 let ExeDomain = SSEPackedInt,
7923 AsmString = !if(Is2Addr,
7924 OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
7925 OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
7926 let isCommutable = 1 in
7927 def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
7928 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
7929 Sched<[SchedWriteVecALU.XMM]>, T8PD;
7931 def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
7932 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
7933 (MemOpFrag addr:$src2))))]>,
7934 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
7938 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
7939 SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
7940 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7941 let AsmString = !if(Is2Addr,
7942 OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7943 OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
7944 def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
7945 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
7946 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
7947 SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
7948 def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
7949 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
7950 [(set RC:$dst, (OpVT (OpNode RC:$src1,
7951 (MemOpFrag addr:$src2),
7952 timm:$src3)))], SSEPackedInt>,
7953 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
7957 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
7958 let Constraints = "$src1 = $dst",
7959 Predicates = [HasGFNI, UseSSE2] in
7960 defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
7961 VR128, load, i128mem, 1>;
7962 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7963 defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
7964 load, i128mem>, VEX_4V, VEX_W;
7965 defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
7966 load, i256mem>, VEX_4V, VEX_L, VEX_W;
7971 let Constraints = "$src1 = $dst",
7972 Predicates = [HasGFNI, UseSSE2] in
7973 defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
7975 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7976 defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
7978 defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
7979 i256mem>, VEX_4V, VEX_L;
7981 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
7982 let isCommutable = 0 in {
7983 defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
7984 X86GF2P8affineinvqb>, TAPD;
7985 defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
7986 X86GF2P8affineqb>, TAPD;