1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions,
10 // and properties of the instructions which are needed for code generation,
11 // machine code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
24 let isCodeGenOnly = 1 in {
25 let isCommutable = 1 in {
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
42 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
43 multiclass sse12_fp_scalar_int<bits<8> opc,
44 SDPatternOperator OpNode, RegisterClass RC,
45 ValueType VT, string asm, Operand memopr,
46 PatFrags mem_frags, Domain d,
47 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48 let hasSideEffects = 0 in {
49 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
51 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
56 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
58 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60 [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
61 Sched<[sched.Folded, sched.ReadAfterFold]>;
65 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
66 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
67 RegisterClass RC, ValueType vt,
68 X86MemOperand x86memop, PatFrag mem_frag,
69 Domain d, X86FoldableSchedWrite sched,
71 let isCommutable = 1 in
72 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
74 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
79 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
81 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
85 Sched<[sched.Folded, sched.ReadAfterFold]>;
88 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
89 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90 string OpcodeStr, X86MemOperand x86memop,
91 X86FoldableSchedWrite sched,
92 list<dag> pat_rr, list<dag> pat_rm,
94 let isCommutable = 1, hasSideEffects = 0 in
95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
101 let hasSideEffects = 0, mayLoad = 1 in
102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
107 Sched<[sched.Folded, sched.ReadAfterFold]>;
111 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112 // This is expanded by ExpandPostRAPseudos.
113 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114 isPseudo = 1, SchedRW = [WriteZero] in {
115 def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
116 [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
117 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
118 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
119 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
120 [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
121 def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
122 [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
125 //===----------------------------------------------------------------------===//
126 // AVX & SSE - Zero/One Vectors
127 //===----------------------------------------------------------------------===//
129 // Alias instruction that maps zero vector to pxor / xorp* for sse.
130 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
131 // swizzled by ExecutionDomainFix to pxor.
132 // We set canFoldAsLoad because this can be converted to a constant-pool
133 // load of an all-zeros value if folding it would be beneficial.
134 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
135 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
136 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
137 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
140 let Predicates = [NoAVX512] in {
141 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
142 def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
143 def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
144 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
145 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
146 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
150 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
151 // and doesn't need it because on sandy bridge the register is set to zero
152 // at the rename stage without using any execution unit, so SET0PSY
153 // and SET0PDY can be used for vector int instructions without penalty
154 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
155 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
156 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
157 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
160 let Predicates = [NoAVX512] in {
161 def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
162 def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
163 def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
164 def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
165 def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
166 def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
169 // We set canFoldAsLoad because this can be converted to a constant-pool
170 // load of an all-ones value if folding it would be beneficial.
171 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
172 isPseudo = 1, SchedRW = [WriteZero] in {
173 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
174 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
175 let Predicates = [HasAVX1Only, OptForMinSize] in {
176 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
177 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
179 let Predicates = [HasAVX2] in
180 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
181 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
184 //===----------------------------------------------------------------------===//
185 // SSE 1 & 2 - Move FP Scalar Instructions
187 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
188 // register copies because it's a partial register update; Register-to-register
189 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
190 // that the insert be implementable in terms of a copy, and just mentioned, we
191 // don't use movss/movsd for copies.
192 //===----------------------------------------------------------------------===//
194 multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
195 string asm_opr, Domain d> {
196 let isCommutable = 1 in
197 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
198 (ins VR128:$src1, VR128:$src2),
199 !strconcat(base_opc, asm_opr),
200 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
201 Sched<[SchedWriteFShuffle.XMM]>;
203 // For the disassembler
204 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
205 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
206 (ins VR128:$src1, VR128:$src2),
207 !strconcat(base_opc, asm_opr), []>,
208 Sched<[SchedWriteFShuffle.XMM]>;
211 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
212 X86MemOperand x86memop, string OpcodeStr,
213 Domain d, Predicate pred> {
215 let Predicates = [UseAVX, OptForSize] in
216 defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
217 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
218 VEX_4V, VEX_LIG, WIG;
220 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
221 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
222 [(store RC:$src, addr:$dst)], d>,
223 VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
225 let Constraints = "$src1 = $dst" in {
226 let Predicates = [pred, NoSSE41_Or_OptForSize] in
227 defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
228 "\t{$src2, $dst|$dst, $src2}", d>;
231 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
232 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
233 [(store RC:$src, addr:$dst)], d>,
234 Sched<[WriteFStore]>;
236 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
237 (!cast<Instruction>("V"#NAME#"rr_REV")
238 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
239 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
240 (!cast<Instruction>(NAME#"rr_REV")
241 VR128:$dst, VR128:$src2), 0>;
244 // Loading from memory automatically zeroing upper bits.
245 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
246 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
248 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
249 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
250 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
251 VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
252 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
253 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
254 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
257 // _alt version uses FR32/FR64 register class.
258 let isCodeGenOnly = 1 in {
259 def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
260 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
261 [(set RC:$dst, (mem_pat addr:$src))], d>,
262 VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
263 def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
264 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
265 [(set RC:$dst, (mem_pat addr:$src))], d>,
270 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
271 SSEPackedSingle, UseSSE1>, XS;
272 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
273 SSEPackedDouble, UseSSE2>, XD;
275 let canFoldAsLoad = 1, isReMaterializable = 1 in {
276 defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
277 SSEPackedSingle>, XS;
278 defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
279 SSEPackedDouble>, XD;
283 let Predicates = [UseAVX] in {
284 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
285 (VMOVSSrm addr:$src)>;
286 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
287 (VMOVSDrm addr:$src)>;
289 // Represent the same patterns above but in the form they appear for
291 def : Pat<(v8f32 (X86vzload32 addr:$src)),
292 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
293 def : Pat<(v4f64 (X86vzload64 addr:$src)),
294 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
297 let Predicates = [UseAVX, OptForSize] in {
298 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
299 // MOVSS to the lower bits.
300 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
301 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
302 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
303 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
305 // Move low f32 and clear high bits.
306 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
307 (SUBREG_TO_REG (i32 0),
308 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
309 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
310 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
311 (SUBREG_TO_REG (i32 0),
312 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
313 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
316 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
317 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
318 // MOVSS to the lower bits.
319 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
320 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
321 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
322 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
325 let Predicates = [UseSSE2] in
326 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
327 (MOVSDrm addr:$src)>;
329 let Predicates = [UseSSE1] in
330 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
331 (MOVSSrm addr:$src)>;
333 //===----------------------------------------------------------------------===//
334 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
335 //===----------------------------------------------------------------------===//
337 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
338 X86MemOperand x86memop, PatFrag ld_frag,
339 string asm, Domain d,
340 X86SchedWriteMoveLS sched> {
341 let hasSideEffects = 0, isMoveReg = 1 in
342 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
343 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
345 let canFoldAsLoad = 1, isReMaterializable = 1 in
346 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
347 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
348 [(set RC:$dst, (ld_frag addr:$src))], d>,
352 let Predicates = [HasAVX, NoVLX] in {
353 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
354 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
356 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
357 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
359 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
360 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
362 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
363 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
366 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
367 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
369 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
370 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
372 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
373 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
375 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
376 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
380 let Predicates = [UseSSE1] in {
381 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
382 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
384 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
385 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
388 let Predicates = [UseSSE2] in {
389 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
390 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
392 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
393 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
397 let Predicates = [HasAVX, NoVLX] in {
398 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
399 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
400 "movaps\t{$src, $dst|$dst, $src}",
401 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
403 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
404 "movapd\t{$src, $dst|$dst, $src}",
405 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
407 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
408 "movups\t{$src, $dst|$dst, $src}",
409 [(store (v4f32 VR128:$src), addr:$dst)]>,
411 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
412 "movupd\t{$src, $dst|$dst, $src}",
413 [(store (v2f64 VR128:$src), addr:$dst)]>,
417 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
418 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
419 "movaps\t{$src, $dst|$dst, $src}",
420 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
422 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
423 "movapd\t{$src, $dst|$dst, $src}",
424 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
426 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
427 "movups\t{$src, $dst|$dst, $src}",
428 [(store (v8f32 VR256:$src), addr:$dst)]>,
430 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
431 "movupd\t{$src, $dst|$dst, $src}",
432 [(store (v4f64 VR256:$src), addr:$dst)]>,
438 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
440 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
441 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
443 "movaps\t{$src, $dst|$dst, $src}", []>,
445 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
447 "movapd\t{$src, $dst|$dst, $src}", []>,
449 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
451 "movups\t{$src, $dst|$dst, $src}", []>,
453 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
455 "movupd\t{$src, $dst|$dst, $src}", []>,
459 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
460 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
462 "movaps\t{$src, $dst|$dst, $src}", []>,
464 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
466 "movapd\t{$src, $dst|$dst, $src}", []>,
468 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
470 "movups\t{$src, $dst|$dst, $src}", []>,
472 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
474 "movupd\t{$src, $dst|$dst, $src}", []>,
479 // Reversed version with ".s" suffix for GAS compatibility.
480 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
481 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
482 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
483 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
484 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
485 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
486 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
487 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
488 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
489 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
490 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
491 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
492 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
493 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
494 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
495 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
497 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
498 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
499 "movaps\t{$src, $dst|$dst, $src}",
500 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
501 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
502 "movapd\t{$src, $dst|$dst, $src}",
503 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
504 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
505 "movups\t{$src, $dst|$dst, $src}",
506 [(store (v4f32 VR128:$src), addr:$dst)]>;
507 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
508 "movupd\t{$src, $dst|$dst, $src}",
509 [(store (v2f64 VR128:$src), addr:$dst)]>;
513 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
514 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
515 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
516 "movaps\t{$src, $dst|$dst, $src}", []>;
517 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
518 "movapd\t{$src, $dst|$dst, $src}", []>;
519 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520 "movups\t{$src, $dst|$dst, $src}", []>;
521 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
522 "movupd\t{$src, $dst|$dst, $src}", []>;
525 // Reversed version with ".s" suffix for GAS compatibility.
526 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
527 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
528 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
529 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
530 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
531 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
532 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
533 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
535 let Predicates = [HasAVX, NoVLX] in {
536 // 256-bit load/store need to use floating point load/store in case we don't
537 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
538 // available and changing the domain is beneficial.
539 def : Pat<(alignedloadv4i64 addr:$src),
540 (VMOVAPSYrm addr:$src)>;
541 def : Pat<(alignedloadv8i32 addr:$src),
542 (VMOVAPSYrm addr:$src)>;
543 def : Pat<(alignedloadv16i16 addr:$src),
544 (VMOVAPSYrm addr:$src)>;
545 def : Pat<(alignedloadv32i8 addr:$src),
546 (VMOVAPSYrm addr:$src)>;
547 def : Pat<(loadv4i64 addr:$src),
548 (VMOVUPSYrm addr:$src)>;
549 def : Pat<(loadv8i32 addr:$src),
550 (VMOVUPSYrm addr:$src)>;
551 def : Pat<(loadv16i16 addr:$src),
552 (VMOVUPSYrm addr:$src)>;
553 def : Pat<(loadv32i8 addr:$src),
554 (VMOVUPSYrm addr:$src)>;
556 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
557 (VMOVAPSYmr addr:$dst, VR256:$src)>;
558 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
559 (VMOVAPSYmr addr:$dst, VR256:$src)>;
560 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
561 (VMOVAPSYmr addr:$dst, VR256:$src)>;
562 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
563 (VMOVAPSYmr addr:$dst, VR256:$src)>;
564 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
565 (VMOVUPSYmr addr:$dst, VR256:$src)>;
566 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
567 (VMOVUPSYmr addr:$dst, VR256:$src)>;
568 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
569 (VMOVUPSYmr addr:$dst, VR256:$src)>;
570 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
571 (VMOVUPSYmr addr:$dst, VR256:$src)>;
573 def : Pat<(alignedloadv8f16 addr:$src),
574 (VMOVAPSrm addr:$src)>;
575 def : Pat<(alignedloadv8bf16 addr:$src),
576 (VMOVAPSrm addr:$src)>;
577 def : Pat<(loadv8f16 addr:$src),
578 (VMOVUPSrm addr:$src)>;
579 def : Pat<(loadv8bf16 addr:$src),
580 (VMOVUPSrm addr:$src)>;
581 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
582 (VMOVAPSmr addr:$dst, VR128:$src)>;
583 def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
584 (VMOVAPSmr addr:$dst, VR128:$src)>;
585 def : Pat<(store (v8f16 VR128:$src), addr:$dst),
586 (VMOVUPSmr addr:$dst, VR128:$src)>;
587 def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
588 (VMOVUPSmr addr:$dst, VR128:$src)>;
590 def : Pat<(alignedloadv16f16 addr:$src),
591 (VMOVAPSYrm addr:$src)>;
592 def : Pat<(alignedloadv16bf16 addr:$src),
593 (VMOVAPSYrm addr:$src)>;
594 def : Pat<(loadv16f16 addr:$src),
595 (VMOVUPSYrm addr:$src)>;
596 def : Pat<(loadv16bf16 addr:$src),
597 (VMOVUPSYrm addr:$src)>;
598 def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
599 (VMOVAPSYmr addr:$dst, VR256:$src)>;
600 def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
601 (VMOVAPSYmr addr:$dst, VR256:$src)>;
602 def : Pat<(store (v16f16 VR256:$src), addr:$dst),
603 (VMOVUPSYmr addr:$dst, VR256:$src)>;
604 def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
605 (VMOVUPSYmr addr:$dst, VR256:$src)>;
608 // Use movaps / movups for SSE integer load / store (one byte shorter).
609 // The instructions selected below are then converted to MOVDQA/MOVDQU
610 // during the SSE domain pass.
611 let Predicates = [UseSSE1] in {
612 def : Pat<(alignedloadv2i64 addr:$src),
613 (MOVAPSrm addr:$src)>;
614 def : Pat<(alignedloadv4i32 addr:$src),
615 (MOVAPSrm addr:$src)>;
616 def : Pat<(alignedloadv8i16 addr:$src),
617 (MOVAPSrm addr:$src)>;
618 def : Pat<(alignedloadv16i8 addr:$src),
619 (MOVAPSrm addr:$src)>;
620 def : Pat<(loadv2i64 addr:$src),
621 (MOVUPSrm addr:$src)>;
622 def : Pat<(loadv4i32 addr:$src),
623 (MOVUPSrm addr:$src)>;
624 def : Pat<(loadv8i16 addr:$src),
625 (MOVUPSrm addr:$src)>;
626 def : Pat<(loadv16i8 addr:$src),
627 (MOVUPSrm addr:$src)>;
629 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
630 (MOVAPSmr addr:$dst, VR128:$src)>;
631 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
632 (MOVAPSmr addr:$dst, VR128:$src)>;
633 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
634 (MOVAPSmr addr:$dst, VR128:$src)>;
635 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
636 (MOVAPSmr addr:$dst, VR128:$src)>;
637 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
638 (MOVUPSmr addr:$dst, VR128:$src)>;
639 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
640 (MOVUPSmr addr:$dst, VR128:$src)>;
641 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
642 (MOVUPSmr addr:$dst, VR128:$src)>;
643 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
644 (MOVUPSmr addr:$dst, VR128:$src)>;
647 let Predicates = [UseSSE2] in {
648 def : Pat<(alignedloadv8f16 addr:$src),
649 (MOVAPSrm addr:$src)>;
650 def : Pat<(loadv8f16 addr:$src),
651 (MOVUPSrm addr:$src)>;
652 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
653 (MOVAPSmr addr:$dst, VR128:$src)>;
654 def : Pat<(store (v8f16 VR128:$src), addr:$dst),
655 (MOVUPSmr addr:$dst, VR128:$src)>;
658 //===----------------------------------------------------------------------===//
659 // SSE 1 & 2 - Move Low packed FP Instructions
660 //===----------------------------------------------------------------------===//
662 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
663 string base_opc, string asm_opr> {
664 // No pattern as they need be special cased between high and low.
665 let hasSideEffects = 0, mayLoad = 1 in
666 def PSrm : PI<opc, MRMSrcMem,
667 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
668 !strconcat(base_opc, "s", asm_opr),
669 [], SSEPackedSingle>, PS,
670 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
672 def PDrm : PI<opc, MRMSrcMem,
673 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
674 !strconcat(base_opc, "d", asm_opr),
675 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
676 (scalar_to_vector (loadf64 addr:$src2)))))],
677 SSEPackedDouble>, PD,
678 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
681 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
683 let Predicates = [UseAVX] in
684 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
685 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
688 let Constraints = "$src1 = $dst" in
689 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
690 "\t{$src2, $dst|$dst, $src2}">;
693 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
695 let SchedRW = [WriteFStore] in {
696 let Predicates = [UseAVX] in {
697 let mayStore = 1, hasSideEffects = 0 in
698 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
699 "movlps\t{$src, $dst|$dst, $src}",
702 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703 "movlpd\t{$src, $dst|$dst, $src}",
704 [(store (f64 (extractelt (v2f64 VR128:$src),
705 (iPTR 0))), addr:$dst)]>,
708 let mayStore = 1, hasSideEffects = 0 in
709 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
710 "movlps\t{$src, $dst|$dst, $src}",
712 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713 "movlpd\t{$src, $dst|$dst, $src}",
714 [(store (f64 (extractelt (v2f64 VR128:$src),
715 (iPTR 0))), addr:$dst)]>;
718 let Predicates = [UseSSE1] in {
719 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
720 // end up with a movsd or blend instead of shufp.
721 // No need for aligned load, we're only loading 64-bits.
722 def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
724 (MOVLPSrm VR128:$src1, addr:$src2)>;
725 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
726 (MOVLPSrm VR128:$src1, addr:$src2)>;
728 def : Pat<(v4f32 (X86vzload64 addr:$src)),
729 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
730 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
731 (MOVLPSmr addr:$dst, VR128:$src)>;
734 //===----------------------------------------------------------------------===//
735 // SSE 1 & 2 - Move Hi packed FP Instructions
736 //===----------------------------------------------------------------------===//
738 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
740 let SchedRW = [WriteFStore] in {
741 // v2f64 extract element 1 is always custom lowered to unpack high to low
742 // and extract element 0 so the non-store version isn't too horrible.
743 let Predicates = [UseAVX] in {
744 let mayStore = 1, hasSideEffects = 0 in
745 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
746 "movhps\t{$src, $dst|$dst, $src}",
748 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
749 "movhpd\t{$src, $dst|$dst, $src}",
750 [(store (f64 (extractelt
751 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
752 (iPTR 0))), addr:$dst)]>, VEX, WIG;
754 let mayStore = 1, hasSideEffects = 0 in
755 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
756 "movhps\t{$src, $dst|$dst, $src}",
758 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
759 "movhpd\t{$src, $dst|$dst, $src}",
760 [(store (f64 (extractelt
761 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
762 (iPTR 0))), addr:$dst)]>;
765 let Predicates = [UseAVX] in {
767 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
768 (VMOVHPDrm VR128:$src1, addr:$src2)>;
770 def : Pat<(store (f64 (extractelt
771 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
772 (iPTR 0))), addr:$dst),
773 (VMOVHPDmr addr:$dst, VR128:$src)>;
776 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
777 (VMOVLPDrm VR128:$src1, addr:$src2)>;
780 let Predicates = [UseSSE1] in {
781 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
782 // end up with a movsd or blend instead of shufp.
783 // No need for aligned load, we're only loading 64-bits.
784 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
785 (MOVHPSrm VR128:$src1, addr:$src2)>;
786 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
787 (MOVHPSrm VR128:$src1, addr:$src2)>;
789 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
791 (MOVHPSmr addr:$dst, VR128:$src)>;
794 let Predicates = [UseSSE2] in {
796 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
797 (MOVHPDrm VR128:$src1, addr:$src2)>;
799 def : Pat<(store (f64 (extractelt
800 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
801 (iPTR 0))), addr:$dst),
802 (MOVHPDmr addr:$dst, VR128:$src)>;
805 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
806 (MOVLPDrm VR128:$src1, addr:$src2)>;
809 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
810 // Use MOVLPD to load into the low bits from a full vector unless we can use
812 def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
813 (MOVLPDrm VR128:$src1, addr:$src2)>;
816 //===----------------------------------------------------------------------===//
817 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
818 //===----------------------------------------------------------------------===//
820 let Predicates = [UseAVX] in {
821 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
822 (ins VR128:$src1, VR128:$src2),
823 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
825 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
826 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
827 let isCommutable = 1 in
828 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
829 (ins VR128:$src1, VR128:$src2),
830 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
832 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
833 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
835 let Constraints = "$src1 = $dst" in {
836 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
837 (ins VR128:$src1, VR128:$src2),
838 "movlhps\t{$src2, $dst|$dst, $src2}",
840 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
841 Sched<[SchedWriteFShuffle.XMM]>;
842 let isCommutable = 1 in
843 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
844 (ins VR128:$src1, VR128:$src2),
845 "movhlps\t{$src2, $dst|$dst, $src2}",
847 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
848 Sched<[SchedWriteFShuffle.XMM]>;
851 //===----------------------------------------------------------------------===//
852 // SSE 1 & 2 - Conversion Instructions
853 //===----------------------------------------------------------------------===//
855 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
856 SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
857 string asm, string mem, X86FoldableSchedWrite sched,
859 SchedRead Int2Fpu = ReadDefault> {
860 let ExeDomain = d in {
861 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
862 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
863 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
864 Sched<[sched, Int2Fpu]>;
865 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
866 mem#"\t{$src, $dst|$dst, $src}",
867 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
868 Sched<[sched.Folded]>;
872 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
873 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
874 string asm, Domain d, X86FoldableSchedWrite sched> {
875 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
876 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
877 [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
880 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
881 [(set RC:$dst, (DstTy (any_sint_to_fp
882 (SrcTy (ld_frag addr:$src)))))], d>,
883 Sched<[sched.Folded]>;
887 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
888 X86MemOperand x86memop, string asm, string mem,
889 X86FoldableSchedWrite sched, Domain d> {
890 let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
891 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
892 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
893 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
895 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
896 (ins DstRC:$src1, x86memop:$src),
897 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
898 Sched<[sched.Folded, sched.ReadAfterFold]>;
899 } // hasSideEffects = 0
902 let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
903 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
904 "cvttss2si", "cvttss2si",
905 WriteCvtSS2I, SSEPackedSingle>,
907 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
908 "cvttss2si", "cvttss2si",
909 WriteCvtSS2I, SSEPackedSingle>,
910 XS, VEX, REX_W, VEX_LIG;
911 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
912 "cvttsd2si", "cvttsd2si",
913 WriteCvtSD2I, SSEPackedDouble>,
915 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
916 "cvttsd2si", "cvttsd2si",
917 WriteCvtSD2I, SSEPackedDouble>,
918 XD, VEX, REX_W, VEX_LIG;
920 defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
921 "cvtss2si", "cvtss2si",
922 WriteCvtSS2I, SSEPackedSingle>,
924 defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
925 "cvtss2si", "cvtss2si",
926 WriteCvtSS2I, SSEPackedSingle>,
927 XS, VEX, REX_W, VEX_LIG;
928 defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
929 "cvtsd2si", "cvtsd2si",
930 WriteCvtSD2I, SSEPackedDouble>,
932 defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
933 "cvtsd2si", "cvtsd2si",
934 WriteCvtSD2I, SSEPackedDouble>,
935 XD, VEX, REX_W, VEX_LIG;
938 // The assembler can recognize rr 64-bit instructions by seeing a rxx
939 // register, but the same isn't true when only using memory operands,
940 // provide other assembly "l" and "q" forms to address this explicitly
941 // where appropriate to do so.
942 let isCodeGenOnly = 1 in {
943 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
944 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
946 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
947 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
948 REX_W, VEX_LIG, SIMD_EXC;
949 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
950 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
952 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
953 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
954 REX_W, VEX_LIG, SIMD_EXC;
955 } // isCodeGenOnly = 1
957 let Predicates = [UseAVX] in {
958 def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
959 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
960 def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
961 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
962 def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
963 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
964 def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
965 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
967 def : Pat<(f32 (any_sint_to_fp GR32:$src)),
968 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
969 def : Pat<(f32 (any_sint_to_fp GR64:$src)),
970 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
971 def : Pat<(f64 (any_sint_to_fp GR32:$src)),
972 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
973 def : Pat<(f64 (any_sint_to_fp GR64:$src)),
974 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
976 def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
977 def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
979 def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
980 def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
983 let isCodeGenOnly = 1 in {
984 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
985 "cvttss2si", "cvttss2si",
986 WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
987 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
988 "cvttss2si", "cvttss2si",
989 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
990 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
991 "cvttsd2si", "cvttsd2si",
992 WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
993 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
994 "cvttsd2si", "cvttsd2si",
995 WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
997 defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
998 "cvtss2si", "cvtss2si",
999 WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
1000 defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
1001 "cvtss2si", "cvtss2si",
1002 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
1003 defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
1004 "cvtsd2si", "cvtsd2si",
1005 WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
1006 defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
1007 "cvtsd2si", "cvtsd2si",
1008 WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
1010 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
1011 "cvtsi2ss", "cvtsi2ss{l}",
1012 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
1013 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
1014 "cvtsi2ss", "cvtsi2ss{q}",
1015 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
1016 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
1017 "cvtsi2sd", "cvtsi2sd{l}",
1018 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
1019 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
1020 "cvtsi2sd", "cvtsi2sd{q}",
1021 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
1022 } // isCodeGenOnly = 1
1024 let Predicates = [UseSSE1] in {
1025 def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
1026 def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
1029 let Predicates = [UseSSE2] in {
1030 def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
1031 def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
1034 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
1035 // and/or XMM operand(s).
1037 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1038 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1039 Operand memop, PatFrags mem_frags, string asm,
1040 X86FoldableSchedWrite sched, Domain d> {
1041 let ExeDomain = d in {
1042 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1043 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1044 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1046 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1047 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1048 [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1049 Sched<[sched.Folded]>;
1053 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1054 RegisterClass DstRC, X86MemOperand x86memop,
1055 string asm, string mem, X86FoldableSchedWrite sched,
1056 Domain d, bit Is2Addr = 1> {
1057 let hasSideEffects = 0, ExeDomain = d in {
1058 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1060 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1061 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1062 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1064 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1065 (ins DstRC:$src1, x86memop:$src2),
1067 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
1068 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
1069 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1073 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1074 let Predicates = [UseAVX] in {
1075 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1076 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1077 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1078 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1079 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1080 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, REX_W, VEX_LIG;
1082 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1083 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1084 SSEPackedDouble>, XD;
1085 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1086 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1087 SSEPackedDouble>, XD, REX_W;
1090 let Predicates = [UseAVX] in {
1091 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1092 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1093 XS, VEX_4V, VEX_LIG, SIMD_EXC;
1094 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1095 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1096 XS, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
1097 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1098 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1099 XD, VEX_4V, VEX_LIG;
1100 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1101 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1102 XD, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
1104 let Constraints = "$src1 = $dst" in {
1105 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1106 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1108 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1109 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1110 XS, REX_W, SIMD_EXC;
1111 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1112 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1114 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1115 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1116 XD, REX_W, SIMD_EXC;
1119 def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1120 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1121 def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1122 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1123 def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1124 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1125 def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1126 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1128 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1129 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1130 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1131 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1133 def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1134 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1135 def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1136 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1137 def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1138 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1139 def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1140 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1142 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1143 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1144 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1145 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1149 // Aliases for intrinsics
1150 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1151 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1152 ssmem, sse_load_f32, "cvttss2si",
1153 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1154 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1155 X86cvtts2Int, ssmem, sse_load_f32,
1156 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1157 XS, VEX, VEX_LIG, REX_W;
1158 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1159 sdmem, sse_load_f64, "cvttsd2si",
1160 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1161 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1162 X86cvtts2Int, sdmem, sse_load_f64,
1163 "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
1164 XD, VEX, VEX_LIG, REX_W;
1166 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1167 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1168 ssmem, sse_load_f32, "cvttss2si",
1169 WriteCvtSS2I, SSEPackedSingle>, XS;
1170 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1171 X86cvtts2Int, ssmem, sse_load_f32,
1172 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1174 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1175 sdmem, sse_load_f64, "cvttsd2si",
1176 WriteCvtSD2I, SSEPackedDouble>, XD;
1177 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1178 X86cvtts2Int, sdmem, sse_load_f64,
1179 "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1183 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1184 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1185 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1186 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1187 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1188 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1189 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1190 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1191 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1192 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1193 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1194 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1195 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1196 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1197 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1198 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1200 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1201 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1202 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1203 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1204 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1205 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1206 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1207 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1208 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1209 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1210 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1211 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1212 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1213 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1214 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1215 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1217 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1218 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1219 ssmem, sse_load_f32, "cvtss2si",
1220 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1221 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1222 ssmem, sse_load_f32, "cvtss2si",
1223 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, REX_W, VEX_LIG;
1225 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1226 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1227 ssmem, sse_load_f32, "cvtss2si",
1228 WriteCvtSS2I, SSEPackedSingle>, XS;
1229 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1230 ssmem, sse_load_f32, "cvtss2si",
1231 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1233 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1234 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1235 SSEPackedSingle, WriteCvtI2PS>,
1236 PS, VEX, Requires<[HasAVX, NoVLX]>, WIG;
1237 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1238 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1239 SSEPackedSingle, WriteCvtI2PSY>,
1240 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;
1242 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1243 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1244 SSEPackedSingle, WriteCvtI2PS>,
1245 PS, Requires<[UseSSE2]>;
1249 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1250 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1251 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1252 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1253 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1254 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1255 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1256 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1257 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1258 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1259 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1260 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1261 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1262 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1263 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1264 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1267 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1268 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1269 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1270 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1271 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1272 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1273 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1274 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1275 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1276 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1277 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1278 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1279 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1280 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1281 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1282 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1286 // Convert scalar double to scalar single
1287 let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
1288 ExeDomain = SSEPackedSingle in {
1289 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1290 (ins FR32:$src1, FR64:$src2),
1291 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1292 VEX_4V, VEX_LIG, WIG,
1293 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1295 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1296 (ins FR32:$src1, f64mem:$src2),
1297 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1298 XD, VEX_4V, VEX_LIG, WIG,
1299 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1302 def : Pat<(f32 (any_fpround FR64:$src)),
1303 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1306 let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1307 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1308 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1309 [(set FR32:$dst, (any_fpround FR64:$src))]>,
1310 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1311 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1312 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1313 [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1314 XD, Requires<[UseSSE2, OptForSize]>,
1315 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1318 let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
1319 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1320 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1321 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1323 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1324 XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
1325 Sched<[WriteCvtSD2SS]>;
1326 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1327 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1328 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1330 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1331 XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
1332 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1333 let Constraints = "$src1 = $dst" in {
1334 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1335 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1336 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1338 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1339 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1340 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1341 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1342 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1344 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1345 XD, Requires<[UseSSE2]>,
1346 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1350 // Convert scalar single to scalar double
1351 // SSE2 instructions with XS prefix
1352 let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
1353 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1354 (ins FR64:$src1, FR32:$src2),
1355 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1356 XS, VEX_4V, VEX_LIG, WIG,
1357 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
1359 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1360 (ins FR64:$src1, f32mem:$src2),
1361 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1362 XS, VEX_4V, VEX_LIG, WIG,
1363 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1364 Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1365 } // isCodeGenOnly = 1, hasSideEffects = 0
1367 def : Pat<(f64 (any_fpextend FR32:$src)),
1368 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1369 def : Pat<(any_fpextend (loadf32 addr:$src)),
1370 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1372 let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1373 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1374 "cvtss2sd\t{$src, $dst|$dst, $src}",
1375 [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1376 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1377 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1378 "cvtss2sd\t{$src, $dst|$dst, $src}",
1379 [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1380 XS, Requires<[UseSSE2, OptForSize]>,
1381 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
1382 } // isCodeGenOnly = 1
1384 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1385 ExeDomain = SSEPackedSingle in {
1386 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1387 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1388 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1389 []>, XS, VEX_4V, VEX_LIG, WIG,
1390 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1392 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1393 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1394 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1395 []>, XS, VEX_4V, VEX_LIG, WIG, Requires<[HasAVX]>,
1396 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1397 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1398 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1399 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1400 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1401 []>, XS, Requires<[UseSSE2]>,
1402 Sched<[WriteCvtSS2SD]>;
1404 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1405 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1406 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1407 []>, XS, Requires<[UseSSE2]>,
1408 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1410 } // hasSideEffects = 0
1412 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1413 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1414 // vmovs{s,d} instructions
1415 let Predicates = [UseAVX] in {
1416 def : Pat<(v4f32 (X86Movss
1418 (v4f32 (scalar_to_vector
1419 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1420 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1422 def : Pat<(v2f64 (X86Movsd
1424 (v2f64 (scalar_to_vector
1425 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1426 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1428 def : Pat<(v4f32 (X86Movss
1430 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1431 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1433 def : Pat<(v4f32 (X86Movss
1435 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1436 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1438 def : Pat<(v4f32 (X86Movss
1440 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1441 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1443 def : Pat<(v4f32 (X86Movss
1445 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1446 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1448 def : Pat<(v2f64 (X86Movsd
1450 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1451 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1453 def : Pat<(v2f64 (X86Movsd
1455 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1456 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1458 def : Pat<(v2f64 (X86Movsd
1460 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1461 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1463 def : Pat<(v2f64 (X86Movsd
1465 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1466 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1467 } // Predicates = [UseAVX]
1469 let Predicates = [UseSSE2] in {
1470 def : Pat<(v4f32 (X86Movss
1472 (v4f32 (scalar_to_vector
1473 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1474 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1476 def : Pat<(v2f64 (X86Movsd
1478 (v2f64 (scalar_to_vector
1479 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1480 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1482 def : Pat<(v2f64 (X86Movsd
1484 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1485 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1487 def : Pat<(v2f64 (X86Movsd
1489 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1490 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1492 def : Pat<(v2f64 (X86Movsd
1494 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1495 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1497 def : Pat<(v2f64 (X86Movsd
1499 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1500 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1501 } // Predicates = [UseSSE2]
1503 let Predicates = [UseSSE1] in {
1504 def : Pat<(v4f32 (X86Movss
1506 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1507 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1509 def : Pat<(v4f32 (X86Movss
1511 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1512 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1514 def : Pat<(v4f32 (X86Movss
1516 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1517 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1519 def : Pat<(v4f32 (X86Movss
1521 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1522 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1523 } // Predicates = [UseSSE1]
1525 let Predicates = [HasAVX, NoVLX] in {
1526 // Convert packed single/double fp to doubleword
1527 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1528 "cvtps2dq\t{$src, $dst|$dst, $src}",
1529 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1530 VEX, Sched<[WriteCvtPS2I]>, WIG, SIMD_EXC;
1531 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1532 "cvtps2dq\t{$src, $dst|$dst, $src}",
1534 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1535 VEX, Sched<[WriteCvtPS2ILd]>, WIG, SIMD_EXC;
1536 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1537 "cvtps2dq\t{$src, $dst|$dst, $src}",
1539 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1540 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG, SIMD_EXC;
1541 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1542 "cvtps2dq\t{$src, $dst|$dst, $src}",
1544 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1545 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, WIG, SIMD_EXC;
1547 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1548 "cvtps2dq\t{$src, $dst|$dst, $src}",
1549 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1550 Sched<[WriteCvtPS2I]>, SIMD_EXC;
1551 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1552 "cvtps2dq\t{$src, $dst|$dst, $src}",
1554 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1555 Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1558 // Convert Packed Double FP to Packed DW Integers
1559 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1560 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1561 // register, but the same isn't true when using memory operands instead.
1562 // Provide other assembly rr and rm forms to address this explicitly.
1563 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1564 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1566 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1567 VEX, Sched<[WriteCvtPD2I]>, WIG;
1570 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1571 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1573 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1574 Sched<[WriteCvtPD2ILd]>, WIG;
1577 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1578 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1580 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1581 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
1582 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1583 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1585 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1586 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
1589 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1590 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1591 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1592 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1594 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1595 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1597 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1598 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1599 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1600 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1602 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1603 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1605 // Convert with truncation packed single/double fp to doubleword
1606 // SSE2 packed instructions with XS prefix
1607 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1608 let Predicates = [HasAVX, NoVLX] in {
1609 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1610 "cvttps2dq\t{$src, $dst|$dst, $src}",
1612 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1613 VEX, Sched<[WriteCvtPS2I]>, WIG;
1614 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1615 "cvttps2dq\t{$src, $dst|$dst, $src}",
1617 (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1618 VEX, Sched<[WriteCvtPS2ILd]>, WIG;
1619 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1620 "cvttps2dq\t{$src, $dst|$dst, $src}",
1622 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1623 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG;
1624 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1625 "cvttps2dq\t{$src, $dst|$dst, $src}",
1627 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1629 Sched<[WriteCvtPS2IYLd]>, WIG;
1632 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1633 "cvttps2dq\t{$src, $dst|$dst, $src}",
1635 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1636 Sched<[WriteCvtPS2I]>;
1637 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1638 "cvttps2dq\t{$src, $dst|$dst, $src}",
1640 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1641 Sched<[WriteCvtPS2ILd]>;
1644 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1645 // register, but the same isn't true when using memory operands instead.
1646 // Provide other assembly rr and rm forms to address this explicitly.
1647 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1649 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1650 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1652 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1653 VEX, Sched<[WriteCvtPD2I]>, WIG;
1654 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1655 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1657 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1658 VEX, Sched<[WriteCvtPD2ILd]>, WIG;
1661 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1662 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1664 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1665 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
1666 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1667 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1669 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1670 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
1671 } // Predicates = [HasAVX, NoVLX]
1673 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1674 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1675 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1676 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1678 let Predicates = [HasAVX, NoVLX] in {
1679 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1680 (VCVTTPD2DQYrr VR256:$src)>;
1681 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1682 (VCVTTPD2DQYrm addr:$src)>;
1685 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1686 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1688 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1689 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1690 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1691 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1693 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1694 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1696 // Convert packed single to packed double
1697 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1698 // SSE2 instructions without OpSize prefix
1699 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1700 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1701 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1702 PS, VEX, Sched<[WriteCvtPS2PD]>, WIG;
1703 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1704 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1705 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1706 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG;
1707 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1708 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1709 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1710 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG;
1711 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1712 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1713 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1714 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG;
1717 let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1718 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1719 "cvtps2pd\t{$src, $dst|$dst, $src}",
1720 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1721 PS, Sched<[WriteCvtPS2PD]>;
1722 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1723 "cvtps2pd\t{$src, $dst|$dst, $src}",
1724 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1725 PS, Sched<[WriteCvtPS2PD.Folded]>;
1728 // Convert Packed DW Integers to Packed Double FP
1729 let Predicates = [HasAVX, NoVLX] in {
1730 let hasSideEffects = 0, mayLoad = 1 in
1731 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1732 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1734 (v2f64 (X86any_VSintToFP
1736 (v2i64 (scalar_to_vector
1737 (loadi64 addr:$src)))))))]>,
1738 VEX, Sched<[WriteCvtI2PDLd]>, WIG;
1739 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1740 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1742 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1743 VEX, Sched<[WriteCvtI2PD]>, WIG;
1744 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1745 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1747 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1748 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1750 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1751 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1753 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1754 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, WIG;
1757 let hasSideEffects = 0, mayLoad = 1 in
1758 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1759 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1761 (v2f64 (X86any_VSintToFP
1763 (v2i64 (scalar_to_vector
1764 (loadi64 addr:$src)))))))]>,
1765 Sched<[WriteCvtI2PDLd]>;
1766 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1767 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1769 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1770 Sched<[WriteCvtI2PD]>;
1772 // AVX register conversion intrinsics
1773 let Predicates = [HasAVX, NoVLX] in {
1774 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1775 (VCVTDQ2PDrm addr:$src)>;
1776 } // Predicates = [HasAVX, NoVLX]
1778 // SSE2 register conversion intrinsics
1779 let Predicates = [UseSSE2] in {
1780 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1781 (CVTDQ2PDrm addr:$src)>;
1782 } // Predicates = [UseSSE2]
1784 // Convert packed double to packed single
1785 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1786 // register, but the same isn't true when using memory operands instead.
1787 // Provide other assembly rr and rm forms to address this explicitly.
1788 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1790 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1791 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1792 [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1793 VEX, Sched<[WriteCvtPD2PS]>, WIG;
1794 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1795 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1796 [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
1797 VEX, Sched<[WriteCvtPD2PS.Folded]>, WIG;
1799 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1800 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1801 [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
1802 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, WIG;
1803 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1804 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1805 [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
1806 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, WIG;
1807 } // Predicates = [HasAVX, NoVLX]
1809 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1810 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1811 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1812 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1814 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1815 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1816 [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1817 Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1818 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1819 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1820 [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
1821 Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1823 //===----------------------------------------------------------------------===//
1824 // SSE 1 & 2 - Compare Instructions
1825 //===----------------------------------------------------------------------===//
1827 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
1828 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1829 Operand memop, SDNode OpNode, ValueType VT,
1830 PatFrag ld_frag, string asm,
1831 X86FoldableSchedWrite sched,
1832 PatFrags mem_frags> {
1833 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1834 (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1835 [(set VR128:$dst, (OpNode (VT VR128:$src1),
1836 VR128:$src2, timm:$cc))]>,
1837 Sched<[sched]>, SIMD_EXC;
1839 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1840 (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1841 [(set VR128:$dst, (OpNode (VT VR128:$src1),
1842 (mem_frags addr:$src2), timm:$cc))]>,
1843 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1845 let isCodeGenOnly = 1 in {
1846 let isCommutable = 1 in
1847 def rr : SIi8<0xC2, MRMSrcReg,
1848 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1849 [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1850 Sched<[sched]>, SIMD_EXC;
1851 def rm : SIi8<0xC2, MRMSrcMem,
1852 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1853 [(set RC:$dst, (OpNode RC:$src1,
1854 (ld_frag addr:$src2), timm:$cc))]>,
1855 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1859 let ExeDomain = SSEPackedSingle in
1860 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1861 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1862 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1863 XS, VEX_4V, VEX_LIG, WIG;
1864 let ExeDomain = SSEPackedDouble in
1865 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1866 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1867 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1868 XD, VEX_4V, VEX_LIG, WIG;
1870 let Constraints = "$src1 = $dst" in {
1871 let ExeDomain = SSEPackedSingle in
1872 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1873 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1874 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1875 let ExeDomain = SSEPackedDouble in
1876 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1877 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1878 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1881 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
1882 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
1883 ValueType vt, X86MemOperand x86memop,
1884 PatFrag ld_frag, string OpcodeStr, Domain d,
1885 X86FoldableSchedWrite sched = WriteFComX> {
1886 let ExeDomain = d in {
1887 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1888 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1889 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1890 Sched<[sched]>, SIMD_EXC;
1892 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1893 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1894 [(set EFLAGS, (OpNode (vt RC:$src1),
1895 (ld_frag addr:$src2)))]>,
1896 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1900 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
1901 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1902 ValueType vt, Operand memop,
1903 PatFrags mem_frags, string OpcodeStr,
1905 X86FoldableSchedWrite sched = WriteFComX> {
1906 let ExeDomain = d in {
1907 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1908 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1909 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1910 Sched<[sched]>, SIMD_EXC;
1912 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1913 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1914 [(set EFLAGS, (OpNode (vt RC:$src1),
1915 (mem_frags addr:$src2)))]>,
1916 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1920 let Defs = [EFLAGS] in {
1921 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1922 "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1923 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1924 "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1925 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1926 "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1927 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1928 "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1930 let isCodeGenOnly = 1 in {
1931 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1932 sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1933 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1934 sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1936 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1937 sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1938 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1939 sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1941 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1942 "ucomiss", SSEPackedSingle>, PS;
1943 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1944 "ucomisd", SSEPackedDouble>, PD;
1945 defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1946 "comiss", SSEPackedSingle>, PS;
1947 defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1948 "comisd", SSEPackedDouble>, PD;
1950 let isCodeGenOnly = 1 in {
1951 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1952 sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1953 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1954 sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1956 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1957 sse_load_f32, "comiss", SSEPackedSingle>, PS;
1958 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1959 sse_load_f64, "comisd", SSEPackedDouble>, PD;
1961 } // Defs = [EFLAGS]
1963 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
1964 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1965 ValueType VT, string asm,
1966 X86FoldableSchedWrite sched,
1967 Domain d, PatFrag ld_frag> {
1968 let isCommutable = 1 in
1969 def rri : PIi8<0xC2, MRMSrcReg,
1970 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1971 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1972 Sched<[sched]>, SIMD_EXC;
1973 def rmi : PIi8<0xC2, MRMSrcMem,
1974 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1976 (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1977 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1980 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1981 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1982 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, WIG;
1983 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1984 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1985 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, WIG;
1986 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1987 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1988 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, WIG;
1989 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1990 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1991 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, WIG;
1992 let Constraints = "$src1 = $dst" in {
1993 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1994 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1995 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1996 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1997 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1998 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
2001 def CommutableCMPCC : PatLeaf<(timm), [{
2002 uint64_t Imm = N->getZExtValue() & 0x7;
2003 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
2006 // Patterns to select compares with loads in first operand.
2007 let Predicates = [HasAVX] in {
2008 def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
2009 CommutableCMPCC:$cc)),
2010 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2012 def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
2013 CommutableCMPCC:$cc)),
2014 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2016 def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
2017 CommutableCMPCC:$cc)),
2018 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2020 def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
2021 CommutableCMPCC:$cc)),
2022 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2024 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2025 CommutableCMPCC:$cc)),
2026 (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2028 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2029 CommutableCMPCC:$cc)),
2030 (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2033 let Predicates = [UseSSE2] in {
2034 def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
2035 CommutableCMPCC:$cc)),
2036 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2038 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2039 CommutableCMPCC:$cc)),
2040 (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2043 let Predicates = [UseSSE1] in {
2044 def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2045 CommutableCMPCC:$cc)),
2046 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2048 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2049 CommutableCMPCC:$cc)),
2050 (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2053 //===----------------------------------------------------------------------===//
2054 // SSE 1 & 2 - Shuffle Instructions
2055 //===----------------------------------------------------------------------===//
2057 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2058 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2059 ValueType vt, string asm, PatFrag mem_frag,
2060 X86FoldableSchedWrite sched, Domain d,
2061 bit IsCommutable = 0> {
2062 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2063 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2064 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2065 (i8 timm:$src3))))], d>,
2066 Sched<[sched.Folded, sched.ReadAfterFold]>;
2067 let isCommutable = IsCommutable in
2068 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2069 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2070 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2071 (i8 timm:$src3))))], d>,
2075 let Predicates = [HasAVX, NoVLX] in {
2076 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2077 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2078 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2080 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2081 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2082 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2083 PS, VEX_4V, VEX_L, WIG;
2084 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2085 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2086 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2088 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2089 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2090 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2091 PD, VEX_4V, VEX_L, WIG;
2093 let Constraints = "$src1 = $dst" in {
2094 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2095 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2096 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2097 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2098 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2099 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2102 //===----------------------------------------------------------------------===//
2103 // SSE 1 & 2 - Unpack FP Instructions
2104 //===----------------------------------------------------------------------===//
2106 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2107 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2108 PatFrag mem_frag, RegisterClass RC,
2109 X86MemOperand x86memop, string asm,
2110 X86FoldableSchedWrite sched, Domain d,
2111 bit IsCommutable = 0> {
2112 let isCommutable = IsCommutable in
2113 def rr : PI<opc, MRMSrcReg,
2114 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2116 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2118 def rm : PI<opc, MRMSrcMem,
2119 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2121 (vt (OpNode RC:$src1,
2122 (mem_frag addr:$src2))))], d>,
2123 Sched<[sched.Folded, sched.ReadAfterFold]>;
2126 let Predicates = [HasAVX, NoVLX] in {
2127 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2128 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2129 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
2130 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2131 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2132 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, WIG;
2133 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2134 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2135 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
2136 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2137 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2138 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, WIG;
2140 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2141 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2142 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
2143 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2144 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2145 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
2146 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2147 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2148 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
2149 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2150 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2151 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
2152 }// Predicates = [HasAVX, NoVLX]
2154 let Constraints = "$src1 = $dst" in {
2155 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2156 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2157 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2158 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2159 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2160 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2161 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2162 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2163 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2164 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2165 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2166 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2167 } // Constraints = "$src1 = $dst"
2169 let Predicates = [HasAVX1Only] in {
2170 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2171 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2172 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2173 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2174 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2175 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2176 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2177 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2179 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2180 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2181 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2182 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2183 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2184 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2185 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2186 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2189 let Predicates = [UseSSE2] in {
2190 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2191 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2192 (v2f64 (simple_load addr:$src2)))),
2193 (MOVHPDrm VR128:$src1, addr:$src2)>;
2196 //===----------------------------------------------------------------------===//
2197 // SSE 1 & 2 - Extract Floating-Point Sign mask
2198 //===----------------------------------------------------------------------===//
2200 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2201 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2202 string asm, Domain d> {
2203 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2204 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2205 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2206 Sched<[WriteFMOVMSK]>;
2209 let Predicates = [HasAVX] in {
2210 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2211 SSEPackedSingle>, PS, VEX, WIG;
2212 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2213 SSEPackedDouble>, PD, VEX, WIG;
2214 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2215 SSEPackedSingle>, PS, VEX, VEX_L, WIG;
2216 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2217 SSEPackedDouble>, PD, VEX, VEX_L, WIG;
2219 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2220 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2221 (VMOVMSKPSrr VR128:$src)>;
2222 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2223 (VMOVMSKPDrr VR128:$src)>;
2224 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2225 (VMOVMSKPSYrr VR256:$src)>;
2226 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2227 (VMOVMSKPDYrr VR256:$src)>;
2230 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2231 SSEPackedSingle>, PS;
2232 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2233 SSEPackedDouble>, PD;
2235 let Predicates = [UseSSE2] in {
2236 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2237 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2238 (MOVMSKPSrr VR128:$src)>;
2239 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2240 (MOVMSKPDrr VR128:$src)>;
2243 //===---------------------------------------------------------------------===//
2244 // SSE2 - Packed Integer Logical Instructions
2245 //===---------------------------------------------------------------------===//
2247 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2249 /// PDI_binop_rm - Simple SSE2 binary operator.
2250 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2251 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2252 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2253 bit IsCommutable, bit Is2Addr> {
2254 let isCommutable = IsCommutable in
2255 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2256 (ins RC:$src1, RC:$src2),
2258 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2259 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2260 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2262 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2263 (ins RC:$src1, x86memop:$src2),
2265 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2266 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2267 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2268 Sched<[sched.Folded, sched.ReadAfterFold]>;
2270 } // ExeDomain = SSEPackedInt
2272 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2273 ValueType OpVT128, ValueType OpVT256,
2274 X86SchedWriteWidths sched, bit IsCommutable,
2276 let Predicates = [HasAVX, prd] in
2277 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2278 VR128, load, i128mem, sched.XMM,
2279 IsCommutable, 0>, VEX_4V, WIG;
2281 let Constraints = "$src1 = $dst" in
2282 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2283 memop, i128mem, sched.XMM, IsCommutable, 1>;
2285 let Predicates = [HasAVX2, prd] in
2286 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2287 OpVT256, VR256, load, i256mem, sched.YMM,
2288 IsCommutable, 0>, VEX_4V, VEX_L, WIG;
2291 // These are ordered here for pattern ordering requirements with the fp versions
2293 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2294 SchedWriteVecLogic, 1, NoVLX>;
2295 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2296 SchedWriteVecLogic, 1, NoVLX>;
2297 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2298 SchedWriteVecLogic, 1, NoVLX>;
2299 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2300 SchedWriteVecLogic, 0, NoVLX>;
2302 //===----------------------------------------------------------------------===//
2303 // SSE 1 & 2 - Logical Instructions
2304 //===----------------------------------------------------------------------===//
2306 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2308 /// There are no patterns here because isel prefers integer versions for SSE2
2309 /// and later. There are SSE1 v4f32 patterns later.
2310 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2311 X86SchedWriteWidths sched> {
2312 let Predicates = [HasAVX, NoVLX] in {
2313 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2314 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2315 [], [], 0>, PS, VEX_4V, VEX_L, WIG;
2317 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2318 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2319 [], [], 0>, PD, VEX_4V, VEX_L, WIG;
2321 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2322 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2323 [], [], 0>, PS, VEX_4V, WIG;
2325 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2326 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2327 [], [], 0>, PD, VEX_4V, WIG;
2330 let Constraints = "$src1 = $dst" in {
2331 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2332 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2335 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2336 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2341 defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
2342 defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
2343 defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
2344 let isCommutable = 0 in
2345 defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
2347 let Predicates = [HasAVX2, NoVLX] in {
2348 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2349 (VPANDYrr VR256:$src1, VR256:$src2)>;
2350 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2351 (VPANDYrr VR256:$src1, VR256:$src2)>;
2352 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2353 (VPANDYrr VR256:$src1, VR256:$src2)>;
2355 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2356 (VPORYrr VR256:$src1, VR256:$src2)>;
2357 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2358 (VPORYrr VR256:$src1, VR256:$src2)>;
2359 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2360 (VPORYrr VR256:$src1, VR256:$src2)>;
2362 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2363 (VPXORYrr VR256:$src1, VR256:$src2)>;
2364 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2365 (VPXORYrr VR256:$src1, VR256:$src2)>;
2366 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2367 (VPXORYrr VR256:$src1, VR256:$src2)>;
2369 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2370 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2371 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2372 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2373 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2374 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2376 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2377 (VPANDYrm VR256:$src1, addr:$src2)>;
2378 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2379 (VPANDYrm VR256:$src1, addr:$src2)>;
2380 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2381 (VPANDYrm VR256:$src1, addr:$src2)>;
2383 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2384 (VPORYrm VR256:$src1, addr:$src2)>;
2385 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2386 (VPORYrm VR256:$src1, addr:$src2)>;
2387 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2388 (VPORYrm VR256:$src1, addr:$src2)>;
2390 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2391 (VPXORYrm VR256:$src1, addr:$src2)>;
2392 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2393 (VPXORYrm VR256:$src1, addr:$src2)>;
2394 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2395 (VPXORYrm VR256:$src1, addr:$src2)>;
2397 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2398 (VPANDNYrm VR256:$src1, addr:$src2)>;
2399 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2400 (VPANDNYrm VR256:$src1, addr:$src2)>;
2401 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2402 (VPANDNYrm VR256:$src1, addr:$src2)>;
2405 // If only AVX1 is supported, we need to handle integer operations with
2406 // floating point instructions since the integer versions aren't available.
2407 let Predicates = [HasAVX1Only] in {
2408 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2409 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2410 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2411 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2412 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2413 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2414 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2415 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2417 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2418 (VORPSYrr VR256:$src1, VR256:$src2)>;
2419 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2420 (VORPSYrr VR256:$src1, VR256:$src2)>;
2421 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2422 (VORPSYrr VR256:$src1, VR256:$src2)>;
2423 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2424 (VORPSYrr VR256:$src1, VR256:$src2)>;
2426 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2427 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2428 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2429 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2430 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2431 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2432 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2433 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2435 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2436 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2437 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2438 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2439 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2440 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2441 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2442 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2444 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2445 (VANDPSYrm VR256:$src1, addr:$src2)>;
2446 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2447 (VANDPSYrm VR256:$src1, addr:$src2)>;
2448 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2449 (VANDPSYrm VR256:$src1, addr:$src2)>;
2450 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2451 (VANDPSYrm VR256:$src1, addr:$src2)>;
2453 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2454 (VORPSYrm VR256:$src1, addr:$src2)>;
2455 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2456 (VORPSYrm VR256:$src1, addr:$src2)>;
2457 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2458 (VORPSYrm VR256:$src1, addr:$src2)>;
2459 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2460 (VORPSYrm VR256:$src1, addr:$src2)>;
2462 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2463 (VXORPSYrm VR256:$src1, addr:$src2)>;
2464 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2465 (VXORPSYrm VR256:$src1, addr:$src2)>;
2466 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2467 (VXORPSYrm VR256:$src1, addr:$src2)>;
2468 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2469 (VXORPSYrm VR256:$src1, addr:$src2)>;
2471 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2472 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2473 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2474 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2475 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2476 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2477 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2478 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2481 let Predicates = [HasAVX, NoVLX] in {
2482 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2483 (VPANDrr VR128:$src1, VR128:$src2)>;
2484 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2485 (VPANDrr VR128:$src1, VR128:$src2)>;
2486 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2487 (VPANDrr VR128:$src1, VR128:$src2)>;
2489 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2490 (VPORrr VR128:$src1, VR128:$src2)>;
2491 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2492 (VPORrr VR128:$src1, VR128:$src2)>;
2493 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2494 (VPORrr VR128:$src1, VR128:$src2)>;
2496 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2497 (VPXORrr VR128:$src1, VR128:$src2)>;
2498 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2499 (VPXORrr VR128:$src1, VR128:$src2)>;
2500 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2501 (VPXORrr VR128:$src1, VR128:$src2)>;
2503 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2504 (VPANDNrr VR128:$src1, VR128:$src2)>;
2505 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2506 (VPANDNrr VR128:$src1, VR128:$src2)>;
2507 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2508 (VPANDNrr VR128:$src1, VR128:$src2)>;
2510 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2511 (VPANDrm VR128:$src1, addr:$src2)>;
2512 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2513 (VPANDrm VR128:$src1, addr:$src2)>;
2514 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2515 (VPANDrm VR128:$src1, addr:$src2)>;
2517 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2518 (VPORrm VR128:$src1, addr:$src2)>;
2519 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2520 (VPORrm VR128:$src1, addr:$src2)>;
2521 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2522 (VPORrm VR128:$src1, addr:$src2)>;
2524 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2525 (VPXORrm VR128:$src1, addr:$src2)>;
2526 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2527 (VPXORrm VR128:$src1, addr:$src2)>;
2528 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2529 (VPXORrm VR128:$src1, addr:$src2)>;
2531 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2532 (VPANDNrm VR128:$src1, addr:$src2)>;
2533 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2534 (VPANDNrm VR128:$src1, addr:$src2)>;
2535 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2536 (VPANDNrm VR128:$src1, addr:$src2)>;
2539 let Predicates = [UseSSE2] in {
2540 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2541 (PANDrr VR128:$src1, VR128:$src2)>;
2542 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2543 (PANDrr VR128:$src1, VR128:$src2)>;
2544 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2545 (PANDrr VR128:$src1, VR128:$src2)>;
2547 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2548 (PORrr VR128:$src1, VR128:$src2)>;
2549 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2550 (PORrr VR128:$src1, VR128:$src2)>;
2551 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2552 (PORrr VR128:$src1, VR128:$src2)>;
2554 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2555 (PXORrr VR128:$src1, VR128:$src2)>;
2556 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2557 (PXORrr VR128:$src1, VR128:$src2)>;
2558 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2559 (PXORrr VR128:$src1, VR128:$src2)>;
2561 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2562 (PANDNrr VR128:$src1, VR128:$src2)>;
2563 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2564 (PANDNrr VR128:$src1, VR128:$src2)>;
2565 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2566 (PANDNrr VR128:$src1, VR128:$src2)>;
2568 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2569 (PANDrm VR128:$src1, addr:$src2)>;
2570 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2571 (PANDrm VR128:$src1, addr:$src2)>;
2572 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2573 (PANDrm VR128:$src1, addr:$src2)>;
2575 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2576 (PORrm VR128:$src1, addr:$src2)>;
2577 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2578 (PORrm VR128:$src1, addr:$src2)>;
2579 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2580 (PORrm VR128:$src1, addr:$src2)>;
2582 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2583 (PXORrm VR128:$src1, addr:$src2)>;
2584 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2585 (PXORrm VR128:$src1, addr:$src2)>;
2586 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2587 (PXORrm VR128:$src1, addr:$src2)>;
2589 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2590 (PANDNrm VR128:$src1, addr:$src2)>;
2591 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2592 (PANDNrm VR128:$src1, addr:$src2)>;
2593 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2594 (PANDNrm VR128:$src1, addr:$src2)>;
2597 // Patterns for packed operations when we don't have integer type available.
2598 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2599 (ANDPSrr VR128:$src1, VR128:$src2)>;
2600 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2601 (ORPSrr VR128:$src1, VR128:$src2)>;
2602 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2603 (XORPSrr VR128:$src1, VR128:$src2)>;
2604 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2605 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2607 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2608 (ANDPSrm VR128:$src1, addr:$src2)>;
2609 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2610 (ORPSrm VR128:$src1, addr:$src2)>;
2611 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2612 (XORPSrm VR128:$src1, addr:$src2)>;
2613 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2614 (ANDNPSrm VR128:$src1, addr:$src2)>;
2616 //===----------------------------------------------------------------------===//
2617 // SSE 1 & 2 - Arithmetic Instructions
2618 //===----------------------------------------------------------------------===//
2620 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2623 /// In addition, we also have a special variant of the scalar form here to
2624 /// represent the associated intrinsic operation. This form is unlike the
2625 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2626 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2628 /// These three forms can each be reg+reg or reg+mem.
2631 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2633 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2634 SDPatternOperator OpNode, X86SchedWriteSizes sched> {
2635 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2636 let Predicates = [HasAVX, NoVLX] in {
2637 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2638 VR128, v4f32, f128mem, loadv4f32,
2639 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, WIG;
2640 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2641 VR128, v2f64, f128mem, loadv2f64,
2642 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, WIG;
2644 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2645 OpNode, VR256, v8f32, f256mem, loadv8f32,
2646 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, WIG;
2647 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2648 OpNode, VR256, v4f64, f256mem, loadv4f64,
2649 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, WIG;
2652 let Constraints = "$src1 = $dst" in {
2653 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2654 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2656 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2657 v2f64, f128mem, memopv2f64, SSEPackedDouble,
2663 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2664 X86SchedWriteSizes sched> {
2665 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2666 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2667 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2668 XS, VEX_4V, VEX_LIG, WIG;
2669 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2670 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2671 XD, VEX_4V, VEX_LIG, WIG;
2673 let Constraints = "$src1 = $dst" in {
2674 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2675 OpNode, FR32, f32mem, SSEPackedSingle,
2677 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2678 OpNode, FR64, f64mem, SSEPackedDouble,
2684 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2685 SDPatternOperator OpNode,
2686 X86SchedWriteSizes sched> {
2687 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2688 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2689 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2690 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, WIG;
2691 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2692 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2693 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, WIG;
2695 let Constraints = "$src1 = $dst" in {
2696 defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2697 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2698 SSEPackedSingle, sched.PS.Scl>, XS;
2699 defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2700 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2701 SSEPackedDouble, sched.PD.Scl>, XD;
2706 // Binary Arithmetic instructions
2707 defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2708 basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2709 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2710 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2711 basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2712 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2713 let isCommutable = 0 in {
2714 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2715 basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2716 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2717 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2718 basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2719 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2720 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2721 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2722 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2723 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2724 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2725 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2728 let isCodeGenOnly = 1 in {
2729 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2730 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2731 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2732 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2735 // Patterns used to select SSE scalar fp arithmetic instructions from
2738 // (1) a scalar fp operation followed by a blend
2740 // The effect is that the backend no longer emits unnecessary vector
2741 // insert instructions immediately after SSE scalar fp instructions
2742 // like addss or mulss.
2744 // For example, given the following code:
2745 // __m128 foo(__m128 A, __m128 B) {
2750 // Previously we generated:
2751 // addss %xmm0, %xmm1
2752 // movss %xmm1, %xmm0
2755 // addss %xmm1, %xmm0
2757 // (2) a vector packed single/double fp operation followed by a vector insert
2759 // The effect is that the backend converts the packed fp instruction
2760 // followed by a vector insert into a single SSE scalar fp instruction.
2762 // For example, given the following code:
2763 // __m128 foo(__m128 A, __m128 B) {
2764 // __m128 C = A + B;
2765 // return (__m128) {c[0], a[1], a[2], a[3]};
2768 // Previously we generated:
2769 // addps %xmm0, %xmm1
2770 // movss %xmm1, %xmm0
2773 // addss %xmm1, %xmm0
2775 // TODO: Some canonicalization in lowering would simplify the number of
2776 // patterns we have to try to match.
2777 multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
2778 ValueType VT, ValueType EltTy,
2779 RegisterClass RC, PatFrag ld_frag,
2780 Predicate BasePredicate> {
2781 let Predicates = [BasePredicate] in {
2782 // extracted scalar math op with insert via movss/movsd
2783 def : Pat<(VT (Move (VT VR128:$dst),
2784 (VT (scalar_to_vector
2785 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2787 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2788 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2789 def : Pat<(VT (Move (VT VR128:$dst),
2790 (VT (scalar_to_vector
2791 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2792 (ld_frag addr:$src)))))),
2793 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2796 // Repeat for AVX versions of the instructions.
2797 let Predicates = [UseAVX] in {
2798 // extracted scalar math op with insert via movss/movsd
2799 def : Pat<(VT (Move (VT VR128:$dst),
2800 (VT (scalar_to_vector
2801 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2803 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2804 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2805 def : Pat<(VT (Move (VT VR128:$dst),
2806 (VT (scalar_to_vector
2807 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2808 (ld_frag addr:$src)))))),
2809 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2813 defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2814 defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2815 defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2816 defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2818 defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2819 defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2820 defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2821 defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2824 /// In addition, we also have a special variant of the scalar form here to
2825 /// represent the associated intrinsic operation. This form is unlike the
2826 /// plain scalar form, in that it takes an entire vector (instead of a
2827 /// scalar) and leaves the top elements undefined.
2829 /// And, we have a special variant form for a full-vector intrinsic form.
2831 /// sse_fp_unop_s - SSE1 unops in scalar form
2832 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2833 /// the HW instructions are 2 operand / destructive.
2834 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2835 X86MemOperand x86memop, Operand intmemop,
2836 SDPatternOperator OpNode, Domain d,
2837 X86FoldableSchedWrite sched, Predicate target> {
2838 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2839 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2840 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2841 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2844 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2845 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2846 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2847 Sched<[sched.Folded]>,
2848 Requires<[target, OptForSize]>;
2851 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2852 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2853 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2856 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2857 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2858 Sched<[sched.Folded, sched.ReadAfterFold]>;
2863 multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2864 Intrinsic Intr, Predicate target> {
2865 let Predicates = [target] in {
2866 // These are unary operations, but they are modeled as having 2 source operands
2867 // because the high elements of the destination are unchanged in SSE.
2868 def : Pat<(Intr VR128:$src),
2869 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2871 // We don't want to fold scalar loads into these instructions unless
2872 // optimizing for size. This is because the folded instruction will have a
2873 // partial register update, while the unfolded sequence will not, e.g.
2875 // rcpss %xmm0, %xmm0
2876 // which has a clobber before the rcp, vs.
2878 let Predicates = [target, OptForSize] in {
2879 def : Pat<(Intr (mem_frags addr:$src2)),
2880 (!cast<Instruction>(NAME#m_Int)
2881 (vt (IMPLICIT_DEF)), addr:$src2)>;
2885 multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2886 Intrinsic Intr, Predicate target> {
2887 let Predicates = [target] in {
2888 def : Pat<(Intr VR128:$src),
2889 (!cast<Instruction>(NAME#r_Int) VR128:$src,
2892 let Predicates = [target, OptForSize] in {
2893 def : Pat<(Intr (mem_frags addr:$src2)),
2894 (!cast<Instruction>(NAME#m_Int)
2895 (vt (IMPLICIT_DEF)), addr:$src2)>;
2899 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2900 ValueType ScalarVT, X86MemOperand x86memop,
2901 Operand intmemop, SDPatternOperator OpNode, Domain d,
2902 X86FoldableSchedWrite sched, Predicate target> {
2903 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2904 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2905 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2906 [], d>, Sched<[sched]>;
2908 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2909 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2910 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2912 let hasSideEffects = 0, ExeDomain = d in {
2913 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2914 (ins VR128:$src1, VR128:$src2),
2915 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2916 []>, Sched<[sched]>;
2918 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2919 (ins VR128:$src1, intmemop:$src2),
2920 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2921 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2924 // We don't want to fold scalar loads into these instructions unless
2925 // optimizing for size. This is because the folded instruction will have a
2926 // partial register update, while the unfolded sequence will not, e.g.
2927 // vmovss mem, %xmm0
2928 // vrcpss %xmm0, %xmm0, %xmm0
2929 // which has a clobber before the rcp, vs.
2930 // vrcpss mem, %xmm0, %xmm0
2931 // TODO: In theory, we could fold the load, and avoid the stall caused by
2932 // the partial register store, either in BreakFalseDeps or with smarter RA.
2933 let Predicates = [target] in {
2934 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
2935 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2937 let Predicates = [target, OptForSize] in {
2938 def : Pat<(ScalarVT (OpNode (load addr:$src))),
2939 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2944 /// sse1_fp_unop_p - SSE1 unops in packed form.
2945 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2946 X86SchedWriteWidths sched, list<Predicate> prds> {
2947 let Predicates = prds in {
2948 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2949 !strconcat("v", OpcodeStr,
2950 "ps\t{$src, $dst|$dst, $src}"),
2951 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2952 VEX, Sched<[sched.XMM]>, WIG;
2953 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2954 !strconcat("v", OpcodeStr,
2955 "ps\t{$src, $dst|$dst, $src}"),
2956 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2957 VEX, Sched<[sched.XMM.Folded]>, WIG;
2958 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2959 !strconcat("v", OpcodeStr,
2960 "ps\t{$src, $dst|$dst, $src}"),
2961 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2962 VEX, VEX_L, Sched<[sched.YMM]>, WIG;
2963 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2964 !strconcat("v", OpcodeStr,
2965 "ps\t{$src, $dst|$dst, $src}"),
2966 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2967 VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
2970 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2971 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2972 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2974 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2975 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2976 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2977 Sched<[sched.XMM.Folded]>;
2980 /// sse2_fp_unop_p - SSE2 unops in vector forms.
2981 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2982 SDPatternOperator OpNode, X86SchedWriteWidths sched> {
2983 let Predicates = [HasAVX, NoVLX] in {
2984 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2985 !strconcat("v", OpcodeStr,
2986 "pd\t{$src, $dst|$dst, $src}"),
2987 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2988 VEX, Sched<[sched.XMM]>, WIG;
2989 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2990 !strconcat("v", OpcodeStr,
2991 "pd\t{$src, $dst|$dst, $src}"),
2992 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2993 VEX, Sched<[sched.XMM.Folded]>, WIG;
2994 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2995 !strconcat("v", OpcodeStr,
2996 "pd\t{$src, $dst|$dst, $src}"),
2997 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2998 VEX, VEX_L, Sched<[sched.YMM]>, WIG;
2999 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3000 !strconcat("v", OpcodeStr,
3001 "pd\t{$src, $dst|$dst, $src}"),
3002 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
3003 VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
3006 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3007 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3008 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
3010 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3011 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3012 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
3013 Sched<[sched.XMM.Folded]>;
3016 multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
3017 defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32,
3018 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
3020 defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
3021 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
3023 XS, VEX_4V, VEX_LIG, WIG;
3026 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
3027 X86SchedWriteWidths sched, Predicate AVXTarget> {
3028 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
3029 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
3030 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
3031 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
3032 XS, VEX_4V, VEX_LIG, WIG;
3035 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
3036 X86SchedWriteWidths sched, Predicate AVXTarget> {
3037 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
3038 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
3039 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
3040 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
3041 XD, VEX_4V, VEX_LIG, WIG;
3045 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
3046 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3047 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3048 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3050 // Reciprocal approximations. Note that these typically require refinement
3051 // in order to obtain suitable precision.
3052 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3053 sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
3054 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3055 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3056 sse1_fp_unop_s_intr<"rcp", HasAVX>,
3057 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3059 // There is no f64 version of the reciprocal approximation instructions.
3061 multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
3062 ValueType VT, Predicate BasePredicate> {
3063 let Predicates = [BasePredicate] in {
3064 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3065 (OpNode (extractelt VT:$src, 0))))),
3066 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3069 // Repeat for AVX versions of the instructions.
3070 let Predicates = [UseAVX] in {
3071 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3072 (OpNode (extractelt VT:$src, 0))))),
3073 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3077 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3078 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3080 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3081 SDNode Move, ValueType VT,
3082 Predicate BasePredicate> {
3083 let Predicates = [BasePredicate] in {
3084 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3085 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3088 // Repeat for AVX versions of the instructions.
3089 let Predicates = [HasAVX] in {
3090 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3091 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3095 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3097 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3101 //===----------------------------------------------------------------------===//
3102 // SSE 1 & 2 - Non-temporal stores
3103 //===----------------------------------------------------------------------===//
3105 let AddedComplexity = 400 in { // Prefer non-temporal versions
3106 let Predicates = [HasAVX, NoVLX] in {
3107 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3108 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3109 (ins f128mem:$dst, VR128:$src),
3110 "movntps\t{$src, $dst|$dst, $src}",
3111 [(alignednontemporalstore (v4f32 VR128:$src),
3112 addr:$dst)]>, VEX, WIG;
3113 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3114 (ins f128mem:$dst, VR128:$src),
3115 "movntpd\t{$src, $dst|$dst, $src}",
3116 [(alignednontemporalstore (v2f64 VR128:$src),
3117 addr:$dst)]>, VEX, WIG;
3120 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3121 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3122 (ins f256mem:$dst, VR256:$src),
3123 "movntps\t{$src, $dst|$dst, $src}",
3124 [(alignednontemporalstore (v8f32 VR256:$src),
3125 addr:$dst)]>, VEX, VEX_L, WIG;
3126 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3127 (ins f256mem:$dst, VR256:$src),
3128 "movntpd\t{$src, $dst|$dst, $src}",
3129 [(alignednontemporalstore (v4f64 VR256:$src),
3130 addr:$dst)]>, VEX, VEX_L, WIG;
3133 let ExeDomain = SSEPackedInt in {
3134 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3135 (ins i128mem:$dst, VR128:$src),
3136 "movntdq\t{$src, $dst|$dst, $src}",
3137 [(alignednontemporalstore (v2i64 VR128:$src),
3138 addr:$dst)]>, VEX, WIG,
3139 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3140 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3141 (ins i256mem:$dst, VR256:$src),
3142 "movntdq\t{$src, $dst|$dst, $src}",
3143 [(alignednontemporalstore (v4i64 VR256:$src),
3144 addr:$dst)]>, VEX, VEX_L, WIG,
3145 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3149 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3150 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3151 "movntps\t{$src, $dst|$dst, $src}",
3152 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3153 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3154 "movntpd\t{$src, $dst|$dst, $src}",
3155 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3158 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3159 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3160 "movntdq\t{$src, $dst|$dst, $src}",
3161 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3163 let SchedRW = [WriteStoreNT] in {
3164 // There is no AVX form for instructions below this point
3165 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3166 "movnti{l}\t{$src, $dst|$dst, $src}",
3167 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3168 PS, Requires<[HasSSE2]>;
3169 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3170 "movnti{q}\t{$src, $dst|$dst, $src}",
3171 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3172 PS, Requires<[HasSSE2]>;
3173 } // SchedRW = [WriteStoreNT]
3175 let Predicates = [HasAVX, NoVLX] in {
3176 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3177 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3178 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3179 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3180 def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
3181 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3182 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3183 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3185 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3186 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3187 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3188 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3189 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3190 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3191 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3192 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3195 let Predicates = [UseSSE2] in {
3196 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3197 (MOVNTDQmr addr:$dst, VR128:$src)>;
3198 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3199 (MOVNTDQmr addr:$dst, VR128:$src)>;
3200 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3201 (MOVNTDQmr addr:$dst, VR128:$src)>;
3202 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3203 (MOVNTDQmr addr:$dst, VR128:$src)>;
3206 } // AddedComplexity
3208 //===----------------------------------------------------------------------===//
3209 // SSE 1 & 2 - Prefetch and memory fence
3210 //===----------------------------------------------------------------------===//
3212 // Prefetch intrinsic.
3213 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3214 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3215 "prefetcht0\t$src", [(prefetch addr:$src, timm, (i32 3), (i32 1))]>, TB;
3216 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3217 "prefetcht1\t$src", [(prefetch addr:$src, timm, (i32 2), (i32 1))]>, TB;
3218 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3219 "prefetcht2\t$src", [(prefetch addr:$src, timm, (i32 1), (i32 1))]>, TB;
3220 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3221 "prefetchnta\t$src", [(prefetch addr:$src, timm, (i32 0), (i32 1))]>, TB;
3224 // FIXME: How should flush instruction be modeled?
3225 let SchedRW = [WriteLoad] in {
3227 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3228 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3229 PS, Requires<[HasCLFLUSH]>;
3232 let SchedRW = [WriteNop] in {
3233 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3234 // was introduced with SSE2, it's backward compatible.
3235 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3236 "pause", [(int_x86_sse2_pause)]>, OBXS;
3239 let SchedRW = [WriteFence] in {
3240 // Load, store, and memory fence
3241 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3242 // to include any 64-bit target.
3243 def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3244 PS, Requires<[HasSSE1]>;
3245 def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3246 PS, Requires<[HasSSE2]>;
3247 def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3248 PS, Requires<[HasMFence]>;
3251 def : Pat<(X86MFence), (MFENCE)>;
3253 //===----------------------------------------------------------------------===//
3254 // SSE 1 & 2 - Load/Store XCSR register
3255 //===----------------------------------------------------------------------===//
3257 let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
3258 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3259 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3260 VEX, Sched<[WriteLDMXCSR]>, WIG;
3261 let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
3262 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3263 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3264 VEX, Sched<[WriteSTMXCSR]>, WIG;
3266 let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
3267 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3268 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3269 PS, Sched<[WriteLDMXCSR]>;
3270 let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
3271 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3272 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3273 PS, Sched<[WriteSTMXCSR]>;
3275 //===---------------------------------------------------------------------===//
3276 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3277 //===---------------------------------------------------------------------===//
3279 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3281 let hasSideEffects = 0 in {
3282 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3283 "movdqa\t{$src, $dst|$dst, $src}", []>,
3284 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
3285 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3286 "movdqu\t{$src, $dst|$dst, $src}", []>,
3287 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
3288 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3289 "movdqa\t{$src, $dst|$dst, $src}", []>,
3290 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
3291 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3292 "movdqu\t{$src, $dst|$dst, $src}", []>,
3293 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
3297 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3298 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3299 "movdqa\t{$src, $dst|$dst, $src}", []>,
3300 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3302 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3303 "movdqa\t{$src, $dst|$dst, $src}", []>,
3304 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3306 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3307 "movdqu\t{$src, $dst|$dst, $src}", []>,
3308 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3310 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3311 "movdqu\t{$src, $dst|$dst, $src}", []>,
3312 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3316 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3317 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3318 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3319 "movdqa\t{$src, $dst|$dst, $src}",
3320 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3321 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
3322 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3323 "movdqa\t{$src, $dst|$dst, $src}", []>,
3324 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3326 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3327 "vmovdqu\t{$src, $dst|$dst, $src}",
3328 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3329 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3331 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3332 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3333 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3334 XS, VEX, VEX_L, WIG;
3337 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3338 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3339 (ins i128mem:$dst, VR128:$src),
3340 "movdqa\t{$src, $dst|$dst, $src}",
3341 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3342 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, WIG;
3343 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3344 (ins i256mem:$dst, VR256:$src),
3345 "movdqa\t{$src, $dst|$dst, $src}", []>,
3346 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, WIG;
3347 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3348 "vmovdqu\t{$src, $dst|$dst, $src}",
3349 [(store (v2i64 VR128:$src), addr:$dst)]>,
3350 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, WIG;
3351 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3352 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3353 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, WIG;
3356 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3357 let hasSideEffects = 0 in {
3358 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3359 "movdqa\t{$src, $dst|$dst, $src}", []>;
3361 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3362 "movdqu\t{$src, $dst|$dst, $src}", []>,
3363 XS, Requires<[UseSSE2]>;
3367 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3368 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3369 "movdqa\t{$src, $dst|$dst, $src}", []>;
3371 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3372 "movdqu\t{$src, $dst|$dst, $src}", []>,
3373 XS, Requires<[UseSSE2]>;
3377 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3378 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3379 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3380 "movdqa\t{$src, $dst|$dst, $src}",
3381 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3382 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3383 "movdqu\t{$src, $dst|$dst, $src}",
3384 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3385 XS, Requires<[UseSSE2]>;
3388 let mayStore = 1, hasSideEffects = 0,
3389 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3390 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3391 "movdqa\t{$src, $dst|$dst, $src}",
3392 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3393 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3394 "movdqu\t{$src, $dst|$dst, $src}",
3395 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3396 XS, Requires<[UseSSE2]>;
3399 } // ExeDomain = SSEPackedInt
3401 // Reversed version with ".s" suffix for GAS compatibility.
3402 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3403 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3404 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3405 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3406 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3407 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3408 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3409 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3411 // Reversed version with ".s" suffix for GAS compatibility.
3412 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3413 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3414 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3415 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3417 let Predicates = [HasAVX, NoVLX] in {
3418 // Additional patterns for other integer sizes.
3419 def : Pat<(alignedloadv4i32 addr:$src),
3420 (VMOVDQArm addr:$src)>;
3421 def : Pat<(alignedloadv8i16 addr:$src),
3422 (VMOVDQArm addr:$src)>;
3423 def : Pat<(alignedloadv8f16 addr:$src),
3424 (VMOVDQArm addr:$src)>;
3425 def : Pat<(alignedloadv16i8 addr:$src),
3426 (VMOVDQArm addr:$src)>;
3427 def : Pat<(loadv4i32 addr:$src),
3428 (VMOVDQUrm addr:$src)>;
3429 def : Pat<(loadv8i16 addr:$src),
3430 (VMOVDQUrm addr:$src)>;
3431 def : Pat<(loadv8f16 addr:$src),
3432 (VMOVDQUrm addr:$src)>;
3433 def : Pat<(loadv16i8 addr:$src),
3434 (VMOVDQUrm addr:$src)>;
3436 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3437 (VMOVDQAmr addr:$dst, VR128:$src)>;
3438 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3439 (VMOVDQAmr addr:$dst, VR128:$src)>;
3440 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
3441 (VMOVDQAmr addr:$dst, VR128:$src)>;
3442 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3443 (VMOVDQAmr addr:$dst, VR128:$src)>;
3444 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3445 (VMOVDQUmr addr:$dst, VR128:$src)>;
3446 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3447 (VMOVDQUmr addr:$dst, VR128:$src)>;
3448 def : Pat<(store (v8f16 VR128:$src), addr:$dst),
3449 (VMOVDQUmr addr:$dst, VR128:$src)>;
3450 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3451 (VMOVDQUmr addr:$dst, VR128:$src)>;
3454 //===---------------------------------------------------------------------===//
3455 // SSE2 - Packed Integer Arithmetic Instructions
3456 //===---------------------------------------------------------------------===//
3458 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3460 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
3461 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3462 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3463 PatFrag memop_frag, X86MemOperand x86memop,
3464 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3465 let isCommutable = 1 in
3466 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3467 (ins RC:$src1, RC:$src2),
3469 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3470 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3471 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3473 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3474 (ins RC:$src1, x86memop:$src2),
3476 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3477 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3478 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3479 (memop_frag addr:$src2))))]>,
3480 Sched<[sched.Folded, sched.ReadAfterFold]>;
3482 } // ExeDomain = SSEPackedInt
3484 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3485 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3486 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3487 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3488 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3489 SchedWriteVecALU, 1, NoVLX>;
3490 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3491 SchedWriteVecALU, 1, NoVLX>;
3492 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3493 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3494 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3495 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3496 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3497 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3498 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3499 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3500 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3501 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3502 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3503 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3504 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3505 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3506 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3507 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3508 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3509 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3510 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3511 SchedWriteVecALU, 0, NoVLX>;
3512 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3513 SchedWriteVecALU, 0, NoVLX>;
3514 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3515 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3516 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3517 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3518 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3519 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3520 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3521 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3522 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3523 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3524 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3525 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3526 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3527 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3528 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3529 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3530 defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
3531 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3532 defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
3533 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3534 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3535 SchedWriteVecIMul, 1, NoVLX>;
3537 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3538 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3539 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3542 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3543 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3544 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3545 0>, VEX_4V, VEX_L, WIG;
3546 let Constraints = "$src1 = $dst" in
3547 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3548 memop, i128mem, SchedWriteVecIMul.XMM>;
3550 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3551 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3552 load, i128mem, SchedWritePSADBW.XMM, 0>,
3554 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3555 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3556 load, i256mem, SchedWritePSADBW.YMM, 0>,
3558 let Constraints = "$src1 = $dst" in
3559 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3560 memop, i128mem, SchedWritePSADBW.XMM>;
3562 //===---------------------------------------------------------------------===//
3563 // SSE2 - Packed Integer Logical Instructions
3564 //===---------------------------------------------------------------------===//
3566 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3567 string OpcodeStr, SDNode OpNode,
3568 SDNode OpNode2, RegisterClass RC,
3569 X86FoldableSchedWrite sched,
3570 X86FoldableSchedWrite schedImm,
3571 ValueType DstVT, ValueType SrcVT,
3572 PatFrag ld_frag, bit Is2Addr = 1> {
3573 // src2 is always 128-bit
3574 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3575 (ins RC:$src1, VR128:$src2),
3577 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3578 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3579 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3581 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3582 (ins RC:$src1, i128mem:$src2),
3584 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3585 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3586 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3587 (SrcVT (ld_frag addr:$src2)))))]>,
3588 Sched<[sched.Folded, sched.ReadAfterFold]>;
3589 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3590 (ins RC:$src1, u8imm:$src2),
3592 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3593 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3594 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3598 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3599 string OpcodeStr, SDNode OpNode,
3600 SDNode OpNode2, ValueType DstVT128,
3601 ValueType DstVT256, ValueType SrcVT,
3602 X86SchedWriteWidths sched,
3603 X86SchedWriteWidths schedImm, Predicate prd> {
3604 let Predicates = [HasAVX, prd] in
3605 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3606 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3607 DstVT128, SrcVT, load, 0>, VEX_4V, WIG;
3608 let Predicates = [HasAVX2, prd] in
3609 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3610 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3611 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3613 let Constraints = "$src1 = $dst" in
3614 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3615 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3619 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3620 SDNode OpNode, RegisterClass RC, ValueType VT,
3621 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3622 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3624 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3625 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3626 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3630 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3631 SDNode OpNode, X86SchedWriteWidths sched> {
3632 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3633 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3634 VR128, v16i8, sched.XMM, 0>, VEX_4V, WIG;
3635 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3636 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3637 VR256, v32i8, sched.YMM, 0>,
3639 let Constraints = "$src1 = $dst" in
3640 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3644 let ExeDomain = SSEPackedInt in {
3645 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3646 v8i16, v16i16, v8i16, SchedWriteVecShift,
3647 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3648 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3649 v4i32, v8i32, v4i32, SchedWriteVecShift,
3650 SchedWriteVecShiftImm, NoVLX>;
3651 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3652 v2i64, v4i64, v2i64, SchedWriteVecShift,
3653 SchedWriteVecShiftImm, NoVLX>;
3655 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3656 v8i16, v16i16, v8i16, SchedWriteVecShift,
3657 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3658 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3659 v4i32, v8i32, v4i32, SchedWriteVecShift,
3660 SchedWriteVecShiftImm, NoVLX>;
3661 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3662 v2i64, v4i64, v2i64, SchedWriteVecShift,
3663 SchedWriteVecShiftImm, NoVLX>;
3665 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3666 v8i16, v16i16, v8i16, SchedWriteVecShift,
3667 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3668 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3669 v4i32, v8i32, v4i32, SchedWriteVecShift,
3670 SchedWriteVecShiftImm, NoVLX>;
3672 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3674 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3676 } // ExeDomain = SSEPackedInt
3678 //===---------------------------------------------------------------------===//
3679 // SSE2 - Packed Integer Comparison Instructions
3680 //===---------------------------------------------------------------------===//
3682 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3683 SchedWriteVecALU, 1, TruePredicate>;
3684 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3685 SchedWriteVecALU, 1, TruePredicate>;
3686 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3687 SchedWriteVecALU, 1, TruePredicate>;
3688 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3689 SchedWriteVecALU, 0, TruePredicate>;
3690 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3691 SchedWriteVecALU, 0, TruePredicate>;
3692 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3693 SchedWriteVecALU, 0, TruePredicate>;
3695 //===---------------------------------------------------------------------===//
3696 // SSE2 - Packed Integer Shuffle Instructions
3697 //===---------------------------------------------------------------------===//
3699 let ExeDomain = SSEPackedInt in {
3700 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3701 SDNode OpNode, X86SchedWriteWidths sched,
3703 let Predicates = [HasAVX, prd] in {
3704 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3705 (ins VR128:$src1, u8imm:$src2),
3706 !strconcat("v", OpcodeStr,
3707 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3709 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3710 VEX, Sched<[sched.XMM]>, WIG;
3711 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3712 (ins i128mem:$src1, u8imm:$src2),
3713 !strconcat("v", OpcodeStr,
3714 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3716 (vt128 (OpNode (load addr:$src1),
3717 (i8 timm:$src2))))]>, VEX,
3718 Sched<[sched.XMM.Folded]>, WIG;
3721 let Predicates = [HasAVX2, prd] in {
3722 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3723 (ins VR256:$src1, u8imm:$src2),
3724 !strconcat("v", OpcodeStr,
3725 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3727 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3728 VEX, VEX_L, Sched<[sched.YMM]>, WIG;
3729 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3730 (ins i256mem:$src1, u8imm:$src2),
3731 !strconcat("v", OpcodeStr,
3732 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3734 (vt256 (OpNode (load addr:$src1),
3735 (i8 timm:$src2))))]>, VEX, VEX_L,
3736 Sched<[sched.YMM.Folded]>, WIG;
3739 let Predicates = [UseSSE2] in {
3740 def ri : Ii8<0x70, MRMSrcReg,
3741 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3742 !strconcat(OpcodeStr,
3743 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3745 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3747 def mi : Ii8<0x70, MRMSrcMem,
3748 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3749 !strconcat(OpcodeStr,
3750 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3752 (vt128 (OpNode (memop addr:$src1),
3753 (i8 timm:$src2))))]>,
3754 Sched<[sched.XMM.Folded]>;
3757 } // ExeDomain = SSEPackedInt
3759 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3760 SchedWriteShuffle, NoVLX>, PD;
3761 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3762 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3763 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3764 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3766 //===---------------------------------------------------------------------===//
3767 // Packed Integer Pack Instructions (SSE & AVX)
3768 //===---------------------------------------------------------------------===//
3770 let ExeDomain = SSEPackedInt in {
3771 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3772 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3773 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3774 PatFrag ld_frag, bit Is2Addr = 1> {
3775 def rr : PDI<opc, MRMSrcReg,
3776 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3778 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3779 !strconcat(OpcodeStr,
3780 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3782 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3784 def rm : PDI<opc, MRMSrcMem,
3785 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3787 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3788 !strconcat(OpcodeStr,
3789 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3791 (OutVT (OpNode (ArgVT RC:$src1),
3792 (ld_frag addr:$src2))))]>,
3793 Sched<[sched.Folded, sched.ReadAfterFold]>;
3796 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3797 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3798 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3799 PatFrag ld_frag, bit Is2Addr = 1> {
3800 def rr : SS48I<opc, MRMSrcReg,
3801 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3803 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3804 !strconcat(OpcodeStr,
3805 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3807 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3809 def rm : SS48I<opc, MRMSrcMem,
3810 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3812 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3813 !strconcat(OpcodeStr,
3814 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3816 (OutVT (OpNode (ArgVT RC:$src1),
3817 (ld_frag addr:$src2))))]>,
3818 Sched<[sched.Folded, sched.ReadAfterFold]>;
3821 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3822 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3823 i128mem, SchedWriteShuffle.XMM, load, 0>,
3825 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3826 i128mem, SchedWriteShuffle.XMM, load, 0>,
3829 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3830 i128mem, SchedWriteShuffle.XMM, load, 0>,
3832 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3833 i128mem, SchedWriteShuffle.XMM, load, 0>,
3837 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3838 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3839 i256mem, SchedWriteShuffle.YMM, load, 0>,
3841 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3842 i256mem, SchedWriteShuffle.YMM, load, 0>,
3845 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3846 i256mem, SchedWriteShuffle.YMM, load, 0>,
3848 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3849 i256mem, SchedWriteShuffle.YMM, load, 0>,
3853 let Constraints = "$src1 = $dst" in {
3854 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3855 i128mem, SchedWriteShuffle.XMM, memop>;
3856 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3857 i128mem, SchedWriteShuffle.XMM, memop>;
3859 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3860 i128mem, SchedWriteShuffle.XMM, memop>;
3862 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3863 i128mem, SchedWriteShuffle.XMM, memop>;
3865 } // ExeDomain = SSEPackedInt
3867 //===---------------------------------------------------------------------===//
3868 // SSE2 - Packed Integer Unpack Instructions
3869 //===---------------------------------------------------------------------===//
3871 let ExeDomain = SSEPackedInt in {
3872 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3873 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3874 X86FoldableSchedWrite sched, PatFrag ld_frag,
3876 def rr : PDI<opc, MRMSrcReg,
3877 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3879 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3880 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3881 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3883 def rm : PDI<opc, MRMSrcMem,
3884 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3886 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3887 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3888 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3889 Sched<[sched.Folded, sched.ReadAfterFold]>;
3892 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3893 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3894 i128mem, SchedWriteShuffle.XMM, load, 0>,
3896 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3897 i128mem, SchedWriteShuffle.XMM, load, 0>,
3899 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3900 i128mem, SchedWriteShuffle.XMM, load, 0>,
3902 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3903 i128mem, SchedWriteShuffle.XMM, load, 0>,
3907 let Predicates = [HasAVX, NoVLX] in {
3908 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3909 i128mem, SchedWriteShuffle.XMM, load, 0>,
3911 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3912 i128mem, SchedWriteShuffle.XMM, load, 0>,
3914 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3915 i128mem, SchedWriteShuffle.XMM, load, 0>,
3917 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3918 i128mem, SchedWriteShuffle.XMM, load, 0>,
3922 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3923 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3924 i256mem, SchedWriteShuffle.YMM, load, 0>,
3926 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3927 i256mem, SchedWriteShuffle.YMM, load, 0>,
3929 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3930 i256mem, SchedWriteShuffle.YMM, load, 0>,
3932 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3933 i256mem, SchedWriteShuffle.YMM, load, 0>,
3937 let Predicates = [HasAVX2, NoVLX] in {
3938 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3939 i256mem, SchedWriteShuffle.YMM, load, 0>,
3941 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3942 i256mem, SchedWriteShuffle.YMM, load, 0>,
3944 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3945 i256mem, SchedWriteShuffle.YMM, load, 0>,
3947 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3948 i256mem, SchedWriteShuffle.YMM, load, 0>,
3952 let Constraints = "$src1 = $dst" in {
3953 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3954 i128mem, SchedWriteShuffle.XMM, memop>;
3955 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3956 i128mem, SchedWriteShuffle.XMM, memop>;
3957 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3958 i128mem, SchedWriteShuffle.XMM, memop>;
3959 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3960 i128mem, SchedWriteShuffle.XMM, memop>;
3962 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3963 i128mem, SchedWriteShuffle.XMM, memop>;
3964 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3965 i128mem, SchedWriteShuffle.XMM, memop>;
3966 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3967 i128mem, SchedWriteShuffle.XMM, memop>;
3968 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3969 i128mem, SchedWriteShuffle.XMM, memop>;
3971 } // ExeDomain = SSEPackedInt
3973 //===---------------------------------------------------------------------===//
3974 // SSE2 - Packed Integer Extract and Insert
3975 //===---------------------------------------------------------------------===//
3977 let ExeDomain = SSEPackedInt in {
3978 multiclass sse2_pinsrw<bit Is2Addr = 1> {
3979 def rr : Ii8<0xC4, MRMSrcReg,
3980 (outs VR128:$dst), (ins VR128:$src1,
3981 GR32orGR64:$src2, u8imm:$src3),
3983 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3984 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3986 (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
3987 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3988 def rm : Ii8<0xC4, MRMSrcMem,
3989 (outs VR128:$dst), (ins VR128:$src1,
3990 i16mem:$src2, u8imm:$src3),
3992 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3993 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3995 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3997 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
4001 let Predicates = [HasAVX, NoBWI] in
4002 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
4003 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4004 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4005 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4007 PD, VEX, WIG, Sched<[WriteVecExtract]>;
4008 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
4009 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4010 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4011 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4013 Sched<[WriteVecExtract]>;
4016 let Predicates = [HasAVX, NoBWI] in
4017 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, WIG;
4019 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4020 defm PINSRW : sse2_pinsrw, PD;
4022 } // ExeDomain = SSEPackedInt
4024 // Always select FP16 instructions if available.
4025 let Predicates = [UseSSE2], AddedComplexity = -10 in {
4026 def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4027 def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
4028 def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4029 def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4032 let Predicates = [HasAVX, NoBWI] in {
4033 def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4034 def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4035 def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4038 //===---------------------------------------------------------------------===//
4039 // SSE2 - Packed Mask Creation
4040 //===---------------------------------------------------------------------===//
4042 let ExeDomain = SSEPackedInt in {
4044 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4046 "pmovmskb\t{$src, $dst|$dst, $src}",
4047 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4048 Sched<[WriteVecMOVMSK]>, VEX, WIG;
4050 let Predicates = [HasAVX2] in {
4051 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4053 "pmovmskb\t{$src, $dst|$dst, $src}",
4054 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
4055 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, WIG;
4058 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4059 "pmovmskb\t{$src, $dst|$dst, $src}",
4060 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4061 Sched<[WriteVecMOVMSK]>;
4063 } // ExeDomain = SSEPackedInt
4065 //===---------------------------------------------------------------------===//
4066 // SSE2 - Conditional Store
4067 //===---------------------------------------------------------------------===//
4069 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4070 // As VEX does not have separate instruction contexts for address size
4071 // overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
4072 // Prefer VMASKMODDQU64.
4073 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4074 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4075 (ins VR128:$src, VR128:$mask),
4076 "maskmovdqu\t{$mask, $src|$src, $mask}",
4077 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4079 let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
4080 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4081 (ins VR128:$src, VR128:$mask),
4082 "maskmovdqu\t{$mask, $src|$src, $mask}",
4083 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4086 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4087 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4088 "maskmovdqu\t{$mask, $src|$src, $mask}",
4089 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4090 let Uses = [EDI], Predicates = [UseSSE2] in
4091 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4092 "maskmovdqu\t{$mask, $src|$src, $mask}",
4093 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4095 } // ExeDomain = SSEPackedInt
4097 //===---------------------------------------------------------------------===//
4098 // SSE2 - Move Doubleword/Quadword
4099 //===---------------------------------------------------------------------===//
4101 //===---------------------------------------------------------------------===//
4102 // Move Int Doubleword to Packed Double Int
4104 let ExeDomain = SSEPackedInt in {
4105 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4106 "movd\t{$src, $dst|$dst, $src}",
4108 (v4i32 (scalar_to_vector GR32:$src)))]>,
4109 VEX, Sched<[WriteVecMoveFromGpr]>;
4110 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4111 "movd\t{$src, $dst|$dst, $src}",
4113 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4114 VEX, Sched<[WriteVecLoad]>;
4115 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4116 "movq\t{$src, $dst|$dst, $src}",
4118 (v2i64 (scalar_to_vector GR64:$src)))]>,
4119 VEX, Sched<[WriteVecMoveFromGpr]>;
4120 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4121 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4122 "movq\t{$src, $dst|$dst, $src}", []>,
4123 VEX, Sched<[WriteVecLoad]>;
4124 let isCodeGenOnly = 1 in
4125 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4126 "movq\t{$src, $dst|$dst, $src}",
4127 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4128 VEX, Sched<[WriteVecMoveFromGpr]>;
4130 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4131 "movd\t{$src, $dst|$dst, $src}",
4133 (v4i32 (scalar_to_vector GR32:$src)))]>,
4134 Sched<[WriteVecMoveFromGpr]>;
4135 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4136 "movd\t{$src, $dst|$dst, $src}",
4138 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4139 Sched<[WriteVecLoad]>;
4140 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4141 "movq\t{$src, $dst|$dst, $src}",
4143 (v2i64 (scalar_to_vector GR64:$src)))]>,
4144 Sched<[WriteVecMoveFromGpr]>;
4145 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4146 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4147 "movq\t{$src, $dst|$dst, $src}", []>,
4148 Sched<[WriteVecLoad]>;
4149 let isCodeGenOnly = 1 in
4150 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4151 "movq\t{$src, $dst|$dst, $src}",
4152 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4153 Sched<[WriteVecMoveFromGpr]>;
4154 } // ExeDomain = SSEPackedInt
4156 //===---------------------------------------------------------------------===//
4157 // Move Int Doubleword to Single Scalar
4159 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4160 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4161 "movd\t{$src, $dst|$dst, $src}",
4162 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4163 VEX, Sched<[WriteVecMoveFromGpr]>;
4165 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4166 "movd\t{$src, $dst|$dst, $src}",
4167 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4168 Sched<[WriteVecMoveFromGpr]>;
4170 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4172 //===---------------------------------------------------------------------===//
4173 // Move Packed Doubleword Int to Packed Double Int
4175 let ExeDomain = SSEPackedInt in {
4176 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4177 "movd\t{$src, $dst|$dst, $src}",
4178 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4180 Sched<[WriteVecMoveToGpr]>;
4181 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4182 (ins i32mem:$dst, VR128:$src),
4183 "movd\t{$src, $dst|$dst, $src}",
4184 [(store (i32 (extractelt (v4i32 VR128:$src),
4185 (iPTR 0))), addr:$dst)]>,
4186 VEX, Sched<[WriteVecStore]>;
4187 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4188 "movd\t{$src, $dst|$dst, $src}",
4189 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4191 Sched<[WriteVecMoveToGpr]>;
4192 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4193 "movd\t{$src, $dst|$dst, $src}",
4194 [(store (i32 (extractelt (v4i32 VR128:$src),
4195 (iPTR 0))), addr:$dst)]>,
4196 Sched<[WriteVecStore]>;
4197 } // ExeDomain = SSEPackedInt
4199 //===---------------------------------------------------------------------===//
4200 // Move Packed Doubleword Int first element to Doubleword Int
4202 let ExeDomain = SSEPackedInt in {
4203 let SchedRW = [WriteVecMoveToGpr] in {
4204 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4205 "movq\t{$src, $dst|$dst, $src}",
4206 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4210 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4211 "movq\t{$src, $dst|$dst, $src}",
4212 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4216 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4217 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4218 (ins i64mem:$dst, VR128:$src),
4219 "movq\t{$src, $dst|$dst, $src}", []>,
4220 VEX, Sched<[WriteVecStore]>;
4221 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4222 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4223 "movq\t{$src, $dst|$dst, $src}", []>,
4224 Sched<[WriteVecStore]>;
4225 } // ExeDomain = SSEPackedInt
4227 //===---------------------------------------------------------------------===//
4228 // Bitcast FR64 <-> GR64
4230 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4231 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4232 "movq\t{$src, $dst|$dst, $src}",
4233 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4234 VEX, Sched<[WriteVecMoveToGpr]>;
4236 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4237 "movq\t{$src, $dst|$dst, $src}",
4238 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4239 Sched<[WriteVecMoveToGpr]>;
4240 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4242 //===---------------------------------------------------------------------===//
4243 // Move Scalar Single to Double Int
4245 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4246 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4247 "movd\t{$src, $dst|$dst, $src}",
4248 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4249 VEX, Sched<[WriteVecMoveToGpr]>;
4250 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4251 "movd\t{$src, $dst|$dst, $src}",
4252 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4253 Sched<[WriteVecMoveToGpr]>;
4254 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4256 let Predicates = [UseAVX] in {
4257 def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
4258 (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
4259 GR8:$src, sub_8bit)))>;
4260 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4261 (VMOVDI2PDIrr GR32:$src)>;
4263 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4264 (VMOV64toPQIrr GR64:$src)>;
4266 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4267 // These instructions also write zeros in the high part of a 256-bit register.
4268 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4269 (VMOVDI2PDIrm addr:$src)>;
4270 def : Pat<(v8i32 (X86vzload32 addr:$src)),
4271 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4274 let Predicates = [UseSSE2] in {
4275 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4276 (MOVDI2PDIrr GR32:$src)>;
4278 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4279 (MOV64toPQIrr GR64:$src)>;
4280 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4281 (MOVDI2PDIrm addr:$src)>;
4284 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4285 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
4287 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4288 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4289 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4290 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4291 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4292 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4293 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4294 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4295 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4297 //===---------------------------------------------------------------------===//
4298 // SSE2 - Move Quadword
4299 //===---------------------------------------------------------------------===//
4301 //===---------------------------------------------------------------------===//
4302 // Move Quadword Int to Packed Quadword Int
4305 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4306 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4307 "vmovq\t{$src, $dst|$dst, $src}",
4309 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4310 VEX, Requires<[UseAVX]>, WIG;
4311 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4312 "movq\t{$src, $dst|$dst, $src}",
4314 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4315 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4316 } // ExeDomain, SchedRW
4318 //===---------------------------------------------------------------------===//
4319 // Move Packed Quadword Int to Quadword Int
4321 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4322 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4323 "movq\t{$src, $dst|$dst, $src}",
4324 [(store (i64 (extractelt (v2i64 VR128:$src),
4325 (iPTR 0))), addr:$dst)]>,
4327 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4328 "movq\t{$src, $dst|$dst, $src}",
4329 [(store (i64 (extractelt (v2i64 VR128:$src),
4330 (iPTR 0))), addr:$dst)]>;
4331 } // ExeDomain, SchedRW
4333 // For disassembler only
4334 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4335 SchedRW = [SchedWriteVecLogic.XMM] in {
4336 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4337 "movq\t{$src, $dst|$dst, $src}", []>, VEX, WIG;
4338 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4339 "movq\t{$src, $dst|$dst, $src}", []>;
4342 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4343 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4344 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4345 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4347 let Predicates = [UseAVX] in {
4348 def : Pat<(v2i64 (X86vzload64 addr:$src)),
4349 (VMOVQI2PQIrm addr:$src)>;
4350 def : Pat<(v4i64 (X86vzload64 addr:$src)),
4351 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4353 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4354 (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4357 let Predicates = [UseSSE2] in {
4358 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4360 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4361 (MOVPQI2QImr addr:$dst, VR128:$src)>;
4364 //===---------------------------------------------------------------------===//
4365 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4366 // IA32 document. movq xmm1, xmm2 does clear the high bits.
4368 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4369 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4370 "vmovq\t{$src, $dst|$dst, $src}",
4371 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4372 XS, VEX, Requires<[UseAVX]>, WIG;
4373 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4374 "movq\t{$src, $dst|$dst, $src}",
4375 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4376 XS, Requires<[UseSSE2]>;
4377 } // ExeDomain, SchedRW
4379 let Predicates = [UseAVX] in {
4380 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4381 (VMOVZPQILo2PQIrr VR128:$src)>;
4383 let Predicates = [UseSSE2] in {
4384 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4385 (MOVZPQILo2PQIrr VR128:$src)>;
4388 let Predicates = [UseAVX] in {
4389 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4390 (SUBREG_TO_REG (i32 0),
4391 (v2f64 (VMOVZPQILo2PQIrr
4392 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4394 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4395 (SUBREG_TO_REG (i32 0),
4396 (v2i64 (VMOVZPQILo2PQIrr
4397 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4401 //===---------------------------------------------------------------------===//
4402 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4403 //===---------------------------------------------------------------------===//
4405 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4406 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4407 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4408 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4409 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4410 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4412 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4413 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4414 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4415 Sched<[sched.Folded]>;
4418 let Predicates = [HasAVX, NoVLX] in {
4419 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4420 v4f32, VR128, loadv4f32, f128mem,
4421 SchedWriteFShuffle.XMM>, VEX, WIG;
4422 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4423 v4f32, VR128, loadv4f32, f128mem,
4424 SchedWriteFShuffle.XMM>, VEX, WIG;
4425 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4426 v8f32, VR256, loadv8f32, f256mem,
4427 SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
4428 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4429 v8f32, VR256, loadv8f32, f256mem,
4430 SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
4432 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4433 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4434 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4435 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4437 let Predicates = [HasAVX, NoVLX] in {
4438 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4439 (VMOVSHDUPrr VR128:$src)>;
4440 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4441 (VMOVSHDUPrm addr:$src)>;
4442 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4443 (VMOVSLDUPrr VR128:$src)>;
4444 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4445 (VMOVSLDUPrm addr:$src)>;
4446 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4447 (VMOVSHDUPYrr VR256:$src)>;
4448 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4449 (VMOVSHDUPYrm addr:$src)>;
4450 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4451 (VMOVSLDUPYrr VR256:$src)>;
4452 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4453 (VMOVSLDUPYrm addr:$src)>;
4456 let Predicates = [UseSSE3] in {
4457 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4458 (MOVSHDUPrr VR128:$src)>;
4459 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4460 (MOVSHDUPrm addr:$src)>;
4461 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4462 (MOVSLDUPrr VR128:$src)>;
4463 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4464 (MOVSLDUPrm addr:$src)>;
4467 //===---------------------------------------------------------------------===//
4468 // SSE3 - Replicate Double FP - MOVDDUP
4469 //===---------------------------------------------------------------------===//
4471 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4472 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4473 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4474 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4476 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4477 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4480 (scalar_to_vector (loadf64 addr:$src)))))]>,
4481 Sched<[sched.XMM.Folded]>;
4484 // FIXME: Merge with above classes when there are patterns for the ymm version
4485 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4486 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4487 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4488 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4490 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4491 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4493 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4494 Sched<[sched.YMM.Folded]>;
4497 let Predicates = [HasAVX, NoVLX] in {
4498 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4500 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4504 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4507 let Predicates = [HasAVX, NoVLX] in {
4508 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4509 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4512 let Predicates = [UseSSE3] in {
4513 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4514 (MOVDDUPrm addr:$src)>;
4517 //===---------------------------------------------------------------------===//
4518 // SSE3 - Move Unaligned Integer
4519 //===---------------------------------------------------------------------===//
4521 let Predicates = [HasAVX] in {
4522 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4523 "vlddqu\t{$src, $dst|$dst, $src}",
4524 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4525 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
4526 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4527 "vlddqu\t{$src, $dst|$dst, $src}",
4528 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4529 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, WIG;
4532 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4533 "lddqu\t{$src, $dst|$dst, $src}",
4534 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4535 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4537 //===---------------------------------------------------------------------===//
4538 // SSE3 - Arithmetic
4539 //===---------------------------------------------------------------------===//
4541 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4542 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4543 PatFrag ld_frag, bit Is2Addr = 1> {
4544 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4545 def rr : I<0xD0, MRMSrcReg,
4546 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4548 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4549 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4550 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4552 def rm : I<0xD0, MRMSrcMem,
4553 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4555 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4556 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4557 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4558 Sched<[sched.Folded, sched.ReadAfterFold]>;
4562 let Predicates = [HasAVX] in {
4563 let ExeDomain = SSEPackedSingle in {
4564 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4565 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4567 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4568 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4569 XD, VEX_4V, VEX_L, WIG;
4571 let ExeDomain = SSEPackedDouble in {
4572 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4573 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4575 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4576 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4577 PD, VEX_4V, VEX_L, WIG;
4580 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4581 let ExeDomain = SSEPackedSingle in
4582 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4583 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4584 let ExeDomain = SSEPackedDouble in
4585 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4586 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4589 //===---------------------------------------------------------------------===//
4590 // SSE3 Instructions
4591 //===---------------------------------------------------------------------===//
4594 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4595 X86MemOperand x86memop, SDNode OpNode,
4596 X86FoldableSchedWrite sched, PatFrag ld_frag,
4598 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4599 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4601 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4602 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4603 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4606 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4608 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4609 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4610 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4611 Sched<[sched.Folded, sched.ReadAfterFold]>;
4614 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4615 X86MemOperand x86memop, SDNode OpNode,
4616 X86FoldableSchedWrite sched, PatFrag ld_frag,
4618 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4619 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4621 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4622 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4623 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4626 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4628 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4629 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4630 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4631 Sched<[sched.Folded, sched.ReadAfterFold]>;
4635 let Predicates = [HasAVX] in {
4636 let ExeDomain = SSEPackedSingle in {
4637 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4638 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
4639 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4640 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
4641 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4642 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
4643 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4644 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
4646 let ExeDomain = SSEPackedDouble in {
4647 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4648 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
4649 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4650 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
4651 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4652 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
4653 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4654 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
4658 let Constraints = "$src1 = $dst" in {
4659 let ExeDomain = SSEPackedSingle in {
4660 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4661 WriteFHAdd, memopv4f32>;
4662 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4663 WriteFHAdd, memopv4f32>;
4665 let ExeDomain = SSEPackedDouble in {
4666 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4667 WriteFHAdd, memopv2f64>;
4668 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4669 WriteFHAdd, memopv2f64>;
4673 //===---------------------------------------------------------------------===//
4674 // SSSE3 - Packed Absolute Instructions
4675 //===---------------------------------------------------------------------===//
4677 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4678 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4679 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4680 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4682 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4683 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4686 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4688 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4690 (vt (OpNode (ld_frag addr:$src))))]>,
4691 Sched<[sched.XMM.Folded]>;
4694 /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4695 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4696 SDNode OpNode, X86SchedWriteWidths sched> {
4697 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4699 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4700 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4703 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4705 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4707 (vt (OpNode (load addr:$src))))]>,
4708 Sched<[sched.YMM.Folded]>;
4711 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4712 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4714 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4717 let Predicates = [HasAVX, NoVLX] in {
4718 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4721 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4722 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4724 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4727 let Predicates = [HasAVX2, NoVLX] in {
4728 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4732 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4734 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4736 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4739 //===---------------------------------------------------------------------===//
4740 // SSSE3 - Packed Binary Operator Instructions
4741 //===---------------------------------------------------------------------===//
4743 /// SS3I_binop_rm - Simple SSSE3 bin op
4744 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4745 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4746 PatFrag memop_frag, X86MemOperand x86memop,
4747 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4748 let isCommutable = 1 in
4749 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4750 (ins RC:$src1, RC:$src2),
4752 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4753 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4754 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4756 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4757 (ins RC:$src1, x86memop:$src2),
4759 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4760 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4762 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4763 Sched<[sched.Folded, sched.ReadAfterFold]>;
4766 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
4767 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4768 Intrinsic IntId128, X86FoldableSchedWrite sched,
4769 PatFrag ld_frag, bit Is2Addr = 1> {
4770 let isCommutable = 1 in
4771 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4772 (ins VR128:$src1, VR128:$src2),
4774 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4775 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4776 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4778 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4779 (ins VR128:$src1, i128mem:$src2),
4781 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4782 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4784 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4785 Sched<[sched.Folded, sched.ReadAfterFold]>;
4788 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4790 X86FoldableSchedWrite sched> {
4791 let isCommutable = 1 in
4792 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4793 (ins VR256:$src1, VR256:$src2),
4794 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4795 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4797 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4798 (ins VR256:$src1, i256mem:$src2),
4799 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4801 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4802 Sched<[sched.Folded, sched.ReadAfterFold]>;
4805 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4806 let isCommutable = 0 in {
4807 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4808 VR128, load, i128mem,
4809 SchedWriteVarShuffle.XMM, 0>, VEX_4V, WIG;
4810 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4811 v16i8, VR128, load, i128mem,
4812 SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
4814 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4815 VR128, load, i128mem,
4816 SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
4819 let ImmT = NoImm, Predicates = [HasAVX] in {
4820 let isCommutable = 0 in {
4821 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4823 SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4824 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4826 SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4827 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4829 SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4830 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4832 SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4833 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4834 int_x86_ssse3_psign_b_128,
4835 SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4836 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4837 int_x86_ssse3_psign_w_128,
4838 SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4839 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4840 int_x86_ssse3_psign_d_128,
4841 SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4842 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4843 int_x86_ssse3_phadd_sw_128,
4844 SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
4845 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4846 int_x86_ssse3_phsub_sw_128,
4847 SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
4851 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4852 let isCommutable = 0 in {
4853 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4854 VR256, load, i256mem,
4855 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
4856 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4857 v32i8, VR256, load, i256mem,
4858 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
4860 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4861 VR256, load, i256mem,
4862 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
4865 let ImmT = NoImm, Predicates = [HasAVX2] in {
4866 let isCommutable = 0 in {
4867 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4868 VR256, load, i256mem,
4869 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4870 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4872 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4873 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4874 VR256, load, i256mem,
4875 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4876 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4878 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4879 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4880 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4881 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4882 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4883 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4884 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4885 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4886 int_x86_avx2_phadd_sw,
4887 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
4888 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4889 int_x86_avx2_phsub_sw,
4890 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
4894 // None of these have i8 immediate fields.
4895 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4896 let isCommutable = 0 in {
4897 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4898 memop, i128mem, SchedWritePHAdd.XMM>;
4899 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4900 memop, i128mem, SchedWritePHAdd.XMM>;
4901 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4902 memop, i128mem, SchedWritePHAdd.XMM>;
4903 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4904 memop, i128mem, SchedWritePHAdd.XMM>;
4905 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4906 SchedWriteVecALU.XMM, memop>;
4907 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4908 SchedWriteVecALU.XMM, memop>;
4909 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4910 SchedWriteVecALU.XMM, memop>;
4911 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4912 memop, i128mem, SchedWriteVarShuffle.XMM>;
4913 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4914 int_x86_ssse3_phadd_sw_128,
4915 SchedWritePHAdd.XMM, memop>;
4916 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4917 int_x86_ssse3_phsub_sw_128,
4918 SchedWritePHAdd.XMM, memop>;
4919 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4920 v16i8, VR128, memop, i128mem,
4921 SchedWriteVecIMul.XMM>;
4923 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4924 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4927 //===---------------------------------------------------------------------===//
4928 // SSSE3 - Packed Align Instruction Patterns
4929 //===---------------------------------------------------------------------===//
4931 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4932 PatFrag memop_frag, X86MemOperand x86memop,
4933 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4934 let hasSideEffects = 0 in {
4935 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4936 (ins RC:$src1, RC:$src2, u8imm:$src3),
4938 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4940 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4941 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4944 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4945 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4947 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4949 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4950 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4951 (memop_frag addr:$src2),
4952 (i8 timm:$src3))))]>,
4953 Sched<[sched.Folded, sched.ReadAfterFold]>;
4957 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4958 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4959 SchedWriteShuffle.XMM, 0>, VEX_4V, WIG;
4960 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4961 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4962 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
4963 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4964 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4965 SchedWriteShuffle.XMM>;
4967 //===---------------------------------------------------------------------===//
4968 // SSSE3 - Thread synchronization
4969 //===---------------------------------------------------------------------===//
4971 let SchedRW = [WriteSystem] in {
4972 let Uses = [EAX, ECX, EDX] in
4973 def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4974 TB, Requires<[HasSSE3, Not64BitMode]>;
4975 let Uses = [RAX, ECX, EDX] in
4976 def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4977 TB, Requires<[HasSSE3, In64BitMode]>;
4979 let Uses = [ECX, EAX] in
4980 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
4981 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4984 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4985 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4987 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4988 Requires<[Not64BitMode]>;
4989 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4990 Requires<[In64BitMode]>;
4992 //===----------------------------------------------------------------------===//
4993 // SSE4.1 - Packed Move with Sign/Zero Extend
4994 // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4995 //===----------------------------------------------------------------------===//
4997 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4998 RegisterClass OutRC, RegisterClass InRC,
4999 X86FoldableSchedWrite sched> {
5000 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
5001 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5004 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
5005 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5006 Sched<[sched.Folded]>;
5009 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
5010 X86MemOperand MemOp, X86MemOperand MemYOp,
5012 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5013 SchedWriteShuffle.XMM>;
5014 let Predicates = [HasAVX, prd] in
5015 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5016 VR128, VR128, SchedWriteVecExtend.XMM>,
5018 let Predicates = [HasAVX2, prd] in
5019 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5020 VR256, VR128, SchedWriteVecExtend.YMM>,
5024 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5025 X86MemOperand MemYOp, Predicate prd> {
5026 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5027 MemOp, MemYOp, prd>;
5028 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5029 !strconcat("pmovzx", OpcodeStr),
5030 MemOp, MemYOp, prd>;
5033 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5034 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5035 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5037 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5038 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5040 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5043 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5044 SDNode ExtOp, SDNode InVecOp> {
5045 // Register-Register patterns
5046 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5047 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5048 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5050 let Predicates = [HasAVX2, NoVLX] in {
5051 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5052 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5053 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5054 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5056 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5057 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5058 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5059 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5061 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5062 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5065 // Simple Register-Memory patterns
5066 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5067 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5068 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5070 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5071 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5074 let Predicates = [HasAVX2, NoVLX] in {
5075 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5076 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5077 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5078 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5080 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5081 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5082 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5083 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5085 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5086 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5089 // AVX2 Register-Memory patterns
5090 let Predicates = [HasAVX2, NoVLX] in {
5091 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5092 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5094 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5095 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5096 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5097 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5098 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5099 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5101 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5102 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5104 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5105 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5106 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5107 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5109 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5110 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5111 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5112 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5113 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5114 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5118 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5119 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5121 // SSE4.1/AVX patterns.
5122 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5124 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5125 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5126 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5128 let Predicates = [HasAVX, NoVLX] in {
5129 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5130 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5131 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5132 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5134 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5135 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5136 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5137 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5139 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5140 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5142 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5143 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5144 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5146 let Predicates = [HasAVX, NoVLX] in {
5147 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5148 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5149 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5150 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5152 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5153 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5154 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5155 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5157 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5158 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5160 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5161 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5162 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5163 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5164 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5165 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5166 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5167 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5168 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5170 let Predicates = [HasAVX, NoVLX] in {
5171 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5172 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5173 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5174 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5175 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5176 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5178 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5179 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5180 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5181 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5183 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5184 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5185 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5186 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5187 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5188 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5189 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5190 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5192 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5193 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5194 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5195 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5196 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5197 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5199 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5200 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5201 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5202 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5203 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5204 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5205 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5206 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5210 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5211 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5213 let Predicates = [UseSSE41] in {
5214 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5215 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5218 //===----------------------------------------------------------------------===//
5219 // SSE4.1 - Extract Instructions
5220 //===----------------------------------------------------------------------===//
5222 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5223 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5224 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5225 (ins VR128:$src1, u8imm:$src2),
5226 !strconcat(OpcodeStr,
5227 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5228 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5230 Sched<[WriteVecExtract]>;
5231 let hasSideEffects = 0, mayStore = 1 in
5232 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5233 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5234 !strconcat(OpcodeStr,
5235 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5236 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
5237 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5240 let Predicates = [HasAVX, NoBWI] in
5241 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, WIG;
5243 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5246 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5247 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5248 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5249 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5250 (ins VR128:$src1, u8imm:$src2),
5251 !strconcat(OpcodeStr,
5252 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5253 Sched<[WriteVecExtract]>;
5255 let hasSideEffects = 0, mayStore = 1 in
5256 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5257 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5258 !strconcat(OpcodeStr,
5259 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5260 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
5261 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5264 let Predicates = [HasAVX, NoBWI] in
5265 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, WIG;
5267 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5269 let Predicates = [UseSSE41] in
5270 def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5272 let Predicates = [HasAVX, NoBWI] in
5273 def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5276 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5277 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5278 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5279 (ins VR128:$src1, u8imm:$src2),
5280 !strconcat(OpcodeStr,
5281 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5283 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5284 Sched<[WriteVecExtract]>;
5285 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5286 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5287 !strconcat(OpcodeStr,
5288 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5289 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5290 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5293 let Predicates = [HasAVX, NoDQI] in
5294 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5296 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5298 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5299 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5300 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5301 (ins VR128:$src1, u8imm:$src2),
5302 !strconcat(OpcodeStr,
5303 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5305 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5306 Sched<[WriteVecExtract]>;
5307 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5308 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5309 !strconcat(OpcodeStr,
5310 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5311 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5312 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5315 let Predicates = [HasAVX, NoDQI] in
5316 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, REX_W;
5318 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5320 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5322 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5323 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5324 (ins VR128:$src1, u8imm:$src2),
5325 !strconcat(OpcodeStr,
5326 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5327 [(set GR32orGR64:$dst,
5328 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5329 Sched<[WriteVecExtract]>;
5330 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5331 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5332 !strconcat(OpcodeStr,
5333 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5334 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5335 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5338 let ExeDomain = SSEPackedSingle in {
5339 let Predicates = [UseAVX] in
5340 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, WIG;
5341 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5344 //===----------------------------------------------------------------------===//
5345 // SSE4.1 - Insert Instructions
5346 //===----------------------------------------------------------------------===//
5348 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5349 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5350 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5352 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5354 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5356 (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
5357 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5358 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5359 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5361 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5363 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5365 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
5366 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5369 let Predicates = [HasAVX, NoBWI] in {
5370 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, WIG;
5371 def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
5372 (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
5373 GR8:$src2, sub_8bit), timm:$src3)>;
5376 let Constraints = "$src1 = $dst" in
5377 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
5379 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5380 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5381 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5383 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5385 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5387 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5388 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5389 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5390 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5392 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5394 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5396 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5397 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5400 let Predicates = [HasAVX, NoDQI] in
5401 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5402 let Constraints = "$src1 = $dst" in
5403 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5405 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5406 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5407 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5409 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5411 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5413 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5414 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5415 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5416 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5418 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5420 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5422 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5423 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5426 let Predicates = [HasAVX, NoDQI] in
5427 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, REX_W;
5428 let Constraints = "$src1 = $dst" in
5429 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5431 // insertps has a few different modes, there's the first two here below which
5432 // are optimized inserts that won't zero arbitrary elements in the destination
5433 // vector. The next one matches the intrinsic and could zero arbitrary elements
5434 // in the target vector.
5435 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5436 let isCommutable = 1 in
5437 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5438 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5440 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5442 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5444 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5445 Sched<[SchedWriteFShuffle.XMM]>;
5446 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5447 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5449 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5451 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5453 (X86insertps VR128:$src1,
5454 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5456 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5459 let ExeDomain = SSEPackedSingle in {
5460 let Predicates = [UseAVX] in
5461 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5463 let Constraints = "$src1 = $dst" in
5464 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5467 //===----------------------------------------------------------------------===//
5468 // SSE4.1 - Round Instructions
5469 //===----------------------------------------------------------------------===//
5471 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5472 X86MemOperand x86memop, RegisterClass RC,
5473 ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
5474 X86FoldableSchedWrite sched> {
5475 // Intrinsic operation, reg.
5476 // Vector intrinsic operation, reg
5477 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5478 def r : SS4AIi8<opc, MRMSrcReg,
5479 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5480 !strconcat(OpcodeStr,
5481 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5482 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5485 // Vector intrinsic operation, mem
5486 def m : SS4AIi8<opc, MRMSrcMem,
5487 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5488 !strconcat(OpcodeStr,
5489 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5491 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5492 Sched<[sched.Folded]>;
5496 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5497 string OpcodeStr, X86FoldableSchedWrite sched> {
5498 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5499 def SSr : SS4AIi8<opcss, MRMSrcReg,
5500 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5501 !strconcat(OpcodeStr,
5502 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5503 []>, Sched<[sched]>;
5506 def SSm : SS4AIi8<opcss, MRMSrcMem,
5507 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5508 !strconcat(OpcodeStr,
5509 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5510 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5511 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5513 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5514 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5515 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5516 !strconcat(OpcodeStr,
5517 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5518 []>, Sched<[sched]>;
5521 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5522 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5523 !strconcat(OpcodeStr,
5524 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5525 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5526 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5529 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5530 string OpcodeStr, X86FoldableSchedWrite sched> {
5531 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5532 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5533 def SSr : SS4AIi8<opcss, MRMSrcReg,
5534 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5535 !strconcat(OpcodeStr,
5536 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5537 []>, Sched<[sched]>;
5540 def SSm : SS4AIi8<opcss, MRMSrcMem,
5541 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5542 !strconcat(OpcodeStr,
5543 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5544 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5545 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5547 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5548 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5549 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5550 !strconcat(OpcodeStr,
5551 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5552 []>, Sched<[sched]>;
5555 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5556 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5557 !strconcat(OpcodeStr,
5558 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5559 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5560 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5564 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5565 string OpcodeStr, X86FoldableSchedWrite sched,
5566 ValueType VT32, ValueType VT64,
5567 SDNode OpNode, bit Is2Addr = 1> {
5568 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5569 let ExeDomain = SSEPackedSingle in {
5570 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5571 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5573 !strconcat(OpcodeStr,
5574 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5575 !strconcat(OpcodeStr,
5576 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5577 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5580 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5581 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5583 !strconcat(OpcodeStr,
5584 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5585 !strconcat(OpcodeStr,
5586 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5588 (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5589 Sched<[sched.Folded, sched.ReadAfterFold]>;
5590 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5592 let ExeDomain = SSEPackedDouble in {
5593 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5594 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5596 !strconcat(OpcodeStr,
5597 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5598 !strconcat(OpcodeStr,
5599 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5600 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5603 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5604 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5606 !strconcat(OpcodeStr,
5607 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5608 !strconcat(OpcodeStr,
5609 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5611 (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5612 Sched<[sched.Folded, sched.ReadAfterFold]>;
5613 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5617 // FP round - roundss, roundps, roundsd, roundpd
5618 let Predicates = [HasAVX, NoVLX] in {
5619 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5621 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5622 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5624 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5625 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5629 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5630 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5631 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5633 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5634 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5638 let Predicates = [UseAVX] in {
5639 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5640 v4f32, v2f64, X86RndScales, 0>,
5641 VEX_4V, VEX_LIG, WIG, SIMD_EXC;
5642 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5643 VEX_4V, VEX_LIG, WIG, SIMD_EXC;
5646 let Predicates = [UseAVX] in {
5647 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5648 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5649 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5650 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5653 let Predicates = [UseAVX, OptForSize] in {
5654 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5655 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5656 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5657 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5660 let ExeDomain = SSEPackedSingle in
5661 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5662 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5663 let ExeDomain = SSEPackedDouble in
5664 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5665 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5667 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5669 let Constraints = "$src1 = $dst" in
5670 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5671 v4f32, v2f64, X86RndScales>;
5673 let Predicates = [UseSSE41] in {
5674 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5675 (ROUNDSSr FR32:$src1, timm:$src2)>;
5676 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5677 (ROUNDSDr FR64:$src1, timm:$src2)>;
5680 let Predicates = [UseSSE41, OptForSize] in {
5681 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5682 (ROUNDSSm addr:$src1, timm:$src2)>;
5683 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5684 (ROUNDSDm addr:$src1, timm:$src2)>;
5687 //===----------------------------------------------------------------------===//
5688 // SSE4.1 - Packed Bit Test
5689 //===----------------------------------------------------------------------===//
5691 // ptest instruction we'll lower to this in X86ISelLowering primarily from
5692 // the intel intrinsic that corresponds to this.
5693 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5694 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5695 "vptest\t{$src2, $src1|$src1, $src2}",
5696 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5697 Sched<[SchedWriteVecTest.XMM]>, VEX, WIG;
5698 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5699 "vptest\t{$src2, $src1|$src1, $src2}",
5700 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5701 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5704 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5705 "vptest\t{$src2, $src1|$src1, $src2}",
5706 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5707 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, WIG;
5708 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5709 "vptest\t{$src2, $src1|$src1, $src2}",
5710 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5711 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5715 let Defs = [EFLAGS] in {
5716 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5717 "ptest\t{$src2, $src1|$src1, $src2}",
5718 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5719 Sched<[SchedWriteVecTest.XMM]>;
5720 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5721 "ptest\t{$src2, $src1|$src1, $src2}",
5722 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5723 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5726 // The bit test instructions below are AVX only
5727 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5728 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5729 X86FoldableSchedWrite sched> {
5730 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5731 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5732 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5733 Sched<[sched]>, VEX;
5734 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5735 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5736 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5737 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5740 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5741 let ExeDomain = SSEPackedSingle in {
5742 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5743 SchedWriteFTest.XMM>;
5744 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5745 SchedWriteFTest.YMM>, VEX_L;
5747 let ExeDomain = SSEPackedDouble in {
5748 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5749 SchedWriteFTest.XMM>;
5750 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5751 SchedWriteFTest.YMM>, VEX_L;
5755 //===----------------------------------------------------------------------===//
5756 // SSE4.1 - Misc Instructions
5757 //===----------------------------------------------------------------------===//
5759 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5760 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5761 "popcnt{w}\t{$src, $dst|$dst, $src}",
5762 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5763 Sched<[WritePOPCNT]>, OpSize16, XS;
5764 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5765 "popcnt{w}\t{$src, $dst|$dst, $src}",
5766 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5767 (implicit EFLAGS)]>,
5768 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5770 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5771 "popcnt{l}\t{$src, $dst|$dst, $src}",
5772 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5773 Sched<[WritePOPCNT]>, OpSize32, XS;
5775 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5776 "popcnt{l}\t{$src, $dst|$dst, $src}",
5777 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5778 (implicit EFLAGS)]>,
5779 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5781 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5782 "popcnt{q}\t{$src, $dst|$dst, $src}",
5783 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5784 Sched<[WritePOPCNT]>, XS;
5785 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5786 "popcnt{q}\t{$src, $dst|$dst, $src}",
5787 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5788 (implicit EFLAGS)]>,
5789 Sched<[WritePOPCNT.Folded]>, XS;
5792 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
5793 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5794 SDNode OpNode, PatFrag ld_frag,
5795 X86FoldableSchedWrite Sched> {
5796 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5798 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5799 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5801 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5803 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5805 (v8i16 (OpNode (ld_frag addr:$src))))]>,
5806 Sched<[Sched.Folded]>;
5809 // PHMIN has the same profile as PSAD, thus we use the same scheduling
5810 // model, although the naming is misleading.
5811 let Predicates = [HasAVX] in
5812 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5814 WritePHMINPOS>, VEX, WIG;
5815 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5819 /// SS48I_binop_rm - Simple SSE41 binary operator.
5820 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5821 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5822 X86MemOperand x86memop, X86FoldableSchedWrite sched,
5824 let isCommutable = 1 in
5825 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5826 (ins RC:$src1, RC:$src2),
5828 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5829 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5830 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5832 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5833 (ins RC:$src1, x86memop:$src2),
5835 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5836 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5838 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5839 Sched<[sched.Folded, sched.ReadAfterFold]>;
5842 let Predicates = [HasAVX, NoVLX] in {
5843 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5844 load, i128mem, SchedWriteVecALU.XMM, 0>,
5846 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5847 load, i128mem, SchedWriteVecALU.XMM, 0>,
5849 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5850 load, i128mem, SchedWriteVecALU.XMM, 0>,
5852 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5853 load, i128mem, SchedWriteVecALU.XMM, 0>,
5855 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5856 load, i128mem, SchedWriteVecIMul.XMM, 0>,
5859 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5860 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5861 load, i128mem, SchedWriteVecALU.XMM, 0>,
5863 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5864 load, i128mem, SchedWriteVecALU.XMM, 0>,
5866 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5867 load, i128mem, SchedWriteVecALU.XMM, 0>,
5869 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5870 load, i128mem, SchedWriteVecALU.XMM, 0>,
5874 let Predicates = [HasAVX2, NoVLX] in {
5875 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5876 load, i256mem, SchedWriteVecALU.YMM, 0>,
5878 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5879 load, i256mem, SchedWriteVecALU.YMM, 0>,
5881 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5882 load, i256mem, SchedWriteVecALU.YMM, 0>,
5884 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5885 load, i256mem, SchedWriteVecALU.YMM, 0>,
5887 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5888 load, i256mem, SchedWriteVecIMul.YMM, 0>,
5891 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5892 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5893 load, i256mem, SchedWriteVecALU.YMM, 0>,
5895 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5896 load, i256mem, SchedWriteVecALU.YMM, 0>,
5898 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5899 load, i256mem, SchedWriteVecALU.YMM, 0>,
5901 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5902 load, i256mem, SchedWriteVecALU.YMM, 0>,
5906 let Constraints = "$src1 = $dst" in {
5907 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5908 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5909 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5910 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5911 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5912 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5913 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5914 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5915 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5916 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5917 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5918 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5919 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5920 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5921 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5922 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5923 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5924 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5927 let Predicates = [HasAVX, NoVLX] in
5928 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5929 load, i128mem, SchedWritePMULLD.XMM, 0>,
5931 let Predicates = [HasAVX] in
5932 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5933 load, i128mem, SchedWriteVecALU.XMM, 0>,
5936 let Predicates = [HasAVX2, NoVLX] in
5937 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5938 load, i256mem, SchedWritePMULLD.YMM, 0>,
5940 let Predicates = [HasAVX2] in
5941 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5942 load, i256mem, SchedWriteVecALU.YMM, 0>,
5945 let Constraints = "$src1 = $dst" in {
5946 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5947 memop, i128mem, SchedWritePMULLD.XMM, 1>;
5948 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5949 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5952 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
5953 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5954 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5955 X86MemOperand x86memop, bit Is2Addr,
5956 X86FoldableSchedWrite sched> {
5957 let isCommutable = 1 in
5958 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5959 (ins RC:$src1, RC:$src2, u8imm:$src3),
5961 !strconcat(OpcodeStr,
5962 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5963 !strconcat(OpcodeStr,
5964 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5965 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5967 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5968 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5970 !strconcat(OpcodeStr,
5971 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5972 !strconcat(OpcodeStr,
5973 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5975 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5976 Sched<[sched.Folded, sched.ReadAfterFold]>;
5979 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
5980 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5981 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5982 X86MemOperand x86memop, bit Is2Addr,
5983 X86FoldableSchedWrite sched> {
5984 let isCommutable = 1 in
5985 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5986 (ins RC:$src1, RC:$src2, u8imm:$src3),
5988 !strconcat(OpcodeStr,
5989 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5990 !strconcat(OpcodeStr,
5991 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5992 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5994 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5995 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5997 !strconcat(OpcodeStr,
5998 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5999 !strconcat(OpcodeStr,
6000 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6002 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6003 Sched<[sched.Folded, sched.ReadAfterFold]>;
6006 def BlendCommuteImm2 : SDNodeXForm<timm, [{
6007 uint8_t Imm = N->getZExtValue() & 0x03;
6008 return getI8Imm(Imm ^ 0x03, SDLoc(N));
6011 def BlendCommuteImm4 : SDNodeXForm<timm, [{
6012 uint8_t Imm = N->getZExtValue() & 0x0f;
6013 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
6016 def BlendCommuteImm8 : SDNodeXForm<timm, [{
6017 uint8_t Imm = N->getZExtValue() & 0xff;
6018 return getI8Imm(Imm ^ 0xff, SDLoc(N));
6021 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
6022 def BlendScaleImm4 : SDNodeXForm<timm, [{
6023 uint8_t Imm = N->getZExtValue();
6025 for (unsigned i = 0; i != 4; ++i) {
6027 NewImm |= 0x3 << (i * 2);
6029 return getI8Imm(NewImm, SDLoc(N));
6032 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
6033 def BlendScaleImm2 : SDNodeXForm<timm, [{
6034 uint8_t Imm = N->getZExtValue();
6036 for (unsigned i = 0; i != 2; ++i) {
6038 NewImm |= 0xf << (i * 4);
6040 return getI8Imm(NewImm, SDLoc(N));
6043 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
6044 def BlendScaleImm2to4 : SDNodeXForm<timm, [{
6045 uint8_t Imm = N->getZExtValue();
6047 for (unsigned i = 0; i != 2; ++i) {
6049 NewImm |= 0x3 << (i * 2);
6051 return getI8Imm(NewImm, SDLoc(N));
6054 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
6055 def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
6056 uint8_t Imm = N->getZExtValue();
6058 for (unsigned i = 0; i != 4; ++i) {
6060 NewImm |= 0x3 << (i * 2);
6062 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6065 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
6066 def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
6067 uint8_t Imm = N->getZExtValue();
6069 for (unsigned i = 0; i != 2; ++i) {
6071 NewImm |= 0xf << (i * 4);
6073 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6076 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
6077 def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
6078 uint8_t Imm = N->getZExtValue();
6080 for (unsigned i = 0; i != 2; ++i) {
6082 NewImm |= 0x3 << (i * 2);
6084 return getI8Imm(NewImm ^ 0xf, SDLoc(N));
6087 let Predicates = [HasAVX] in {
6088 let isCommutable = 0 in {
6089 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6090 VR128, load, i128mem, 0,
6091 SchedWriteMPSAD.XMM>, VEX_4V, WIG;
6094 let Uses = [MXCSR], mayRaiseFPException = 1 in {
6095 let ExeDomain = SSEPackedSingle in
6096 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6097 VR128, load, f128mem, 0,
6098 SchedWriteDPPS.XMM>, VEX_4V, WIG;
6099 let ExeDomain = SSEPackedDouble in
6100 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6101 VR128, load, f128mem, 0,
6102 SchedWriteDPPD.XMM>, VEX_4V, WIG;
6103 let ExeDomain = SSEPackedSingle in
6104 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6105 VR256, load, i256mem, 0,
6106 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, WIG;
6110 let Predicates = [HasAVX2] in {
6111 let isCommutable = 0 in {
6112 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6113 VR256, load, i256mem, 0,
6114 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, WIG;
6118 let Constraints = "$src1 = $dst" in {
6119 let isCommutable = 0 in {
6120 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6121 VR128, memop, i128mem, 1,
6122 SchedWriteMPSAD.XMM>;
6125 let ExeDomain = SSEPackedSingle in
6126 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6127 VR128, memop, f128mem, 1,
6128 SchedWriteDPPS.XMM>, SIMD_EXC;
6129 let ExeDomain = SSEPackedDouble in
6130 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6131 VR128, memop, f128mem, 1,
6132 SchedWriteDPPD.XMM>, SIMD_EXC;
6135 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
6136 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6137 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6138 X86MemOperand x86memop, bit Is2Addr, Domain d,
6139 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6140 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6141 let isCommutable = 1 in
6142 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6143 (ins RC:$src1, RC:$src2, u8imm:$src3),
6145 !strconcat(OpcodeStr,
6146 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6147 !strconcat(OpcodeStr,
6148 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6149 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6151 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6152 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6154 !strconcat(OpcodeStr,
6155 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6156 !strconcat(OpcodeStr,
6157 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6159 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6160 Sched<[sched.Folded, sched.ReadAfterFold]>;
6163 // Pattern to commute if load is in first source.
6164 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6165 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6166 (commuteXForm timm:$src3))>;
6169 let Predicates = [HasAVX] in {
6170 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6171 VR128, load, f128mem, 0, SSEPackedSingle,
6172 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6174 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6175 VR256, load, f256mem, 0, SSEPackedSingle,
6176 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6178 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6179 VR128, load, f128mem, 0, SSEPackedDouble,
6180 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6182 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6183 VR256, load, f256mem, 0, SSEPackedDouble,
6184 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6186 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6187 VR128, load, i128mem, 0, SSEPackedInt,
6188 SchedWriteBlend.XMM, BlendCommuteImm8>,
6192 let Predicates = [HasAVX2] in {
6193 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6194 VR256, load, i256mem, 0, SSEPackedInt,
6195 SchedWriteBlend.YMM, BlendCommuteImm8>,
6199 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6200 // ExecutionDomainFixPass will cleanup domains later on.
6201 let Predicates = [HasAVX1Only] in {
6202 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6203 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6204 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6205 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6206 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6207 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6209 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6210 // it from becoming movsd via commuting under optsize.
6211 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6212 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6213 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6214 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6215 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6216 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6218 def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6219 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6220 def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6221 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6222 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6223 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6225 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6226 // it from becoming movss via commuting under optsize.
6227 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6228 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6229 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6230 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6231 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6232 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6235 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6236 VR128, memop, f128mem, 1, SSEPackedSingle,
6237 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6238 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6239 VR128, memop, f128mem, 1, SSEPackedDouble,
6240 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6241 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6242 VR128, memop, i128mem, 1, SSEPackedInt,
6243 SchedWriteBlend.XMM, BlendCommuteImm8>;
6245 let Predicates = [UseSSE41] in {
6246 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6247 // it from becoming movss via commuting under optsize.
6248 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6249 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6250 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6251 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6252 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6253 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6255 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6256 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6257 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6258 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6259 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6260 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6263 // For insertion into the zero index (low half) of a 256-bit vector, it is
6264 // more efficient to generate a blend with immediate instead of an insert*128.
6265 let Predicates = [HasAVX] in {
6266 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6267 (VBLENDPDYrri VR256:$src1,
6268 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6269 VR128:$src2, sub_xmm), 0x3)>;
6270 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6271 (VBLENDPSYrri VR256:$src1,
6272 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6273 VR128:$src2, sub_xmm), 0xf)>;
6275 def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6276 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6277 VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6278 def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6279 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6280 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6283 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
6284 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6285 X86MemOperand x86memop, ValueType VT,
6286 PatFrag mem_frag, SDNode OpNode,
6287 X86FoldableSchedWrite sched> {
6288 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6289 (ins RC:$src1, RC:$src2, RC:$src3),
6290 !strconcat(OpcodeStr,
6291 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6292 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6293 SSEPackedInt>, TAPD, VEX_4V,
6296 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6297 (ins RC:$src1, x86memop:$src2, RC:$src3),
6298 !strconcat(OpcodeStr,
6299 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6301 (OpNode RC:$src3, (mem_frag addr:$src2),
6302 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6303 Sched<[sched.Folded, sched.ReadAfterFold,
6305 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6308 sched.ReadAfterFold]>;
6311 let Predicates = [HasAVX] in {
6312 let ExeDomain = SSEPackedDouble in {
6313 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6314 v2f64, loadv2f64, X86Blendv,
6315 SchedWriteFVarBlend.XMM>;
6316 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6317 v4f64, loadv4f64, X86Blendv,
6318 SchedWriteFVarBlend.YMM>, VEX_L;
6319 } // ExeDomain = SSEPackedDouble
6320 let ExeDomain = SSEPackedSingle in {
6321 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6322 v4f32, loadv4f32, X86Blendv,
6323 SchedWriteFVarBlend.XMM>;
6324 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6325 v8f32, loadv8f32, X86Blendv,
6326 SchedWriteFVarBlend.YMM>, VEX_L;
6327 } // ExeDomain = SSEPackedSingle
6328 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6329 v16i8, loadv16i8, X86Blendv,
6330 SchedWriteVarBlend.XMM>;
6333 let Predicates = [HasAVX2] in {
6334 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6335 v32i8, loadv32i8, X86Blendv,
6336 SchedWriteVarBlend.YMM>, VEX_L;
6339 let Predicates = [HasAVX] in {
6340 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6341 (v4i32 VR128:$src2))),
6342 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6343 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6344 (v2i64 VR128:$src2))),
6345 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6346 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6347 (v8i32 VR256:$src2))),
6348 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6349 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6350 (v4i64 VR256:$src2))),
6351 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6354 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6355 // changed to use blends because blends have better throughput on sandybridge
6356 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6357 let Predicates = [HasAVX, OptForSpeed] in {
6358 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6359 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6360 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6361 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6363 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6364 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6365 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6366 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6367 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6368 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6370 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6371 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6372 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6373 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6374 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6375 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6377 // Move low f32 and clear high bits.
6378 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6379 (SUBREG_TO_REG (i32 0),
6380 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6381 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6382 (i8 1))), sub_xmm)>;
6383 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6384 (SUBREG_TO_REG (i32 0),
6385 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6386 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6387 (i8 3))), sub_xmm)>;
6390 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6391 // changed to use blends because blends have better throughput on sandybridge
6392 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6393 let Predicates = [UseSSE41, OptForSpeed] in {
6394 // With SSE41 we can use blends for these patterns.
6395 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6396 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6397 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6398 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6400 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6401 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6402 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6403 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6404 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6405 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6407 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6408 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6409 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6410 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6411 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6412 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6416 /// SS41I_ternary - SSE 4.1 ternary operator
6417 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6418 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6419 PatFrag mem_frag, X86MemOperand x86memop,
6420 SDNode OpNode, X86FoldableSchedWrite sched> {
6421 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6422 (ins VR128:$src1, VR128:$src2),
6423 !strconcat(OpcodeStr,
6424 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6426 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6429 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6430 (ins VR128:$src1, x86memop:$src2),
6431 !strconcat(OpcodeStr,
6432 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6434 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6435 Sched<[sched.Folded, sched.ReadAfterFold]>;
6439 let ExeDomain = SSEPackedDouble in
6440 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6441 X86Blendv, SchedWriteFVarBlend.XMM>;
6442 let ExeDomain = SSEPackedSingle in
6443 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6444 X86Blendv, SchedWriteFVarBlend.XMM>;
6445 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6446 X86Blendv, SchedWriteVarBlend.XMM>;
6448 // Aliases with the implicit xmm0 argument
6449 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6450 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6451 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6452 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6453 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6454 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6455 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6456 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6457 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6458 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6459 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6460 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6462 let Predicates = [UseSSE41] in {
6463 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6464 (v4i32 VR128:$src2))),
6465 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6466 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6467 (v2i64 VR128:$src2))),
6468 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6471 let AddedComplexity = 400 in { // Prefer non-temporal versions
6473 let Predicates = [HasAVX, NoVLX] in
6474 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6475 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6476 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, WIG;
6477 let Predicates = [HasAVX2, NoVLX] in
6478 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6479 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6480 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, WIG;
6481 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6482 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6483 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6485 let Predicates = [HasAVX2, NoVLX] in {
6486 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6487 (VMOVNTDQAYrm addr:$src)>;
6488 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6489 (VMOVNTDQAYrm addr:$src)>;
6490 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6491 (VMOVNTDQAYrm addr:$src)>;
6492 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6493 (VMOVNTDQAYrm addr:$src)>;
6494 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6495 (VMOVNTDQAYrm addr:$src)>;
6496 def : Pat<(v16f16 (alignednontemporalload addr:$src)),
6497 (VMOVNTDQAYrm addr:$src)>;
6498 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6499 (VMOVNTDQAYrm addr:$src)>;
6502 let Predicates = [HasAVX, NoVLX] in {
6503 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6504 (VMOVNTDQArm addr:$src)>;
6505 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6506 (VMOVNTDQArm addr:$src)>;
6507 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6508 (VMOVNTDQArm addr:$src)>;
6509 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6510 (VMOVNTDQArm addr:$src)>;
6511 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6512 (VMOVNTDQArm addr:$src)>;
6513 def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6514 (VMOVNTDQArm addr:$src)>;
6515 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6516 (VMOVNTDQArm addr:$src)>;
6519 let Predicates = [UseSSE41] in {
6520 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6521 (MOVNTDQArm addr:$src)>;
6522 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6523 (MOVNTDQArm addr:$src)>;
6524 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6525 (MOVNTDQArm addr:$src)>;
6526 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6527 (MOVNTDQArm addr:$src)>;
6528 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6529 (MOVNTDQArm addr:$src)>;
6530 def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6531 (MOVNTDQArm addr:$src)>;
6532 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6533 (MOVNTDQArm addr:$src)>;
6536 } // AddedComplexity
6538 //===----------------------------------------------------------------------===//
6539 // SSE4.2 - Compare Instructions
6540 //===----------------------------------------------------------------------===//
6542 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
6543 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6544 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6545 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6547 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6548 (ins RC:$src1, RC:$src2),
6550 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6551 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6552 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6554 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6555 (ins RC:$src1, x86memop:$src2),
6557 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6558 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6560 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6561 Sched<[sched.Folded, sched.ReadAfterFold]>;
6564 let Predicates = [HasAVX] in
6565 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6566 load, i128mem, SchedWriteVecALU.XMM, 0>,
6569 let Predicates = [HasAVX2] in
6570 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6571 load, i256mem, SchedWriteVecALU.YMM, 0>,
6574 let Constraints = "$src1 = $dst" in
6575 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6576 memop, i128mem, SchedWriteVecALU.XMM>;
6578 //===----------------------------------------------------------------------===//
6579 // SSE4.2 - String/text Processing Instructions
6580 //===----------------------------------------------------------------------===//
6582 multiclass pcmpistrm_SS42AI<string asm> {
6583 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6584 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6585 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6586 []>, Sched<[WritePCmpIStrM]>;
6588 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6589 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6590 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6591 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6594 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6595 let Predicates = [HasAVX] in
6596 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, WIG;
6597 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
6600 multiclass SS42AI_pcmpestrm<string asm> {
6601 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6602 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6603 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6604 []>, Sched<[WritePCmpEStrM]>;
6606 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6607 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6608 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6609 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6612 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6613 let Predicates = [HasAVX] in
6614 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, WIG;
6615 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6618 multiclass SS42AI_pcmpistri<string asm> {
6619 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6620 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6621 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6622 []>, Sched<[WritePCmpIStrI]>;
6624 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6625 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6626 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6627 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6630 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6631 let Predicates = [HasAVX] in
6632 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, WIG;
6633 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6636 multiclass SS42AI_pcmpestri<string asm> {
6637 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6638 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6639 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6640 []>, Sched<[WritePCmpEStrI]>;
6642 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6643 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6644 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6645 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6648 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6649 let Predicates = [HasAVX] in
6650 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, WIG;
6651 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6654 //===----------------------------------------------------------------------===//
6655 // SSE4.2 - CRC Instructions
6656 //===----------------------------------------------------------------------===//
6658 // No CRC instructions have AVX equivalents
6660 // crc intrinsic instruction
6661 // This set of instructions are only rm, the only difference is the size
6663 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6664 RegisterClass RCIn, SDPatternOperator Int> :
6665 CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6666 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6667 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6668 Sched<[WriteCRC32]>;
6670 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6671 X86MemOperand x86memop, SDPatternOperator Int> :
6672 CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6673 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6674 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6675 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6677 let Constraints = "$src1 = $dst" in {
6678 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6679 int_x86_sse42_crc32_32_8>;
6680 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6681 int_x86_sse42_crc32_32_8>;
6682 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6683 int_x86_sse42_crc32_32_16>, OpSize16;
6684 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6685 int_x86_sse42_crc32_32_16>, OpSize16;
6686 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6687 int_x86_sse42_crc32_32_32>, OpSize32;
6688 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6689 int_x86_sse42_crc32_32_32>, OpSize32;
6690 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6691 int_x86_sse42_crc32_64_64>, REX_W;
6692 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6693 int_x86_sse42_crc32_64_64>, REX_W;
6694 let hasSideEffects = 0 in {
6696 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6698 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6703 //===----------------------------------------------------------------------===//
6704 // SHA-NI Instructions
6705 //===----------------------------------------------------------------------===//
6707 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6708 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6709 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6710 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6711 (ins VR128:$src1, VR128:$src2),
6713 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6714 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6716 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6717 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6718 T8PS, Sched<[sched]>;
6720 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6721 (ins VR128:$src1, i128mem:$src2),
6723 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6724 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6726 (set VR128:$dst, (IntId VR128:$src1,
6727 (memop addr:$src2), XMM0)),
6728 (set VR128:$dst, (IntId VR128:$src1,
6729 (memop addr:$src2))))]>, T8PS,
6730 Sched<[sched.Folded, sched.ReadAfterFold]>;
6733 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6734 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6735 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6736 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6738 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6739 (i8 timm:$src3)))]>, TAPS,
6740 Sched<[SchedWriteVecIMul.XMM]>;
6741 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6742 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6743 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6745 (int_x86_sha1rnds4 VR128:$src1,
6747 (i8 timm:$src3)))]>, TAPS,
6748 Sched<[SchedWriteVecIMul.XMM.Folded,
6749 SchedWriteVecIMul.XMM.ReadAfterFold]>;
6751 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6752 SchedWriteVecIMul.XMM>;
6753 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6754 SchedWriteVecIMul.XMM>;
6755 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6756 SchedWriteVecIMul.XMM>;
6759 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6760 SchedWriteVecIMul.XMM, 1>;
6762 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6763 SchedWriteVecIMul.XMM>;
6764 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6765 SchedWriteVecIMul.XMM>;
6768 // Aliases with explicit %xmm0
6769 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6770 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6771 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6772 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6774 //===----------------------------------------------------------------------===//
6775 // AES-NI Instructions
6776 //===----------------------------------------------------------------------===//
6778 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6779 Intrinsic IntId, PatFrag ld_frag,
6780 bit Is2Addr = 0, RegisterClass RC = VR128,
6781 X86MemOperand MemOp = i128mem> {
6782 let AsmString = OpcodeStr#
6783 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6784 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6785 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6786 (ins RC:$src1, RC:$src2), "",
6787 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6788 Sched<[WriteAESDecEnc]>;
6789 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6790 (ins RC:$src1, MemOp:$src2), "",
6791 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6792 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6796 // Perform One Round of an AES Encryption/Decryption Flow
6797 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6798 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
6799 int_x86_aesni_aesenc, load>, VEX_4V, WIG;
6800 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
6801 int_x86_aesni_aesenclast, load>, VEX_4V, WIG;
6802 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
6803 int_x86_aesni_aesdec, load>, VEX_4V, WIG;
6804 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
6805 int_x86_aesni_aesdeclast, load>, VEX_4V, WIG;
6808 let Predicates = [NoVLX, HasVAES] in {
6809 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
6810 int_x86_aesni_aesenc_256, load, 0, VR256,
6811 i256mem>, VEX_4V, VEX_L, WIG;
6812 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
6813 int_x86_aesni_aesenclast_256, load, 0, VR256,
6814 i256mem>, VEX_4V, VEX_L, WIG;
6815 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
6816 int_x86_aesni_aesdec_256, load, 0, VR256,
6817 i256mem>, VEX_4V, VEX_L, WIG;
6818 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
6819 int_x86_aesni_aesdeclast_256, load, 0, VR256,
6820 i256mem>, VEX_4V, VEX_L, WIG;
6823 let Constraints = "$src1 = $dst" in {
6824 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
6825 int_x86_aesni_aesenc, memop, 1>;
6826 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
6827 int_x86_aesni_aesenclast, memop, 1>;
6828 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
6829 int_x86_aesni_aesdec, memop, 1>;
6830 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
6831 int_x86_aesni_aesdeclast, memop, 1>;
6834 // Perform the AES InvMixColumn Transformation
6835 let Predicates = [HasAVX, HasAES] in {
6836 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6838 "vaesimc\t{$src1, $dst|$dst, $src1}",
6840 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6842 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6843 (ins i128mem:$src1),
6844 "vaesimc\t{$src1, $dst|$dst, $src1}",
6845 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6846 Sched<[WriteAESIMC.Folded]>, VEX, WIG;
6848 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6850 "aesimc\t{$src1, $dst|$dst, $src1}",
6852 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6853 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6854 (ins i128mem:$src1),
6855 "aesimc\t{$src1, $dst|$dst, $src1}",
6856 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6857 Sched<[WriteAESIMC.Folded]>;
6859 // AES Round Key Generation Assist
6860 let Predicates = [HasAVX, HasAES] in {
6861 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6862 (ins VR128:$src1, u8imm:$src2),
6863 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6865 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6866 Sched<[WriteAESKeyGen]>, VEX, WIG;
6867 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6868 (ins i128mem:$src1, u8imm:$src2),
6869 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6871 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6872 Sched<[WriteAESKeyGen.Folded]>, VEX, WIG;
6874 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6875 (ins VR128:$src1, u8imm:$src2),
6876 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6878 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6879 Sched<[WriteAESKeyGen]>;
6880 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6881 (ins i128mem:$src1, u8imm:$src2),
6882 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6884 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6885 Sched<[WriteAESKeyGen.Folded]>;
6887 //===----------------------------------------------------------------------===//
6888 // PCLMUL Instructions
6889 //===----------------------------------------------------------------------===//
6891 // Immediate transform to help with commuting.
6892 def PCLMULCommuteImm : SDNodeXForm<timm, [{
6893 uint8_t Imm = N->getZExtValue();
6894 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6897 // SSE carry-less Multiplication instructions
6898 let Predicates = [NoAVX, HasPCLMUL] in {
6899 let Constraints = "$src1 = $dst" in {
6900 let isCommutable = 1 in
6901 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6902 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6903 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6905 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6906 Sched<[WriteCLMul]>;
6908 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6909 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6910 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6912 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6914 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6915 } // Constraints = "$src1 = $dst"
6917 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6919 (PCLMULQDQrm VR128:$src1, addr:$src2,
6920 (PCLMULCommuteImm timm:$src3))>;
6921 } // Predicates = [NoAVX, HasPCLMUL]
6924 foreach HI = ["hq","lq"] in
6925 foreach LO = ["hq","lq"] in {
6926 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6927 (PCLMULQDQrr VR128:$dst, VR128:$src,
6928 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6929 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6930 (PCLMULQDQrm VR128:$dst, i128mem:$src,
6931 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6934 // AVX carry-less Multiplication instructions
6935 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6936 PatFrag LdFrag, Intrinsic IntId> {
6937 let isCommutable = 1 in
6938 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6939 (ins RC:$src1, RC:$src2, u8imm:$src3),
6940 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6942 (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6943 Sched<[WriteCLMul]>;
6945 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6946 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6947 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6949 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6950 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6952 // We can commute a load in the first operand by swapping the sources and
6953 // rotating the immediate.
6954 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6955 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6956 (PCLMULCommuteImm timm:$src3))>;
6959 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6960 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6961 int_x86_pclmulqdq>, VEX_4V, WIG;
6963 let Predicates = [NoVLX, HasVPCLMULQDQ] in
6964 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6965 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, WIG;
6967 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6968 X86MemOperand MemOp, string Hi, string Lo> {
6969 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6970 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6971 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6972 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6973 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6974 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6977 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6978 X86MemOperand MemOp> {
6979 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6980 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6981 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6982 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6986 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6987 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6989 //===----------------------------------------------------------------------===//
6990 // SSE4A Instructions
6991 //===----------------------------------------------------------------------===//
6993 let Predicates = [HasSSE4A] in {
6995 let ExeDomain = SSEPackedInt in {
6996 let Constraints = "$src = $dst" in {
6997 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6998 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6999 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7000 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
7002 PD, Sched<[SchedWriteVecALU.XMM]>;
7003 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7004 (ins VR128:$src, VR128:$mask),
7005 "extrq\t{$mask, $src|$src, $mask}",
7006 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
7008 PD, Sched<[SchedWriteVecALU.XMM]>;
7010 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
7011 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
7012 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7013 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7014 timm:$len, timm:$idx))]>,
7015 XD, Sched<[SchedWriteVecALU.XMM]>;
7016 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7017 (ins VR128:$src, VR128:$mask),
7018 "insertq\t{$mask, $src|$src, $mask}",
7019 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7021 XD, Sched<[SchedWriteVecALU.XMM]>;
7023 } // ExeDomain = SSEPackedInt
7025 // Non-temporal (unaligned) scalar stores.
7026 let AddedComplexity = 400 in { // Prefer non-temporal versions
7027 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
7028 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7029 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
7031 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7032 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
7035 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7036 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7038 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7039 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7041 } // AddedComplexity
7044 //===----------------------------------------------------------------------===//
7046 //===----------------------------------------------------------------------===//
7048 //===----------------------------------------------------------------------===//
7049 // VBROADCAST - Load from memory and broadcast to all elements of the
7050 // destination operand
7052 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7053 X86MemOperand x86memop, ValueType VT,
7054 PatFrag bcast_frag, SchedWrite Sched> :
7055 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7056 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7057 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
7058 Sched<[Sched]>, VEX;
7060 // AVX2 adds register forms
7061 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7062 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7063 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7064 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7065 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7066 Sched<[Sched]>, VEX;
7068 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7069 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7070 f32mem, v4f32, X86VBroadcastld32,
7071 SchedWriteFShuffle.XMM.Folded>;
7072 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7073 f32mem, v8f32, X86VBroadcastld32,
7074 SchedWriteFShuffle.XMM.Folded>, VEX_L;
7076 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7077 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7078 v4f64, X86VBroadcastld64,
7079 SchedWriteFShuffle.XMM.Folded>, VEX_L;
7081 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7082 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7083 v4f32, v4f32, SchedWriteFShuffle.XMM>;
7084 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7085 v8f32, v4f32, WriteFShuffle256>, VEX_L;
7087 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7088 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7089 v4f64, v2f64, WriteFShuffle256>, VEX_L;
7091 //===----------------------------------------------------------------------===//
7092 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7093 // halves of a 256-bit vector.
7095 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7096 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7098 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7099 Sched<[WriteShuffleLd]>, VEX, VEX_L;
7101 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7102 ExeDomain = SSEPackedSingle in
7103 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7105 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7106 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7108 let Predicates = [HasAVX, NoVLX] in {
7109 def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
7110 (VBROADCASTF128 addr:$src)>;
7111 def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
7112 (VBROADCASTF128 addr:$src)>;
7113 // NOTE: We're using FP instructions here, but execution domain fixing can
7114 // convert to integer when profitable.
7115 def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
7116 (VBROADCASTF128 addr:$src)>;
7117 def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
7118 (VBROADCASTF128 addr:$src)>;
7119 def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
7120 (VBROADCASTF128 addr:$src)>;
7121 def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
7122 (VBROADCASTF128 addr:$src)>;
7123 def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
7124 (VBROADCASTF128 addr:$src)>;
7127 //===----------------------------------------------------------------------===//
7128 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7131 let ExeDomain = SSEPackedSingle in {
7132 let isCommutable = 1 in
7133 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7134 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7135 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7136 VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
7137 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7138 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7139 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7140 VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7143 // Immediate transform to help with commuting.
7144 def Perm2XCommuteImm : SDNodeXForm<timm, [{
7145 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7148 multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
7149 def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7150 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
7151 def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
7152 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
7153 // Pattern with load in other operand.
7154 def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
7155 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7156 (Perm2XCommuteImm timm:$imm))>;
7159 let Predicates = [HasAVX] in {
7160 defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
7161 defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
7164 let Predicates = [HasAVX1Only] in {
7165 defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
7166 defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
7167 defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
7168 defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
7169 defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
7172 //===----------------------------------------------------------------------===//
7173 // VINSERTF128 - Insert packed floating-point values
7175 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7176 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7177 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7178 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7179 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7181 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7182 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7183 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7184 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7187 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
7188 // with YMM register containing zero.
7189 // FIXME: Avoid producing vxorps to clear the fake inputs.
7190 let Predicates = [HasAVX1Only] in {
7191 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7194 multiclass vinsert_lowering<string InstrStr, string PermStr,
7195 ValueType From, ValueType To,
7196 PatFrag frommemop_frag, PatFrag tomemop_frag> {
7197 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7199 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7200 (INSERT_get_vinsert128_imm VR256:$ins))>;
7201 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7202 (From (frommemop_frag addr:$src2)),
7204 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7205 (INSERT_get_vinsert128_imm VR256:$ins))>;
7206 // Folding "To" vector - convert to perm2x128 and commute inputs.
7207 def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
7210 (!cast<Instruction>(PermStr#rm)
7211 (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
7212 addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
7215 let Predicates = [HasAVX, NoVLX] in {
7216 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
7217 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
7220 let Predicates = [HasAVX1Only] in {
7221 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
7222 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
7223 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
7224 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
7225 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
7226 defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
7229 //===----------------------------------------------------------------------===//
7230 // VEXTRACTF128 - Extract packed floating-point values
7232 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7233 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7234 (ins VR256:$src1, u8imm:$src2),
7235 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7236 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7238 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7239 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7240 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7241 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7244 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7245 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7246 (To (!cast<Instruction>(InstrStr#rr)
7248 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7249 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7250 (iPTR imm))), addr:$dst),
7251 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7252 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7256 let Predicates = [HasAVX, NoVLX] in {
7257 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7258 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7261 let Predicates = [HasAVX1Only] in {
7262 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7263 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7264 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7265 defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
7266 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7267 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7270 //===----------------------------------------------------------------------===//
7271 // VMASKMOV - Conditional SIMD Packed Loads and Stores
7273 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7274 Intrinsic IntLd, Intrinsic IntLd256,
7275 Intrinsic IntSt, Intrinsic IntSt256,
7276 X86SchedWriteMaskMove schedX,
7277 X86SchedWriteMaskMove schedY> {
7278 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7279 (ins VR128:$src1, f128mem:$src2),
7280 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7281 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7282 VEX_4V, Sched<[schedX.RM]>;
7283 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7284 (ins VR256:$src1, f256mem:$src2),
7285 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7286 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7287 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7288 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7289 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7290 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7291 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7292 VEX_4V, Sched<[schedX.MR]>;
7293 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7294 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7295 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7296 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7297 VEX_4V, VEX_L, Sched<[schedY.MR]>;
7300 let ExeDomain = SSEPackedSingle in
7301 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7302 int_x86_avx_maskload_ps,
7303 int_x86_avx_maskload_ps_256,
7304 int_x86_avx_maskstore_ps,
7305 int_x86_avx_maskstore_ps_256,
7306 WriteFMaskMove32, WriteFMaskMove32Y>;
7307 let ExeDomain = SSEPackedDouble in
7308 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7309 int_x86_avx_maskload_pd,
7310 int_x86_avx_maskload_pd_256,
7311 int_x86_avx_maskstore_pd,
7312 int_x86_avx_maskstore_pd_256,
7313 WriteFMaskMove64, WriteFMaskMove64Y>;
7315 //===----------------------------------------------------------------------===//
7317 //===----------------------------------------------------------------------===//
7318 let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
7319 ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
7320 multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7322 let isCommutable = IsCommutable in
7323 def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
7324 (ins VR128:$src1, VR128:$src2, VR128:$src3),
7325 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7326 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
7327 VR128:$src2, VR128:$src3)))]>,
7328 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
7330 def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
7331 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
7332 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7333 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
7334 (loadv4i32 addr:$src3))))]>,
7335 VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
7336 SchedWriteVecIMul.XMM.ReadAfterFold,
7337 SchedWriteVecIMul.XMM.ReadAfterFold]>;
7339 let isCommutable = IsCommutable in
7340 def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
7341 (ins VR256:$src1, VR256:$src2, VR256:$src3),
7342 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7343 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
7344 VR256:$src2, VR256:$src3)))]>,
7345 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
7347 def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
7348 (ins VR256:$src1, VR256:$src2, i256mem:$src3),
7349 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7350 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
7351 (loadv8i32 addr:$src3))))]>,
7352 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
7353 SchedWriteVecIMul.YMM.ReadAfterFold,
7354 SchedWriteVecIMul.YMM.ReadAfterFold]>;
7357 defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
7358 defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
7359 defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>;
7360 defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
7362 def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
7363 (X86vpmaddwd node:$lhs, node:$rhs), [{
7364 return N->hasOneUse();
7367 let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
7368 def : Pat<(v8i32 (add VR256:$src1,
7369 (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
7370 (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
7371 def : Pat<(v8i32 (add VR256:$src1,
7372 (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
7373 (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
7374 def : Pat<(v4i32 (add VR128:$src1,
7375 (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
7376 (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
7377 def : Pat<(v4i32 (add VR128:$src1,
7378 (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
7379 (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
7382 //===----------------------------------------------------------------------===//
7383 // VPERMIL - Permute Single and Double Floating-Point Values
7386 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7387 RegisterClass RC, X86MemOperand x86memop_f,
7388 X86MemOperand x86memop_i,
7389 ValueType f_vt, ValueType i_vt,
7390 X86FoldableSchedWrite sched,
7391 X86FoldableSchedWrite varsched> {
7392 let Predicates = [HasAVX, NoVLX] in {
7393 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7394 (ins RC:$src1, RC:$src2),
7395 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7396 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7398 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7399 (ins RC:$src1, x86memop_i:$src2),
7400 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7401 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7402 (i_vt (load addr:$src2)))))]>, VEX_4V,
7403 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7405 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7406 (ins RC:$src1, u8imm:$src2),
7407 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7408 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7410 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7411 (ins x86memop_f:$src1, u8imm:$src2),
7412 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7414 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7415 Sched<[sched.Folded]>;
7416 }// Predicates = [HasAVX, NoVLX]
7419 let ExeDomain = SSEPackedSingle in {
7420 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7421 v4f32, v4i32, SchedWriteFShuffle.XMM,
7422 SchedWriteFVarShuffle.XMM>;
7423 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7424 v8f32, v8i32, SchedWriteFShuffle.YMM,
7425 SchedWriteFVarShuffle.YMM>, VEX_L;
7427 let ExeDomain = SSEPackedDouble in {
7428 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7429 v2f64, v2i64, SchedWriteFShuffle.XMM,
7430 SchedWriteFVarShuffle.XMM>;
7431 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7432 v4f64, v4i64, SchedWriteFShuffle.YMM,
7433 SchedWriteFVarShuffle.YMM>, VEX_L;
7436 //===----------------------------------------------------------------------===//
7437 // VZERO - Zero YMM registers
7438 // Note: These instruction do not affect the YMM16-YMM31.
7441 let SchedRW = [WriteSystem] in {
7442 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7443 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7444 // Zero All YMM registers
7445 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7446 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7447 Requires<[HasAVX]>, WIG;
7449 // Zero Upper bits of YMM registers
7450 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7451 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7452 Requires<[HasAVX]>, WIG;
7456 //===----------------------------------------------------------------------===//
7457 // Half precision conversion instructions
7460 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7461 X86FoldableSchedWrite sched> {
7462 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7463 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7464 [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7465 T8PD, VEX, Sched<[sched]>;
7466 let hasSideEffects = 0, mayLoad = 1 in
7467 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7468 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7469 []>, T8PD, VEX, Sched<[sched.Folded]>;
7472 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7473 SchedWrite RR, SchedWrite MR> {
7474 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7475 (ins RC:$src1, i32u8imm:$src2),
7476 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7477 [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7478 TAPD, VEX, Sched<[RR]>;
7479 let hasSideEffects = 0, mayStore = 1 in
7480 def mr : Ii8<0x1D, MRMDestMem, (outs),
7481 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7482 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7483 TAPD, VEX, Sched<[MR]>;
7486 let Predicates = [HasF16C, NoVLX] in {
7487 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7488 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7489 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7490 WriteCvtPS2PHSt>, SIMD_EXC;
7491 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7492 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7494 // Pattern match vcvtph2ps of a scalar i64 load.
7495 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7496 (VCVTPH2PSrm addr:$src)>;
7497 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7498 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7499 (VCVTPH2PSrm addr:$src)>;
7500 def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7501 (VCVTPH2PSYrm addr:$src)>;
7503 def : Pat<(store (f64 (extractelt
7504 (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7505 (iPTR 0))), addr:$dst),
7506 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7507 def : Pat<(store (i64 (extractelt
7508 (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7509 (iPTR 0))), addr:$dst),
7510 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7511 def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7512 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7515 //===----------------------------------------------------------------------===//
7516 // AVX2 Instructions
7517 //===----------------------------------------------------------------------===//
7519 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
7520 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7521 ValueType OpVT, X86FoldableSchedWrite sched,
7523 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7524 let isCommutable = 1 in
7525 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7526 (ins RC:$src1, RC:$src2, u8imm:$src3),
7527 !strconcat(OpcodeStr,
7528 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7529 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7530 Sched<[sched]>, VEX_4V;
7531 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7532 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7533 !strconcat(OpcodeStr,
7534 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7536 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7537 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7539 // Pattern to commute if load is in first source.
7540 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7541 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7542 (commuteXForm timm:$src3))>;
7545 let Predicates = [HasAVX2] in {
7546 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7547 SchedWriteBlend.XMM, VR128, i128mem,
7549 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7550 SchedWriteBlend.YMM, VR256, i256mem,
7551 BlendCommuteImm8>, VEX_L;
7553 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7554 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7555 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7556 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7557 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7558 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7560 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7561 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7562 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7563 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7564 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7565 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7568 // For insertion into the zero index (low half) of a 256-bit vector, it is
7569 // more efficient to generate a blend with immediate instead of an insert*128.
7570 // NOTE: We're using FP instructions here, but execution domain fixing should
7571 // take care of using integer instructions when profitable.
7572 let Predicates = [HasAVX] in {
7573 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7574 (VBLENDPSYrri VR256:$src1,
7575 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7576 VR128:$src2, sub_xmm), 0xf)>;
7577 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7578 (VBLENDPSYrri VR256:$src1,
7579 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7580 VR128:$src2, sub_xmm), 0xf)>;
7581 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7582 (VBLENDPSYrri VR256:$src1,
7583 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7584 VR128:$src2, sub_xmm), 0xf)>;
7585 def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
7586 (VBLENDPSYrri VR256:$src1,
7587 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7588 VR128:$src2, sub_xmm), 0xf)>;
7589 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7590 (VBLENDPSYrri VR256:$src1,
7591 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7592 VR128:$src2, sub_xmm), 0xf)>;
7594 def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7595 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7596 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7597 def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7598 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7599 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7600 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7601 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7602 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7603 def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
7604 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7605 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7606 def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7607 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7608 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7611 //===----------------------------------------------------------------------===//
7612 // VPBROADCAST - Load from memory and broadcast to all elements of the
7613 // destination operand
7615 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7616 X86MemOperand x86memop, PatFrag bcast_frag,
7617 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7618 let Predicates = [HasAVX2, prd] in {
7619 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7620 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7622 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7623 Sched<[SchedWriteShuffle.XMM]>, VEX;
7624 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7625 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7627 (OpVT128 (bcast_frag addr:$src)))]>,
7628 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7629 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7630 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7632 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7633 Sched<[WriteShuffle256]>, VEX, VEX_L;
7634 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7635 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7637 (OpVT256 (bcast_frag addr:$src)))]>,
7638 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7640 // Provide aliases for broadcast from the same register class that
7641 // automatically does the extract.
7642 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7643 (!cast<Instruction>(NAME#"Yrr")
7644 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7648 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7649 v16i8, v32i8, NoVLX_Or_NoBWI>;
7650 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7651 v8i16, v16i16, NoVLX_Or_NoBWI>;
7652 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7653 v4i32, v8i32, NoVLX>;
7654 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7655 v2i64, v4i64, NoVLX>;
7657 let Predicates = [HasAVX2, NoVLX] in {
7658 // Provide fallback in case the load node that is used in the patterns above
7659 // is used by additional users, which prevents the pattern selection.
7660 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7661 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7662 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7663 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7664 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7665 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7668 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7669 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7670 (VPBROADCASTBrr (VMOVDI2PDIrr
7671 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7672 GR8:$src, sub_8bit))))>;
7673 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7674 (VPBROADCASTBYrr (VMOVDI2PDIrr
7675 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7676 GR8:$src, sub_8bit))))>;
7678 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7679 (VPBROADCASTWrr (VMOVDI2PDIrr
7680 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7681 GR16:$src, sub_16bit))))>;
7682 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7683 (VPBROADCASTWYrr (VMOVDI2PDIrr
7684 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7685 GR16:$src, sub_16bit))))>;
7687 def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
7688 (VPBROADCASTWrm addr:$src)>;
7689 def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
7690 (VPBROADCASTWYrm addr:$src)>;
7692 def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
7693 (VPBROADCASTWrr VR128:$src)>;
7694 def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
7695 (VPBROADCASTWYrr VR128:$src)>;
7697 def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
7698 (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7699 def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
7700 (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7702 let Predicates = [HasAVX2, NoVLX] in {
7703 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7704 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7705 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7706 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7707 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7708 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7709 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7710 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7713 // AVX1 broadcast patterns
7714 let Predicates = [HasAVX1Only] in {
7715 def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7716 (VBROADCASTSSYrm addr:$src)>;
7717 def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7718 (VBROADCASTSDYrm addr:$src)>;
7719 def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7720 (VBROADCASTSSrm addr:$src)>;
7723 // Provide fallback in case the load node that is used in the patterns above
7724 // is used by additional users, which prevents the pattern selection.
7725 let Predicates = [HasAVX, NoVLX] in {
7726 // 128bit broadcasts:
7727 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7728 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7729 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7730 (VMOVDDUPrm addr:$src)>;
7732 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7733 (VMOVDDUPrr VR128:$src)>;
7736 let Predicates = [HasAVX1Only] in {
7737 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7738 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7739 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7740 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7741 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7742 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7743 def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
7744 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7745 (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
7746 (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
7747 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7748 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7749 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7750 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7751 def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
7752 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7753 (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
7754 (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
7756 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7757 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7758 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7759 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7760 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7761 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7762 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7763 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7764 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7765 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7767 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7768 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7769 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7770 (VMOVDDUPrm addr:$src)>;
7773 //===----------------------------------------------------------------------===//
7774 // VPERM - Permute instructions
7777 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7778 ValueType OpVT, X86FoldableSchedWrite Sched,
7779 X86MemOperand memOp> {
7780 let Predicates = [HasAVX2, NoVLX] in {
7781 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7782 (ins VR256:$src1, VR256:$src2),
7783 !strconcat(OpcodeStr,
7784 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7786 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7787 Sched<[Sched]>, VEX_4V, VEX_L;
7788 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7789 (ins VR256:$src1, memOp:$src2),
7790 !strconcat(OpcodeStr,
7791 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7793 (OpVT (X86VPermv VR256:$src1,
7794 (load addr:$src2))))]>,
7795 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7799 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7800 let ExeDomain = SSEPackedSingle in
7801 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7803 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7804 ValueType OpVT, X86FoldableSchedWrite Sched,
7805 X86MemOperand memOp> {
7806 let Predicates = [HasAVX2, NoVLX] in {
7807 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7808 (ins VR256:$src1, u8imm:$src2),
7809 !strconcat(OpcodeStr,
7810 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7812 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7813 Sched<[Sched]>, VEX, VEX_L;
7814 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7815 (ins memOp:$src1, u8imm:$src2),
7816 !strconcat(OpcodeStr,
7817 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7819 (OpVT (X86VPermi (mem_frag addr:$src1),
7820 (i8 timm:$src2))))]>,
7821 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7825 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7826 WriteShuffle256, i256mem>, REX_W;
7827 let ExeDomain = SSEPackedDouble in
7828 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7829 WriteFShuffle256, f256mem>, REX_W;
7831 //===----------------------------------------------------------------------===//
7832 // VPERM2I128 - Permute Integer vector Values in 128-bit chunks
7834 let isCommutable = 1 in
7835 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7836 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7837 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7838 Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7839 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7840 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7841 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7842 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7844 let Predicates = [HasAVX2] in {
7845 defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
7846 defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
7847 defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
7848 defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
7849 defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
7850 defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
7853 //===----------------------------------------------------------------------===//
7854 // VINSERTI128 - Insert packed integer values
7856 let hasSideEffects = 0 in {
7857 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7858 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7859 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7860 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7862 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7863 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7864 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7865 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7868 let Predicates = [HasAVX2, NoVLX] in {
7869 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
7870 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
7871 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
7872 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
7873 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
7874 defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
7877 //===----------------------------------------------------------------------===//
7878 // VEXTRACTI128 - Extract packed integer values
7880 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7881 (ins VR256:$src1, u8imm:$src2),
7882 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7883 Sched<[WriteShuffle256]>, VEX, VEX_L;
7884 let hasSideEffects = 0, mayStore = 1 in
7885 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7886 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7887 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7888 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7890 let Predicates = [HasAVX2, NoVLX] in {
7891 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
7892 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
7893 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7894 defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
7895 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7896 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7899 //===----------------------------------------------------------------------===//
7900 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7902 multiclass avx2_pmovmask<string OpcodeStr,
7903 Intrinsic IntLd128, Intrinsic IntLd256,
7904 Intrinsic IntSt128, Intrinsic IntSt256,
7905 X86SchedWriteMaskMove schedX,
7906 X86SchedWriteMaskMove schedY> {
7907 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7908 (ins VR128:$src1, i128mem:$src2),
7909 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7910 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7911 VEX_4V, Sched<[schedX.RM]>;
7912 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7913 (ins VR256:$src1, i256mem:$src2),
7914 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7915 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7916 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7917 def mr : AVX28I<0x8e, MRMDestMem, (outs),
7918 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7919 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7920 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7921 VEX_4V, Sched<[schedX.MR]>;
7922 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7923 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7924 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7925 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7926 VEX_4V, VEX_L, Sched<[schedY.MR]>;
7929 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7930 int_x86_avx2_maskload_d,
7931 int_x86_avx2_maskload_d_256,
7932 int_x86_avx2_maskstore_d,
7933 int_x86_avx2_maskstore_d_256,
7934 WriteVecMaskMove32, WriteVecMaskMove32Y>;
7935 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7936 int_x86_avx2_maskload_q,
7937 int_x86_avx2_maskload_q_256,
7938 int_x86_avx2_maskstore_q,
7939 int_x86_avx2_maskstore_q_256,
7940 WriteVecMaskMove64, WriteVecMaskMove64Y>, REX_W;
7942 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7945 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7946 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7948 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7949 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7950 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7951 (VT immAllZerosV))),
7952 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7954 let Predicates = [HasAVX] in {
7955 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7956 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7957 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7958 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7960 let Predicates = [HasAVX1Only] in {
7961 // load/store i32/i64 not supported use ps/pd version
7962 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7963 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7964 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7965 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
7967 let Predicates = [HasAVX2] in {
7968 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7969 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7970 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7971 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7974 //===----------------------------------------------------------------------===//
7975 // Variable Bit Shifts
7977 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7978 ValueType vt128, ValueType vt256> {
7979 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7980 (ins VR128:$src1, VR128:$src2),
7981 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7983 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7984 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
7985 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7986 (ins VR128:$src1, i128mem:$src2),
7987 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7989 (vt128 (OpNode VR128:$src1,
7990 (vt128 (load addr:$src2)))))]>,
7991 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7992 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
7993 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7994 (ins VR256:$src1, VR256:$src2),
7995 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7997 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7998 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
7999 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8000 (ins VR256:$src1, i256mem:$src2),
8001 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8003 (vt256 (OpNode VR256:$src1,
8004 (vt256 (load addr:$src2)))))]>,
8005 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
8006 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
8009 let Predicates = [HasAVX2, NoVLX] in {
8010 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
8011 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, REX_W;
8012 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
8013 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, REX_W;
8014 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
8017 //===----------------------------------------------------------------------===//
8018 // VGATHER - GATHER Operations
8020 // FIXME: Improve scheduling of gather instructions.
8021 multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
8022 X86MemOperand memop128, X86MemOperand memop256> {
8023 let mayLoad = 1, hasSideEffects = 0 in {
8024 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
8025 (ins VR128:$src1, memop128:$src2, VR128:$mask),
8026 !strconcat(OpcodeStr,
8027 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8028 []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
8029 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
8030 (ins RC256:$src1, memop256:$src2, RC256:$mask),
8031 !strconcat(OpcodeStr,
8032 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8033 []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
8037 let Predicates = [HasAVX2] in {
8038 let mayLoad = 1, hasSideEffects = 0, Constraints
8039 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
8041 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
8042 VR256, vx128mem, vx256mem>, REX_W;
8043 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
8044 VR256, vx128mem, vy256mem>, REX_W;
8045 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
8046 VR256, vx128mem, vy256mem>;
8047 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
8048 VR128, vx64mem, vy128mem>;
8050 let ExeDomain = SSEPackedDouble in {
8051 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
8052 VR256, vx128mem, vx256mem>, REX_W;
8053 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
8054 VR256, vx128mem, vy256mem>, REX_W;
8057 let ExeDomain = SSEPackedSingle in {
8058 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
8059 VR256, vx128mem, vy256mem>;
8060 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
8061 VR128, vx64mem, vy128mem>;
8066 //===----------------------------------------------------------------------===//
8067 // GFNI instructions
8068 //===----------------------------------------------------------------------===//
8070 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
8071 RegisterClass RC, PatFrag MemOpFrag,
8072 X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
8074 let ExeDomain = SSEPackedInt,
8075 AsmString = !if(Is2Addr,
8076 OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
8077 OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
8078 let isCommutable = 1 in
8079 def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
8080 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
8081 Sched<[sched]>, T8PD;
8083 def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
8084 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
8085 (MemOpFrag addr:$src2))))]>,
8086 Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
8090 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
8091 SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
8092 X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
8094 let AsmString = !if(Is2Addr,
8095 OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
8096 OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
8097 def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
8098 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
8099 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
8100 SSEPackedInt>, Sched<[sched]>;
8101 def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
8102 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
8103 [(set RC:$dst, (OpVT (OpNode RC:$src1,
8104 (MemOpFrag addr:$src2),
8105 timm:$src3)))], SSEPackedInt>,
8106 Sched<[sched.Folded, sched.ReadAfterFold]>;
8110 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
8111 let Constraints = "$src1 = $dst",
8112 Predicates = [HasGFNI, UseSSE2] in
8113 defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
8114 VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
8115 let Predicates = [HasGFNI, HasAVX, NoVLX] in {
8116 defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
8117 load, i128mem, SchedWriteVecIMul.XMM>,
8119 defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
8120 load, i256mem, SchedWriteVecIMul.YMM>,
8121 VEX_4V, VEX_L, REX_W;
8126 let Constraints = "$src1 = $dst",
8127 Predicates = [HasGFNI, UseSSE2] in
8128 defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
8129 i128mem, SchedWriteVecALU.XMM, 1>;
8130 let Predicates = [HasGFNI, HasAVX, NoVLX] in {
8131 defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
8132 i128mem, SchedWriteVecALU.XMM>, VEX_4V;
8133 defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
8134 i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
8136 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
8137 let isCommutable = 0 in {
8138 defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
8139 X86GF2P8affineinvqb>, TAPD;
8140 defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
8141 X86GF2P8affineqb>, TAPD;
8145 let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
8146 checkVEXPredicate = 1 in
8147 multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
8148 // NOTE: The SDNode have the multiply operands first with the add last.
8149 // This enables commuted load patterns to be autogenerated by tablegen.
8150 let isCommutable = 1 in {
8151 def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
8152 (ins VR128:$src1, VR128:$src2, VR128:$src3),
8153 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8154 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
8155 VR128:$src3, VR128:$src1)))]>,
8156 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
8158 def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
8159 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
8160 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8161 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
8162 (loadv2i64 addr:$src3), VR128:$src1)))]>,
8163 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
8164 let isCommutable = 1 in {
8165 def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
8166 (ins VR256:$src1, VR256:$src2, VR256:$src3),
8167 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8168 [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
8169 VR256:$src3, VR256:$src1)))]>,
8170 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
8172 def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
8173 (ins VR256:$src1, VR256:$src2, i256mem:$src3),
8174 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8175 [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
8176 (loadv4i64 addr:$src3), VR256:$src1)))]>,
8177 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
8180 defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix;
8181 defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, REX_W, ExplicitVEXPrefix;
8184 let Constraints = "$src1 = $dst" in
8185 multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
8186 RegisterClass RC, PatFrag MemOpFrag,
8187 X86MemOperand X86memop, SDNode OpNode,
8188 X86FoldableSchedWrite Sched,
8190 let isCommutable = IsCommutable in
8191 def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
8192 (ins RC:$src1, RC:$src2, RC:$src3),
8193 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8194 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
8195 VEX_4V, Sched<[Sched]>;
8196 def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
8197 (ins RC:$src1, RC:$src2, X86memop:$src3),
8198 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8199 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
8200 (MemOpFrag addr:$src3))))]>,
8201 VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
8204 let Predicates = [HasAVXVNNIINT8] in {
8205 defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32,
8206 i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
8208 defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32,
8209 i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
8211 defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32,
8212 i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
8214 defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32,
8215 i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
8217 defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32,
8218 i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
8220 defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32,
8221 i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
8223 defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32,
8224 i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
8226 defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32,
8227 i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
8229 defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32,
8230 i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
8232 defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32,
8233 i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
8235 defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32,
8236 i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
8238 defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32,
8239 i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
8244 multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
8245 X86MemOperand MemOp128, X86MemOperand MemOp256> {
8246 def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
8247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8249 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
8250 Sched<[WriteCvtPH2PS]>, VEX;
8251 def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
8252 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8254 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
8255 Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
8258 multiclass VCVTNEPS2BF16_BASE {
8259 def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
8260 "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
8261 [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
8262 Sched<[WriteCvtPH2PS]>;
8263 def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
8264 "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
8265 [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
8266 Sched<[WriteCvtPH2PS]>;
8267 def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
8268 "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
8269 [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
8270 Sched<[WriteCvtPH2PSY]>, VEX_L;
8271 def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
8272 "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
8273 [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
8274 Sched<[WriteCvtPH2PSY]>, VEX_L;
8277 let Predicates = [HasAVXNECONVERT] in {
8278 defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
8280 defm VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>,
8282 defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
8284 defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
8286 defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
8288 defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
8290 let checkVEXPredicate = 1 in
8291 defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
8293 def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))),
8294 (VCVTNEPS2BF16Yrr VR256:$src)>;
8295 def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))),
8296 (VCVTNEPS2BF16Yrm addr:$src)>;
8299 def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
8300 (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
8301 def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
8302 (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
8304 // FIXME: Is there a better scheduler class for SHA512 than WriteVecIMul?
8305 let Predicates = [HasSHA512], Constraints = "$src1 = $dst" in {
8306 def VSHA512MSG1rr : I<0xcc, MRMSrcReg, (outs VR256:$dst),
8307 (ins VR256:$src1, VR128:$src2),
8308 "vsha512msg1\t{$src2, $dst|$dst, $src2}",
8310 (int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>, VEX_L,
8311 VEX, T8XD, Sched<[WriteVecIMul]>;
8312 def VSHA512MSG2rr : I<0xcd, MRMSrcReg, (outs VR256:$dst),
8313 (ins VR256:$src1, VR256:$src2),
8314 "vsha512msg2\t{$src2, $dst|$dst, $src2}",
8316 (int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>, VEX_L,
8317 VEX, T8XD, Sched<[WriteVecIMul]>;
8318 def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst),
8319 (ins VR256:$src1, VR256:$src2, VR128:$src3),
8320 "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
8322 (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>,
8323 VEX_L, VEX_4V, T8XD, Sched<[WriteVecIMul]>;
8326 // FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
8327 let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
8328 multiclass SM3_Base<string OpStr> {
8329 def rr : I<0xda, MRMSrcReg, (outs VR128:$dst),
8330 (ins VR128:$src1, VR128:$src2, VR128:$src3),
8331 !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8333 (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
8334 VR128:$src2, VR128:$src3))]>,
8335 Sched<[WriteVecIMul]>, VEX_4V;
8336 def rm : I<0xda, MRMSrcMem, (outs VR128:$dst),
8337 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
8338 !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8340 (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
8341 VR128:$src2, (loadv4i32 addr:$src3)))]>,
8342 Sched<[WriteVecIMul]>, VEX_4V;
8345 multiclass VSM3RNDS2_Base {
8346 def rr : Ii8<0xde, MRMSrcReg, (outs VR128:$dst),
8347 (ins VR128:$src1, VR128:$src2, VR128:$src3, i32u8imm:$src4),
8348 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
8350 (int_x86_vsm3rnds2 VR128:$src1,
8351 VR128:$src2, VR128:$src3, timm:$src4))]>,
8352 Sched<[WriteVecIMul]>;
8353 def rm : Ii8<0xde, MRMSrcMem, (outs VR128:$dst),
8354 (ins VR128:$src1, VR128:$src2, i128mem:$src3, i32u8imm:$src4),
8355 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
8357 (int_x86_vsm3rnds2 VR128:$src1,
8358 VR128:$src2, (loadv4i32 addr:$src3), timm:$src4))]>,
8359 Sched<[WriteVecIMul]>;
8363 defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8PS;
8364 defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8PD;
8365 defm VSM3RNDS2 : VSM3RNDS2_Base, VEX_4V, TAPD;
8367 // FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
8368 let Predicates = [HasSM4] in {
8369 multiclass SM4_Base<string OpStr, RegisterClass RC, string VL,
8370 PatFrag LD, X86MemOperand MemOp> {
8371 def rr : I<0xda, MRMSrcReg, (outs RC:$dst),
8372 (ins RC:$src1, RC:$src2),
8373 !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8374 [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
8376 Sched<[WriteVecIMul]>;
8377 def rm : I<0xda, MRMSrcMem, (outs RC:$dst),
8378 (ins RC:$src1, MemOp:$src2),
8379 !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8380 [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
8381 (LD addr:$src2)))]>,
8382 Sched<[WriteVecIMul]>;
8386 defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX_4V;
8387 defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
8388 defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
8389 defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
8391 let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
8392 multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
8393 let isCommutable = IsCommutable in
8394 def rr : I<opc, MRMSrcReg, (outs VR128:$dst),
8395 (ins VR128:$src1, VR128:$src2, VR128:$src3),
8396 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8398 (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
8399 VR128:$src1, VR128:$src2, VR128:$src3)))]>,
8400 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
8402 def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
8403 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
8404 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8406 (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
8407 VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
8408 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
8410 let isCommutable = IsCommutable in
8411 def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
8412 (ins VR256:$src1, VR256:$src2, VR256:$src3),
8413 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8415 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
8416 VR256:$src1, VR256:$src2, VR256:$src3)))]>,
8417 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
8419 def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
8420 (ins VR256:$src1, VR256:$src2, i256mem:$src3),
8421 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
8423 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
8424 VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
8425 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
8428 defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
8429 defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
8430 defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD;
8431 defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
8432 defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS;
8433 defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;