1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions,
10 // and properties of the instructions which are needed for code generation,
11 // machine code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
// Defines two SI records per instantiation:
//   rr - register/register form (MRMSrcReg): $dst = (OpNode $src1, $src2).
//   rm - register/memory form (MRMSrcMem): $src2 is read with `load`.
// Each record lists a two-operand and a three-operand assembly string; the
// expression that chooses between them is not visible in this excerpt.
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
24 let isCodeGenOnly = 1 in {
25 let isCommutable = 1 in {
// Register form, declared while isCommutable = 1 is in effect.
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
// Memory form; scheduled with the folded-load variants of `sched`.
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
42 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
// Defines rr_Int (MRMSrcReg) and rm_Int (MRMSrcMem) through SI_Int, both with
// hasSideEffects = 0. The selected value is (VT (OpNode $src1, $src2)); in
// rm_Int the memory operand is matched by the `mem_cpat` complex pattern
// rather than by a plain load. Is2Addr defaults to 1; the expression that
// uses it to pick one of the two listed assembly strings is not visible here.
43 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
44 SDPatternOperator OpNode, RegisterClass RC,
45 ValueType VT, string asm, Operand memopr,
46 ComplexPattern mem_cpat, Domain d,
47 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48 let hasSideEffects = 0 in {
49 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
51 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
// Memory form; scheduled with the folded-load variants of `sched`.
56 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
58 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60 [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
61 Sched<[sched.Folded, sched.ReadAfterFold]>;
65 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
// Defines two PI records per instantiation:
//   rr - register/register form (MRMSrcReg); result is typed as `vt`.
//   rm - register/memory form (MRMSrcMem); $src2 is read through `mem_frag`.
// Each record lists a two-operand and a three-operand assembly string; the
// expression that chooses between them is not visible in this excerpt.
66 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
67 RegisterClass RC, ValueType vt,
68 X86MemOperand x86memop, PatFrag mem_frag,
69 Domain d, X86FoldableSchedWrite sched,
// isCommutable = 1 is applied to the `rr` def that follows.
71 let isCommutable = 1 in
72 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
74 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
// Memory form; scheduled with the folded-load variants of `sched`.
79 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
81 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
85 Sched<[sched.Folded, sched.ReadAfterFold]>;
88 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
// Unlike sse12_fp_packed, the selection patterns are not built here: the
// instantiator passes them in as `pat_rr` and `pat_rm` (the lines where they
// are attached to the records are not visible in this excerpt).
89 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90 string OpcodeStr, X86MemOperand x86memop,
91 X86FoldableSchedWrite sched,
92 list<dag> pat_rr, list<dag> pat_rm,
// Register form: commutable and free of side effects.
94 let isCommutable = 1, hasSideEffects = 0 in
95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
// Memory form: mayLoad is stated explicitly and scheduling uses the
// folded-load variants of `sched`.
101 let hasSideEffects = 0, mayLoad = 1 in
102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
107 Sched<[sched.Folded, sched.ReadAfterFold]>;
111 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112 // This is expanded by ExpandPostRAPseudos.
// FsFLD0SS matches fp32imm0 into FR32 (requires HasSSE1 and NoAVX512);
// FsFLD0SD matches fpimm0 into FR64 (requires HasSSE2 and NoAVX512).
// Both are pseudos with an empty assembly string, flagged rematerializable,
// as cheap as a move and foldable as a load, and scheduled as WriteZero.
113 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114 isPseudo = 1, SchedRW = [WriteZero] in {
115 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
116 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
117 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
118 [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
121 //===----------------------------------------------------------------------===//
122 // AVX & SSE - Zero/One Vectors
123 //===----------------------------------------------------------------------===//
125 // Alias instruction that maps zero vector to pxor / xorp* for sse.
126 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
127 // swizzled by ExecutionDomainFix to pxor.
128 // We set canFoldAsLoad because this can be converted to a constant-pool
129 // load of an all-zeros value if folding it would be beneficial.
// The def itself matches the v4f32 all-zeros vector into VR128; the separate
// Pat below additionally maps the v4i32 all-zeros vector to V_SET0 when
// NoAVX512 holds.
130 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
131 isPseudo = 1, SchedRW = [WriteZero] in {
132 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
133 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
136 let Predicates = [NoAVX512] in
137 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
140 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
141 // and doesn't need it because on sandy bridge the register is set to zero
142 // at the rename stage without using any execution unit, so SET0PSY
143 // and SET0PDY can be used for vector int instructions without penalty.
// AVX_SET0 is a pseudo that matches the v8i32 all-zeros vector into VR256,
// guarded by NoAVX512 and scheduled as WriteZero.
144 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
145 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
146 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
147 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
150 // We set canFoldAsLoad because this can be converted to a constant-pool
151 // load of an all-ones value if folding it would be beneficial.
// Three all-ones pseudos are defined in this scope:
//   V_SETALLONES    - v4i32 all-ones into VR128 (no extra predicate here).
//   AVX1_SETALLONES - v8i32 all-ones into VR256, only when HasAVX1Only and
//                     OptForMinSize both hold.
//   AVX2_SETALLONES - v8i32 all-ones into VR256 when HasAVX2 holds.
152 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
153 isPseudo = 1, SchedRW = [WriteZero] in {
154 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
155 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
156 let Predicates = [HasAVX1Only, OptForMinSize] in {
157 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
158 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
160 let Predicates = [HasAVX2] in
161 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
162 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
165 //===----------------------------------------------------------------------===//
166 // SSE 1 & 2 - Move FP Scalar Instructions
168 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
169 // register copies because it's a partial register update; Register-to-register
170 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
171 // that the insert be implementable in terms of a copy, and, as just mentioned, we
172 // don't use movss/movsd for copies.
173 //===----------------------------------------------------------------------===//
// sse12_move_rr - register/register scalar move pair.
//   rr     - opcode 0x10, MRMSrcReg, commutable; selects
//            (vt (OpNode $src1, $src2)) on VR128.
//   rr_REV - opcode 0x11, MRMDestReg, same operands and assembly string, no
//            selection pattern; kept for the disassembler (isCodeGenOnly with
//            ForceDisassemble) and tied to Name#rr through FoldGenData.
// The full assembly string is base_opc followed by asm_opr. The x86memop
// parameter is not referenced by either record in the lines visible here.
175 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
176 X86MemOperand x86memop, string base_opc,
177 string asm_opr, Domain d, string Name> {
178 let isCommutable = 1 in
179 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
180 (ins VR128:$src1, VR128:$src2),
181 !strconcat(base_opc, asm_opr),
182 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
183 Sched<[SchedWriteFShuffle.XMM]>;
185 // For the disassembler
186 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
187 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
188 (ins VR128:$src1, VR128:$src2),
189 !strconcat(base_opc, asm_opr), []>,
190 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
// sse12_move - scalar move instructions in VEX (V#NAME...) and legacy
// (NAME...) encodings:
//   * register/register forms come from sse12_move_rr; the VEX one uses the
//     three-operand assembly string, the legacy one the two-operand string
//     with $src1 tied to $dst;
//   * `mr` store forms (opcode 0x11, MRMDestMem) store RC:$src to addr:$dst;
//   * two InstAliases accept a ".s"-suffixed mnemonic and map it to the
//     corresponding rr_REV record.
193 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
194 X86MemOperand x86memop, string OpcodeStr,
195 Domain d, string Name, Predicate pred> {
// VEX register/register form: guarded by UseAVX and OptForSize.
197 let Predicates = [UseAVX, OptForSize] in
198 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
199 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
201 VEX_4V, VEX_LIG, VEX_WIG;
// VEX store form.
203 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
204 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
205 [(store RC:$src, addr:$dst)], d>,
206 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// Legacy register/register form: two-address, guarded by the caller-supplied
// `pred` together with NoSSE41_Or_OptForSize.
208 let Constraints = "$src1 = $dst" in {
209 let Predicates = [pred, NoSSE41_Or_OptForSize] in
210 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
211 "\t{$src2, $dst|$dst, $src2}", d, Name>;
// Legacy store form.
214 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
215 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
216 [(store RC:$src, addr:$dst)], d>,
217 Sched<[WriteFStore]>;
// ".s" mnemonic aliases for the reversed-encoding records.
219 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
220 (!cast<Instruction>("V"#NAME#"rr_REV")
221 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
222 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
223 (!cast<Instruction>(NAME#"rr_REV")
224 VR128:$dst, VR128:$src2), 0>;
227 // Loading from memory automatically zeroing upper bits.
// Four load records (opcode 0x10, MRMSrcMem) are defined, each in a VEX
// (V#NAME...) and a legacy (NAME...) flavour:
//   rm     - destination is VR128, selected from (vt (vzloadfrag addr)).
//   rm_alt - destination is the scalar class RC, selected from
//            (mem_pat addr); marked isCodeGenOnly.
228 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
229 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
231 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
232 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
233 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
234 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
235 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
236 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
237 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
240 // _alt version uses FR32/FR64 register class.
241 let isCodeGenOnly = 1 in {
242 def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
243 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
244 [(set RC:$dst, (mem_pat addr:$src))], d>,
245 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
246 def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
248 [(set RC:$dst, (mem_pat addr:$src))], d>,
// Instantiate the scalar moves. MOVSS uses FR32 / v4f32 / f32mem with the XS
// prefix and is gated on UseSSE1; MOVSD uses FR64 / v2f64 / f64mem with the
// XD prefix and is gated on UseSSE2.
253 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
254 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
255 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
256 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
// The load forms are instantiated with canFoldAsLoad and isReMaterializable
// set.
258 let canFoldAsLoad = 1, isReMaterializable = 1 in {
259 defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
260 SSEPackedSingle>, XS;
261 defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
262 SSEPackedDouble>, XD;
// Extra selection patterns for the scalar moves.
// UseAVX: a scalar_to_vector of a scalar FP load selects the VEX load form.
266 let Predicates = [UseAVX] in {
267 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
268 (VMOVSSrm addr:$src)>;
269 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
270 (VMOVSDrm addr:$src)>;
272 // Represent the same patterns above but in the form they appear for
// The 128-bit load result is placed into the low subregister (sub_xmm) of the
// 256-bit value with SUBREG_TO_REG.
274 def : Pat<(v8f32 (X86vzload32 addr:$src)),
275 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
276 def : Pat<(v4f64 (X86vzload64 addr:$src)),
277 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
280 let Predicates = [UseAVX, OptForSize] in {
281 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
282 // MOVSS to the lower bits.
283 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
284 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
285 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
286 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
288 // Move low f32 and clear high bits.
// For 256-bit inputs the low XMM half is extracted, merged into a zeroed
// register with VMOVSSrr, and re-wrapped with SUBREG_TO_REG.
289 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
290 (SUBREG_TO_REG (i32 0),
291 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
292 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
293 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
294 (SUBREG_TO_REG (i32 0),
295 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
296 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
299 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
300 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
301 // MOVSS to the lower bits.
302 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
303 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
304 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
305 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
// Legacy-encoding counterparts of the scalar_to_vector load patterns above.
308 let Predicates = [UseSSE2] in
309 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
310 (MOVSDrm addr:$src)>;
312 let Predicates = [UseSSE1] in
313 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
314 (MOVSSrm addr:$src)>;
316 //===----------------------------------------------------------------------===//
317 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
318 //===----------------------------------------------------------------------===//
// sse12_mov_packed - packed full-register move.
//   rr - register copy (MRMSrcReg): no selection pattern, isMoveReg = 1,
//        hasSideEffects = 0.
//   rm - load (MRMSrcMem): selects (ld_frag addr:$src); flagged canFoldAsLoad
//        and isReMaterializable.
// The scheduling annotations that use `sched` fall on lines not visible in
// this excerpt.
320 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
321 X86MemOperand x86memop, PatFrag ld_frag,
322 string asm, Domain d,
323 X86SchedWriteMoveLS sched> {
324 let hasSideEffects = 0, isMoveReg = 1 in
325 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
326 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
328 let canFoldAsLoad = 1, isReMaterializable = 1 in
329 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
330 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
331 [(set RC:$dst, (ld_frag addr:$src))], d>,
// Instantiations of sse12_mov_packed. Opcode 0x28 is used for the aligned
// (movaps/movapd) forms, which take the alignedload* fragments, and opcode
// 0x10 for the unaligned (movups/movupd) forms, which take the plain load*
// fragments.
// VEX-encoded variants (HasAVX, NoVLX): 128-bit on VR128/f128mem first, then
// 256-bit ("Y" suffix) on VR256/f256mem. The encoding modifiers trailing the
// 128-bit defms fall on lines not visible in this excerpt.
335 let Predicates = [HasAVX, NoVLX] in {
336 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
337 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
339 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
340 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
342 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
343 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
345 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
346 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
349 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
350 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
351 PS, VEX, VEX_L, VEX_WIG;
352 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
353 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
354 PD, VEX, VEX_L, VEX_WIG;
355 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
356 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
357 PS, VEX, VEX_L, VEX_WIG;
358 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
359 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
360 PD, VEX, VEX_L, VEX_WIG;
// Legacy single-precision variants, gated on UseSSE1.
363 let Predicates = [UseSSE1] in {
364 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
365 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
367 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
368 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
// Legacy double-precision variants, gated on UseSSE2.
371 let Predicates = [UseSSE2] in {
372 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
373 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
375 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
376 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// VEX-encoded packed FP stores (MRMDestMem), gated on HasAVX and NoVLX.
// Opcode 0x29 forms select `alignedstore`; opcode 0x11 forms select plain
// `store`. The encoding modifiers trailing each def fall on lines not visible
// in this excerpt.
380 let Predicates = [HasAVX, NoVLX] in {
// 128-bit stores from VR128 to f128mem.
381 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
382 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
383 "movaps\t{$src, $dst|$dst, $src}",
384 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
386 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
387 "movapd\t{$src, $dst|$dst, $src}",
388 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
390 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
391 "movups\t{$src, $dst|$dst, $src}",
392 [(store (v4f32 VR128:$src), addr:$dst)]>,
394 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
395 "movupd\t{$src, $dst|$dst, $src}",
396 [(store (v2f64 VR128:$src), addr:$dst)]>,
// 256-bit stores from VR256 to f256mem.
400 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
401 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
402 "movaps\t{$src, $dst|$dst, $src}",
403 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
405 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
406 "movapd\t{$src, $dst|$dst, $src}",
407 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
409 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
410 "movups\t{$src, $dst|$dst, $src}",
411 [(store (v8f32 VR256:$src), addr:$dst)]>,
413 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
414 "movupd\t{$src, $dst|$dst, $src}",
415 [(store (v4f64 VR256:$src), addr:$dst)]>,
// Reversed-encoding (MRMDestReg) register/register forms of the VEX packed
// moves. They carry no selection pattern, are marked isCodeGenOnly with
// ForceDisassemble, and each is tied to its forward-encoded counterpart
// through FoldGenData. The input operand lists fall on lines not visible in
// this excerpt.
421 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
// 128-bit forms.
423 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
424 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
426 "movaps\t{$src, $dst|$dst, $src}", []>,
427 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
428 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
430 "movapd\t{$src, $dst|$dst, $src}", []>,
431 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
432 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
434 "movups\t{$src, $dst|$dst, $src}", []>,
435 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
436 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
438 "movupd\t{$src, $dst|$dst, $src}", []>,
439 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
// 256-bit forms (additionally marked VEX_L).
442 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
443 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
445 "movaps\t{$src, $dst|$dst, $src}", []>,
446 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
447 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
449 "movapd\t{$src, $dst|$dst, $src}", []>,
450 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
451 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
453 "movups\t{$src, $dst|$dst, $src}", []>,
454 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
455 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
457 "movupd\t{$src, $dst|$dst, $src}", []>,
458 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
462 // Reversed version with ".s" suffix for GAS compatibility.
// Each mnemonic is aliased twice: once for the VR128 record and once for the
// VR256 ("Y") record. Every alias passes 0 as its final InstAlias argument.
463 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
464 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
465 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
466 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
467 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
468 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
469 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
470 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
471 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
472 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
473 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
474 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
475 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
476 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
477 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
478 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
// Legacy-encoded packed FP stores (MRMDestMem) from VR128 to f128mem.
// Opcode 0x29 forms select `alignedstore`; opcode 0x11 forms select plain
// `store`.
480 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
481 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
482 "movaps\t{$src, $dst|$dst, $src}",
483 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
484 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
485 "movapd\t{$src, $dst|$dst, $src}",
486 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
487 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
488 "movups\t{$src, $dst|$dst, $src}",
489 [(store (v4f32 VR128:$src), addr:$dst)]>;
490 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
491 "movupd\t{$src, $dst|$dst, $src}",
492 [(store (v2f64 VR128:$src), addr:$dst)]>;
// Reversed-encoding (MRMDestReg) register/register forms of the legacy packed
// moves: no selection pattern, isCodeGenOnly with ForceDisassemble,
// isMoveReg = 1, each tied to its forward-encoded counterpart via FoldGenData.
496 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
497 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
498 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
499 "movaps\t{$src, $dst|$dst, $src}", []>,
500 FoldGenData<"MOVAPSrr">;
501 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
502 "movapd\t{$src, $dst|$dst, $src}", []>,
503 FoldGenData<"MOVAPDrr">;
504 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
505 "movups\t{$src, $dst|$dst, $src}", []>,
506 FoldGenData<"MOVUPSrr">;
507 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
508 "movupd\t{$src, $dst|$dst, $src}", []>,
509 FoldGenData<"MOVUPDrr">;
512 // Reversed version with ".s" suffix for GAS compatibility.
// Every alias passes 0 as its final InstAlias argument.
513 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
514 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
515 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
516 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
517 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
518 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
519 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
520 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
522 let Predicates = [HasAVX, NoVLX] in {
523 // 256-bit load/store need to use floating point load/store in case we don't
524 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
525 // available and changing the domain is beneficial.
// Loads: every 256-bit integer vector type maps to VMOVAPSYrm when the load
// is aligned and to VMOVUPSYrm otherwise.
526 def : Pat<(alignedloadv4i64 addr:$src),
527 (VMOVAPSYrm addr:$src)>;
528 def : Pat<(alignedloadv8i32 addr:$src),
529 (VMOVAPSYrm addr:$src)>;
530 def : Pat<(alignedloadv16i16 addr:$src),
531 (VMOVAPSYrm addr:$src)>;
532 def : Pat<(alignedloadv32i8 addr:$src),
533 (VMOVAPSYrm addr:$src)>;
534 def : Pat<(loadv4i64 addr:$src),
535 (VMOVUPSYrm addr:$src)>;
536 def : Pat<(loadv8i32 addr:$src),
537 (VMOVUPSYrm addr:$src)>;
538 def : Pat<(loadv16i16 addr:$src),
539 (VMOVUPSYrm addr:$src)>;
540 def : Pat<(loadv32i8 addr:$src),
541 (VMOVUPSYrm addr:$src)>;
// Stores: same mapping, using VMOVAPSYmr for aligned stores and VMOVUPSYmr
// for plain stores.
543 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
544 (VMOVAPSYmr addr:$dst, VR256:$src)>;
545 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
546 (VMOVAPSYmr addr:$dst, VR256:$src)>;
547 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
548 (VMOVAPSYmr addr:$dst, VR256:$src)>;
549 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
550 (VMOVAPSYmr addr:$dst, VR256:$src)>;
551 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
552 (VMOVUPSYmr addr:$dst, VR256:$src)>;
553 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
554 (VMOVUPSYmr addr:$dst, VR256:$src)>;
555 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
556 (VMOVUPSYmr addr:$dst, VR256:$src)>;
557 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
558 (VMOVUPSYmr addr:$dst, VR256:$src)>;
561 // Use movaps / movups for SSE integer load / store (one byte shorter).
562 // The instructions selected below are then converted to MOVDQA/MOVDQU
563 // during the SSE domain pass.
564 let Predicates = [UseSSE1] in {
// Loads: every 128-bit integer vector type maps to MOVAPSrm when the load is
// aligned and to MOVUPSrm otherwise.
565 def : Pat<(alignedloadv2i64 addr:$src),
566 (MOVAPSrm addr:$src)>;
567 def : Pat<(alignedloadv4i32 addr:$src),
568 (MOVAPSrm addr:$src)>;
569 def : Pat<(alignedloadv8i16 addr:$src),
570 (MOVAPSrm addr:$src)>;
571 def : Pat<(alignedloadv16i8 addr:$src),
572 (MOVAPSrm addr:$src)>;
573 def : Pat<(loadv2i64 addr:$src),
574 (MOVUPSrm addr:$src)>;
575 def : Pat<(loadv4i32 addr:$src),
576 (MOVUPSrm addr:$src)>;
577 def : Pat<(loadv8i16 addr:$src),
578 (MOVUPSrm addr:$src)>;
579 def : Pat<(loadv16i8 addr:$src),
580 (MOVUPSrm addr:$src)>;
// Stores: same mapping, using MOVAPSmr for aligned stores and MOVUPSmr for
// plain stores.
582 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
583 (MOVAPSmr addr:$dst, VR128:$src)>;
584 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
585 (MOVAPSmr addr:$dst, VR128:$src)>;
586 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
587 (MOVAPSmr addr:$dst, VR128:$src)>;
588 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
589 (MOVAPSmr addr:$dst, VR128:$src)>;
590 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
591 (MOVUPSmr addr:$dst, VR128:$src)>;
592 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
593 (MOVUPSmr addr:$dst, VR128:$src)>;
594 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
595 (MOVUPSmr addr:$dst, VR128:$src)>;
596 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
597 (MOVUPSmr addr:$dst, VR128:$src)>;
600 //===----------------------------------------------------------------------===//
601 // SSE 1 & 2 - Move Low packed FP Instructions
602 //===----------------------------------------------------------------------===//
// sse12_mov_hilo_packed_base - 64-bit memory-to-half-register loads.
//   PSrm - mnemonic base_opc + "s"; no selection pattern, so mayLoad and
//          hasSideEffects are stated explicitly.
//   PDrm - mnemonic base_opc + "d"; selects
//          (v2f64 (pdnode $src1, (scalar_to_vector (loadf64 $src2)))).
// Both read an f64mem operand and are scheduled as a folded FP shuffle.
604 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
605 string base_opc, string asm_opr> {
606 // No pattern as they need to be special-cased between high and low.
607 let hasSideEffects = 0, mayLoad = 1 in
608 def PSrm : PI<opc, MRMSrcMem,
609 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
610 !strconcat(base_opc, "s", asm_opr),
611 [], SSEPackedSingle>, PS,
612 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
614 def PDrm : PI<opc, MRMSrcMem,
615 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
616 !strconcat(base_opc, "d", asm_opr),
617 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
618 (scalar_to_vector (loadf64 addr:$src2)))))],
619 SSEPackedDouble>, PD,
620 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
// sse12_mov_hilo_packed - instantiates sse12_mov_hilo_packed_base twice:
// a VEX flavour (V#NAME, gated on UseAVX, three-operand assembly string) and
// a legacy flavour (NAME, two-operand string with $src1 tied to $dst).
623 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
625 let Predicates = [UseAVX] in
626 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
627 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
630 let Constraints = "$src1 = $dst" in
631 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
632 "\t{$src2, $dst|$dst, $src2}">;
// Low-half loads: opcode 0x12, mnemonic stem "movlp", matched through
// X86Movsd.
635 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
// Low-half stores (opcode 0x13, MRMDestMem, f64mem destination).
// The *PDmr records select a store of element 0 of a v2f64 value. The *PSmr
// records are declared with mayStore = 1 and hasSideEffects = 0; their
// pattern operands fall on lines not visible in this excerpt.
637 let SchedRW = [WriteFStore] in {
// VEX-encoded forms.
638 let Predicates = [UseAVX] in {
639 let mayStore = 1, hasSideEffects = 0 in
640 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
641 "movlps\t{$src, $dst|$dst, $src}",
644 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
645 "movlpd\t{$src, $dst|$dst, $src}",
646 [(store (f64 (extractelt (v2f64 VR128:$src),
647 (iPTR 0))), addr:$dst)]>,
// Legacy-encoded forms.
650 let mayStore = 1, hasSideEffects = 0 in
651 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
652 "movlps\t{$src, $dst|$dst, $src}",
654 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
655 "movlpd\t{$src, $dst|$dst, $src}",
656 [(store (f64 (extractelt (v2f64 VR128:$src),
657 (iPTR 0))), addr:$dst)]>;
660 let Predicates = [UseSSE1] in {
661 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
662 // end up with a movsd or blend instead of shufp.
663 // No need for aligned load, we're only loading 64-bits.
664 def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
666 (MOVLPSrm VR128:$src1, addr:$src2)>;
667 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
668 (MOVLPSrm VR128:$src1, addr:$src2)>;
// A zero-extending 64-bit load is selected as MOVLPS into a zeroed register
// (V_SET0); a 64-bit extract-and-store is selected as MOVLPSmr.
670 def : Pat<(v4f32 (X86vzload64 addr:$src)),
671 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
672 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
673 (MOVLPSmr addr:$dst, VR128:$src)>;
676 //===----------------------------------------------------------------------===//
677 // SSE 1 & 2 - Move Hi packed FP Instructions
678 //===----------------------------------------------------------------------===//
// High-half loads: opcode 0x16, mnemonic stem "movhp", matched through
// X86Unpckl.
680 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
// High-half stores (opcode 0x17, MRMDestMem, f64mem destination). The *PDmr
// records select a store of element 0 of (X86Unpckh $src, $src). The *PSmr
// records are declared with mayStore = 1 and hasSideEffects = 0; their
// pattern operands fall on lines not visible in this excerpt.
682 let SchedRW = [WriteFStore] in {
683 // v2f64 extract element 1 is always custom lowered to unpack high to low
684 // and extract element 0 so the non-store version isn't too horrible.
685 let Predicates = [UseAVX] in {
686 let mayStore = 1, hasSideEffects = 0 in
687 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
688 "movhps\t{$src, $dst|$dst, $src}",
690 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
691 "movhpd\t{$src, $dst|$dst, $src}",
692 [(store (f64 (extractelt
693 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
694 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
696 let mayStore = 1, hasSideEffects = 0 in
697 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
698 "movhps\t{$src, $dst|$dst, $src}",
700 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
701 "movhpd\t{$src, $dst|$dst, $src}",
702 [(store (f64 (extractelt
703 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
704 (iPTR 0))), addr:$dst)]>;
// Additional selection patterns for the 64-bit half loads and stores.
707 let Predicates = [UseAVX] in {
708 // Also handle an i64 load because that may get selected as a faster way to
710 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
711 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
712 (VMOVHPDrm VR128:$src1, addr:$src2)>;
713 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
714 (VMOVHPDrm VR128:$src1, addr:$src2)>;
// Storing element 0 of (X86VPermilpi $src, 1) selects the high-half store.
716 def : Pat<(store (f64 (extractelt
717 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
718 (iPTR 0))), addr:$dst),
719 (VMOVHPDmr addr:$dst, VR128:$src)>;
// X86Movsd with a zero-extending 64-bit load selects the low-half load.
722 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
723 (VMOVLPDrm VR128:$src1, addr:$src2)>;
726 let Predicates = [UseSSE1] in {
727 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
728 // end up with a movsd or blend instead of shufp.
729 // No need for aligned load, we're only loading 64-bits.
730 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
731 (MOVHPSrm VR128:$src1, addr:$src2)>;
732 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
733 (MOVHPSrm VR128:$src1, addr:$src2)>;
735 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
737 (MOVHPSmr addr:$dst, VR128:$src)>;
// Legacy-encoding counterparts of the UseAVX patterns above; the store
// pattern here matches X86Shufp with immediate 1 instead of X86VPermilpi.
740 let Predicates = [UseSSE2] in {
743 // Also handle an i64 load because that may get selected as a faster way to
745 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
746 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
747 (MOVHPDrm VR128:$src1, addr:$src2)>;
748 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
749 (MOVHPDrm VR128:$src1, addr:$src2)>;
751 def : Pat<(store (f64 (extractelt
752 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
753 (iPTR 0))), addr:$dst),
754 (MOVHPDmr addr:$dst, VR128:$src)>;
757 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
758 (MOVLPDrm VR128:$src1, addr:$src2)>;
761 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
762 // Use MOVLPD to load into the low bits from a full vector unless we can use
764 def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
765 (MOVLPDrm VR128:$src1, addr:$src2)>;
768 //===----------------------------------------------------------------------===//
769 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
770 //===----------------------------------------------------------------------===//
// Register/register MOVLHPS (opcode 0x16) and MOVHLPS (opcode 0x12), selected
// from X86Movlhps / X86Movhlps on v4f32. Only the MOVHLPS records are marked
// isCommutable. The first line of each selection pattern falls on a line not
// visible in this excerpt.
// VEX-encoded, three-operand forms.
772 let Predicates = [UseAVX] in {
773 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
774 (ins VR128:$src1, VR128:$src2),
775 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
777 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
778 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
779 let isCommutable = 1 in
780 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
781 (ins VR128:$src1, VR128:$src2),
782 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
784 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
785 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
// Legacy two-address forms ($src1 tied to $dst).
788 let Constraints = "$src1 = $dst" in {
789 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
790 (ins VR128:$src1, VR128:$src2),
791 "movlhps\t{$src2, $dst|$dst, $src2}",
793 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
794 Sched<[SchedWriteFShuffle.XMM]>;
795 let isCommutable = 1 in
796 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
797 (ins VR128:$src1, VR128:$src2),
798 "movhlps\t{$src2, $dst|$dst, $src2}",
800 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
801 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
804 //===----------------------------------------------------------------------===//
805 // SSE 1 & 2 - Conversion Instructions
806 //===----------------------------------------------------------------------===//
/// sse12_cvt_s - two-operand scalar conversion, register (rr) and memory (rm)
/// source forms, each with a selection pattern applying OpNode to the source.
///   opc           - opcode byte.
///   SrcRC / DstRC - register classes of the source and the result.
///   OpNode        - DAG node wrapped around the source in both patterns.
///   x86memop      - memory operand type of the rm form.
///   ld_frag       - load fragment used inside the rm pattern.
///   asm / mem     - mnemonic for the rr form / for the rm form.
///   sched         - scheduling class; the rm form uses sched.Folded.
///   Int2Fpu       - extra SchedRead listed for the rr form only
///                   (defaults to ReadDefault).
808 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
809 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
810 string asm, string mem, X86FoldableSchedWrite sched,
811 SchedRead Int2Fpu = ReadDefault> {
812 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
813 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
814 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
815 Sched<[sched, Int2Fpu]>;
816 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
817 mem#"\t{$src, $dst|$dst, $src}",
818 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
819 Sched<[sched.Folded]>;
/// sse12_cvt_p - packed signed-integer to floating-point conversion with
/// register (rr) and memory (rm) source forms; both patterns are
/// (DstTy (sint_to_fp (SrcTy ...))).
///   opc      - opcode byte.
///   RC       - register class used for both source and result.
///   x86memop - memory operand type of the rm form.
///   DstTy    - result vector type; SrcTy - source vector type.
///   ld_frag  - load fragment used inside the rm pattern.
///   asm      - complete assembly string, operands included.
///   d        - execution domain passed to the instruction class.
///   sched    - scheduling class; the rm form uses sched.Folded.
822 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
823 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
824 string asm, Domain d, X86FoldableSchedWrite sched> {
825 let hasSideEffects = 0 in {
826 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
827 [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
// NOTE(review): the rr def ends in a comma; its continuation (presumably the
// Sched<> annotation) is missing from this excerpt -- confirm upstream.
830 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
831 [(set RC:$dst, (DstTy (sint_to_fp
832 (SrcTy (ld_frag addr:$src)))))], d>,
833 Sched<[sched.Folded]>;
/// sse12_vcvt_avx - three-operand scalar conversion forms gated on UseAVX.
/// Both defs take an extra DstRC:$src1 input next to the converted source and
/// carry no selection pattern ([]), so they are only reachable through
/// explicit Pat<> definitions.
///   opc           - opcode byte.
///   SrcRC / DstRC - register classes of the converted source and the result.
///   x86memop      - memory operand type of the rm form.
///   asm           - mnemonic; the rm form appends "{" # mem # "}" to it.
///   mem           - suffix text placed inside those braces for the rm form.
///   sched         - scheduling class; rm uses sched.Folded and
///                   sched.ReadAfterFold.
837 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
838 X86MemOperand x86memop, string asm, string mem,
839 X86FoldableSchedWrite sched> {
840 let hasSideEffects = 0, Predicates = [UseAVX] in {
841 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
842 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
843 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
845 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
846 (ins DstRC:$src1, x86memop:$src),
847 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
848 Sched<[sched.Folded, sched.ReadAfterFold]>;
849 } // hasSideEffects = 0
// Codegen-only UseAVX truncating scalar FP -> signed integer conversions on
// FR32/FR64, instantiated from sse12_cvt_s with fp_to_sint. All four use
// opcode 0x2C; the GR64 variants add VEX_W.
// NOTE(review): several continuation lines (scheduling-class argument and
// prefix modifiers) are missing from this excerpt; the embedded numbering
// skips them.
852 let isCodeGenOnly = 1, Predicates = [UseAVX] in {
853 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
854 "cvttss2si", "cvttss2si",
857 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
858 "cvttss2si", "cvttss2si",
860 XS, VEX, VEX_W, VEX_LIG;
861 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
862 "cvttsd2si", "cvttsd2si",
865 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
866 "cvttsd2si", "cvttsd2si",
868 XD, VEX, VEX_W, VEX_LIG;
871 // The assembler can recognize rr 64-bit instructions by seeing an rxx
872 // register, but the same isn't true when only memory operands are used.
873 // Provide explicit "l" and "q" assembly forms to disambiguate the operand
874 // size where appropriate.
// Codegen-only signed integer -> scalar FP conversions built from
// sse12_vcvt_avx (opcode 0x2A). The 32-bit sources pass "l" and the 64-bit
// sources pass "q" as the memory-form suffix; the 64-bit variants add VEX_W.
875 let isCodeGenOnly = 1 in {
876 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
877 WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
878 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
879 WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
880 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
881 WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
882 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
883 WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
884 } // isCodeGenOnly = 1
// Selection patterns for scalar sint_to_fp under UseAVX. The VCVTSI2S*
// instructions above have no built-in patterns, so each case is mapped here,
// with an IMPLICIT_DEF of the result type supplied as the extra $src1 operand.
886 let Predicates = [UseAVX] in {
// Memory-source forms (load folded into the rm instruction).
887 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
888 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
889 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
890 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
891 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
892 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
893 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
894 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
// Register-source forms.
896 def : Pat<(f32 (sint_to_fp GR32:$src)),
897 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
898 def : Pat<(f32 (sint_to_fp GR64:$src)),
899 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
900 def : Pat<(f64 (sint_to_fp GR32:$src)),
901 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
902 def : Pat<(f64 (sint_to_fp GR64:$src)),
903 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
// Codegen-only non-VEX scalar conversions built from sse12_cvt_s:
//  * CVTTS{S,D}2SI[64] - fp_to_sint from FR32/FR64 (opcode 0x2C);
//  * CVTSI[64]2S{S,D}  - sint_to_fp from GR32/GR64 (opcode 0x2A), whose
//    memory-form mnemonics carry a "{l}" / "{q}" suffix and whose rr forms
//    list ReadInt2Fpu as the extra SchedRead.
// The 64-bit integer variants add REX_W.
// NOTE(review): the tail lines of CVTTSS2SI and CVTTSD2SI are missing from
// this excerpt; the embedded numbering skips them.
906 let isCodeGenOnly = 1 in {
907 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
908 "cvttss2si", "cvttss2si",
910 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
911 "cvttss2si", "cvttss2si",
912 WriteCvtSS2I>, XS, REX_W;
913 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
914 "cvttsd2si", "cvttsd2si",
916 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
917 "cvttsd2si", "cvttsd2si",
918 WriteCvtSD2I>, XD, REX_W;
919 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
920 "cvtsi2ss", "cvtsi2ss{l}",
921 WriteCvtI2SS, ReadInt2Fpu>, XS;
922 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
923 "cvtsi2ss", "cvtsi2ss{q}",
924 WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
925 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
926 "cvtsi2sd", "cvtsi2sd{l}",
927 WriteCvtI2SD, ReadInt2Fpu>, XD;
928 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
929 "cvtsi2sd", "cvtsi2sd{q}",
930 WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
931 } // isCodeGenOnly = 1
933 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
934 // and/or XMM operand(s).
/// sse12_cvt_sint - two-operand conversion defining rr_Int and rm_Int, each
/// with the pattern (DstVT (OpNode (SrcVT <source>))).
///   opc           - opcode byte.
///   SrcRC / DstRC - register classes of the source and the result.
///   DstVT / SrcVT - value types applied to the result and the source.
///   OpNode        - DAG node performing the conversion.
///   memop         - memory operand type of rm_Int.
///   mem_cpat      - ComplexPattern matched as the rm_Int source.
///   asm           - mnemonic shared by both forms.
///   sched         - scheduling class; rm_Int uses sched.Folded.
936 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
937 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
938 Operand memop, ComplexPattern mem_cpat, string asm,
939 X86FoldableSchedWrite sched> {
940 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
941 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
942 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
// NOTE(review): rr_Int ends in a comma; its continuation (presumably the
// Sched<> annotation) is missing from this excerpt -- confirm upstream.
944 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
945 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
946 [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
947 Sched<[sched.Folded]>;
/// sse12_cvt_sint_3addr - pattern-less rr_Int / rm_Int conversion forms that
/// take a DstRC:$src1 input in addition to the converted $src2.
///   opc           - opcode byte.
///   SrcRC / DstRC - register classes of the converted source and the result.
///   x86memop      - memory operand type of rm_Int.
///   asm           - mnemonic; rm_Int appends "{" # mem # "}" to it.
///   mem           - suffix text placed inside those braces for rm_Int.
///   sched         - scheduling class.
/// Each def lists a two-operand and a three-operand assembly string.
/// NOTE(review): the final parameter line and the lines selecting between the
/// two strings are missing from this excerpt. Some instantiations in this file
/// pass an extra trailing 0, so presumably a flag parameter picks the form --
/// confirm against the upstream file.
950 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
951 RegisterClass DstRC, X86MemOperand x86memop,
952 string asm, string mem, X86FoldableSchedWrite sched,
954 let hasSideEffects = 0 in {
955 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
957 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
958 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
959 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
961 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
962 (ins DstRC:$src1, x86memop:$src2),
964 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
965 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
966 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Intrinsic (VR128-source) scalar double -> integer conversions using the
// X86cvts2si node, opcode 0x2D. The first pair is gated on UseAVX and tagged
// VEX; the second pair is the non-VEX form. The GR64/i64 variants add
// VEX_W / REX_W respectively.
970 let Predicates = [UseAVX] in {
971 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
972 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
973 WriteCvtSD2I>, XD, VEX, VEX_LIG;
974 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
975 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
976 WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
978 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
979 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
980 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
981 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
// Intrinsic (VR128-destination) integer -> scalar FP conversions built from
// sse12_cvt_sint_3addr, opcode 0x2A. The UseAVX instantiations pass a trailing
// 0 and are tagged VEX_4V; the SSE instantiations omit that argument and tie
// $src1 to $dst through the Constraints string.
984 let Predicates = [UseAVX] in {
985 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
986 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
987 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
988 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
989 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
990 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
991 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
992 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
// Two-address SSE forms.
994 let Constraints = "$src1 = $dst" in {
995 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
996 i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
997 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
998 i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
999 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1000 i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
1001 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1002 i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
// Assembler aliases for the (v)cvtsi2ss / (v)cvtsi2sd *_Int instructions.
// Register forms accept an "l" / "q" suffix on the mnemonic; memory forms
// accept the un-suffixed mnemonic and resolve to the 32-bit (i32mem)
// instruction.
// NOTE(review): the trailing `0, "att"` arguments are presumably the alias
// emit priority (0 = parse only) and the AT&T syntax variant -- confirm
// against the InstAlias class in Target.td.
1005 def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1006 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1007 def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1008 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1009 def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1010 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1011 def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1012 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
// Un-suffixed memory forms of the three-operand instructions.
1014 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1015 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1016 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1017 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
// Two-operand SSE equivalents.
1019 def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1020 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1021 def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1022 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1023 def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1024 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1025 def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1026 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1028 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1029 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1030 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1031 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1035 // Aliases for intrinsics
// Intrinsic (VR128-source) truncating scalar single -> integer conversions
// using the X86cvtts2Int node, opcode 0x2C, gated on UseAVX. The GR64/i64
// variant adds VEX_W.
1036 let Predicates = [UseAVX] in {
1037 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1038 ssmem, sse_load_f32, "cvttss2si",
1039 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1040 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1041 X86cvtts2Int, ssmem, sse_load_f32,
1042 "cvttss2si", WriteCvtSS2I>,
1043 XS, VEX, VEX_LIG, VEX_W;
// Intrinsic (VR128-source) truncating scalar double -> integer conversions
// using the X86cvtts2Int node on v2f64, opcode 0x2C with the XD prefix. The
// GR64/i64 variant adds VEX_W.
// These convert from double precision, so they use the SD -> integer
// scheduling class (WriteCvtSD2I), matching the non-VEX CVTTSD2SI64 and the
// (V)CVTSD2SI definitions in this file, rather than the SS class.
1044 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1045 sdmem, sse_load_f64, "cvttsd2si",
1046 WriteCvtSD2I>, XD, VEX, VEX_LIG;
1047 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1048 X86cvtts2Int, sdmem, sse_load_f64,
1049 "cvttsd2si", WriteCvtSD2I>,
1050 XD, VEX, VEX_LIG, VEX_W;
// Non-VEX intrinsic (VR128-source) truncating scalar FP -> integer
// conversions using the X86cvtts2Int node, opcode 0x2C. The GR64/i64 variants
// add REX_W.
// NOTE(review): the final lines of CVTTSS2SI and CVTTSD2SI (scheduling class
// and prefix) are missing from this excerpt; the embedded numbering skips
// them.
1052 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1053 ssmem, sse_load_f32, "cvttss2si",
1055 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1056 X86cvtts2Int, ssmem, sse_load_f32,
1057 "cvttss2si", WriteCvtSS2I>, XS, REX_W;
1058 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1059 sdmem, sse_load_f64, "cvttsd2si",
1061 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1062 X86cvtts2Int, sdmem, sse_load_f64,
1063 "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
// Assembler aliases accepting an explicit "l" / "q" suffix on
// (v)cvttss2si / (v)cvttsd2si. Each suffix is mapped to both the rr_Int and
// the rm_Int instruction of the matching destination width (GR32 for "l",
// GR64 for "q").
// NOTE(review): the trailing `0, "att"` arguments are presumably the alias
// emit priority (0 = parse only) and the AT&T syntax variant -- confirm
// against the InstAlias class in Target.td.
1065 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1066 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1067 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1068 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1069 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1070 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1071 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1072 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1073 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1074 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1075 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1076 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1077 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1078 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1079 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1080 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
// Non-VEX equivalents.
1082 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1083 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1084 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1085 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1086 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1087 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1088 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1089 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1090 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1091 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1092 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1093 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1094 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1095 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1096 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1097 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
// Intrinsic (VR128-source) scalar single -> integer conversions using the
// X86cvts2si node, opcode 0x2D: UseAVX forms first, then the non-VEX forms.
// The GR64/i64 variants add VEX_W / REX_W.
1099 let Predicates = [UseAVX] in {
1100 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1101 ssmem, sse_load_f32, "cvtss2si",
1102 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1103 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1104 ssmem, sse_load_f32, "cvtss2si",
1105 WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
// NOTE(review): the final line of CVTSS2SI below (scheduling class and
// prefix) is missing from this excerpt; the embedded numbering skips it.
1107 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1108 ssmem, sse_load_f32, "cvtss2si",
1110 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1111 ssmem, sse_load_f32, "cvtss2si",
1112 WriteCvtSS2I>, XS, REX_W;
// Packed signed doubleword -> packed single conversions built from
// sse12_cvt_p, opcode 0x5B: 128-bit and 256-bit (VEX_L) forms requiring
// HasAVX and NoVLX, then the SSE2 form. The AVX forms use the `load`
// fragment for the memory pattern and the SSE2 form uses `memop`.
1114 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1115 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1116 SSEPackedSingle, WriteCvtI2PS>,
1117 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1118 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1119 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1120 SSEPackedSingle, WriteCvtI2PSY>,
1121 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1123 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1124 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1125 SSEPackedSingle, WriteCvtI2PS>,
1126 PS, Requires<[UseSSE2]>;
// Assembler aliases accepting an explicit "l" / "q" suffix on
// (v)cvtss2si / (v)cvtsd2si. Each suffix is mapped to both the rr_Int and the
// rm_Int instruction of the matching destination width (GR32 for "l", GR64
// for "q"); memory forms use the ssmem / sdmem operand types.
// NOTE(review): the trailing `0, "att"` arguments are presumably the alias
// emit priority (0 = parse only) and the AT&T syntax variant -- confirm
// against the InstAlias class in Target.td.
1129 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1130 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1131 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1132 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1133 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1134 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1135 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1136 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1137 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1138 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1139 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1140 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1141 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1142 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1143 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1144 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
// Non-VEX equivalents.
1147 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1148 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1149 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1150 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1151 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1152 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1153 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1154 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1155 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1156 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1157 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1158 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1159 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1160 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1161 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1162 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1166 // Convert scalar double to scalar single
// Codegen-only UseAVX forms on FR32/FR64. Both defs take an extra FR32:$src1
// input and carry no built-in pattern; selection goes through the Pat<> at
// the end of this group, which passes IMPLICIT_DEF for $src1.
1167 let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
// The rr form passes the un-prefixed mnemonic to VSDI, while the rm form
// below spells out "vcvtsd2ss" and the XD prefix for the plain I class.
1168 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1169 (ins FR32:$src1, FR64:$src2),
1170 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1171 VEX_4V, VEX_LIG, VEX_WIG,
1172 Sched<[WriteCvtSD2SS]>;
1174 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1175 (ins FR32:$src1, f64mem:$src2),
1176 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1177 XD, VEX_4V, VEX_LIG, VEX_WIG,
1178 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// f32 fpround of an FR64 value selects the rr form.
// NOTE(review): this Pat ends in a comma; its continuation (presumably a
// Requires<> clause) is missing from this excerpt -- confirm upstream.
1181 def : Pat<(f32 (fpround FR64:$src)),
1182 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
// Codegen-only non-VEX scalar double -> single conversion with built-in
// fpround patterns. The memory form, which folds a loadf64, additionally
// requires UseSSE2 and OptForSize.
1185 let isCodeGenOnly = 1 in {
1186 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1187 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1188 [(set FR32:$dst, (fpround FR64:$src))]>,
1189 Sched<[WriteCvtSD2SS]>;
1190 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1191 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1192 [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
1193 XD, Requires<[UseSSE2, OptForSize]>,
1194 Sched<[WriteCvtSD2SS.Folded]>;
// Intrinsic (VR128) forms of scalar double -> single conversion, matching the
// X86frounds node with $src1 as its first operand. The AVX forms require
// UseAVX and print three operands; the SSE2 forms tie $src1 to $dst and print
// two.
// NOTE(review): the line opening each pattern list is missing from this
// excerpt; the embedded numbering skips those lines.
1197 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1198 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1199 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1201 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1202 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1203 Sched<[WriteCvtSD2SS]>;
1204 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1205 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1206 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1208 (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
1209 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1210 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Two-address SSE2 forms.
1211 let Constraints = "$src1 = $dst" in {
1212 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1213 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1214 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1216 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1217 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1218 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1219 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1220 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1222 (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
1223 XD, Requires<[UseSSE2]>,
1224 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1227 // Convert scalar single to scalar double
1228 // SSE2 instructions with XS prefix
// Codegen-only AVX forms on FR64/FR32. Both defs take an extra FR64:$src1
// input and carry no built-in pattern; the memory form additionally requires
// OptForSize.
1229 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1230 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1231 (ins FR64:$src1, FR32:$src2),
1232 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1233 XS, VEX_4V, VEX_LIG, VEX_WIG,
1234 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
1236 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1237 (ins FR64:$src1, f32mem:$src2),
1238 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1239 XS, VEX_4V, VEX_LIG, VEX_WIG,
1240 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1241 Requires<[UseAVX, OptForSize]>;
1242 } // isCodeGenOnly = 1, hasSideEffects = 0
// fpextend selects the forms above, with IMPLICIT_DEF passed as $src1. The
// load-folding pattern carries the same UseAVX + OptForSize requirement as
// VCVTSS2SDrm.
1244 def : Pat<(f64 (fpextend FR32:$src)),
1245 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1246 def : Pat<(fpextend (loadf32 addr:$src)),
1247 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
// Codegen-only non-VEX scalar single -> double conversion with built-in
// fpextend patterns. The register form requires UseSSE2; the memory form,
// which folds a loadf32, requires UseSSE2 and OptForSize.
1249 let isCodeGenOnly = 1 in {
1250 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1251 "cvtss2sd\t{$src, $dst|$dst, $src}",
1252 [(set FR64:$dst, (fpextend FR32:$src))]>,
1253 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
1254 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1255 "cvtss2sd\t{$src, $dst|$dst, $src}",
1256 [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
1257 XS, Requires<[UseSSE2, OptForSize]>,
1258 Sched<[WriteCvtSS2SD.Folded]>;
1259 } // isCodeGenOnly = 1
// Intrinsic (VR128) forms of scalar single -> double conversion. All four
// defs have empty pattern lists, so they are selected only through separate
// Pat<> definitions. The AVX forms require HasAVX and print three operands;
// the SSE2 forms tie $src1 to $dst and print two.
1261 let hasSideEffects = 0 in {
1262 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1263 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1264 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1265 []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1266 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1268 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1269 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1270 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1271 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1272 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1273 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1274 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1275 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1276 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1277 []>, XS, Requires<[UseSSE2]>,
1278 Sched<[WriteCvtSS2SD]>;
1280 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1281 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1282 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1283 []>, XS, Requires<[UseSSE2]>,
1284 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1286 } // hasSideEffects = 0
1288 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1289 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1290 // vmovs{s,d} instructions
// Each pattern matches an X86Movss / X86Movsd whose second operand is a
// scalar_to_vector of a converted scalar and selects the corresponding *_Int
// instruction with $dst as its first source operand.
// NOTE(review): the first X86Movs{s,d} operand line of every pattern (the one
// that binds $dst) is missing from this excerpt; the embedded numbering
// skips those lines.
1291 let Predicates = [UseAVX] in {
// double -> single (fpround of element 0).
1292 def : Pat<(v4f32 (X86Movss
1294 (v4f32 (scalar_to_vector
1295 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1296 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
// single -> double (fpextend of element 0).
1298 def : Pat<(v2f64 (X86Movsd
1300 (v2f64 (scalar_to_vector
1301 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1302 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
// integer -> single, from GR64 / i64 load / GR32 / i32 load.
1304 def : Pat<(v4f32 (X86Movss
1306 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1307 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1309 def : Pat<(v4f32 (X86Movss
1311 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1312 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1314 def : Pat<(v4f32 (X86Movss
1316 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1317 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1319 def : Pat<(v4f32 (X86Movss
1321 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1322 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
// integer -> double, from GR64 / i64 load / GR32 / i32 load.
1324 def : Pat<(v2f64 (X86Movsd
1326 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1327 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1329 def : Pat<(v2f64 (X86Movsd
1331 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1332 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1334 def : Pat<(v2f64 (X86Movsd
1336 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1337 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1339 def : Pat<(v2f64 (X86Movsd
1341 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1342 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1343 } // Predicates = [UseAVX]
// UseSSE2 counterparts: an X86Movss / X86Movsd of a scalar_to_vector of a
// converted scalar selects the two-address *_Int instruction, with $dst as
// its first source operand.
// NOTE(review): the first X86Movs{s,d} operand line of every pattern (the one
// that binds $dst) is missing from this excerpt; the embedded numbering
// skips those lines.
1345 let Predicates = [UseSSE2] in {
// double -> single (fpround of element 0).
1346 def : Pat<(v4f32 (X86Movss
1348 (v4f32 (scalar_to_vector
1349 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1350 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
// single -> double (fpextend of element 0).
1352 def : Pat<(v2f64 (X86Movsd
1354 (v2f64 (scalar_to_vector
1355 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1356 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
// integer -> double, from GR64 / i64 load / GR32 / i32 load.
1358 def : Pat<(v2f64 (X86Movsd
1360 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1361 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1363 def : Pat<(v2f64 (X86Movsd
1365 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1366 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1368 def : Pat<(v2f64 (X86Movsd
1370 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1371 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1373 def : Pat<(v2f64 (X86Movsd
1375 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1376 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1377 } // Predicates = [UseSSE2]
// UseSSE1 counterparts for integer -> single: an X86Movss of a
// scalar_to_vector of sint_to_fp selects the two-address CVTSI[64]2SS *_Int
// instruction, from GR64 / i64 load / GR32 / i32 load.
// NOTE(review): the first X86Movss operand line of every pattern (the one
// that binds $dst) is missing from this excerpt; the embedded numbering
// skips those lines.
1379 let Predicates = [UseSSE1] in {
1380 def : Pat<(v4f32 (X86Movss
1382 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1383 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1385 def : Pat<(v4f32 (X86Movss
1387 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1388 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1390 def : Pat<(v4f32 (X86Movss
1392 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1393 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1395 def : Pat<(v4f32 (X86Movss
1397 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1398 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1399 } // Predicates = [UseSSE1]
// Packed single -> packed doubleword conversion via the X86cvtp2Int node,
// opcode 0x5B. The VEX forms (128-bit and 256-bit with VEX_L) are gated on
// HasAVX and NoVLX; the plain SSE forms follow. The VEX memory patterns use
// loadv4f32 / loadv8f32 and the SSE memory pattern uses memopv4f32.
// NOTE(review): the `[(set ...` line opening several pattern lists is missing
// from this excerpt; the embedded numbering skips those lines.
1401 let Predicates = [HasAVX, NoVLX] in {
1402 // Convert packed single/double fp to doubleword
1403 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1404 "cvtps2dq\t{$src, $dst|$dst, $src}",
1405 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1406 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1407 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1408 "cvtps2dq\t{$src, $dst|$dst, $src}",
1410 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1411 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1412 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1413 "cvtps2dq\t{$src, $dst|$dst, $src}",
1415 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1416 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1417 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1418 "cvtps2dq\t{$src, $dst|$dst, $src}",
1420 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1421 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
// Non-VEX forms.
1423 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1424 "cvtps2dq\t{$src, $dst|$dst, $src}",
1425 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1426 Sched<[WriteCvtPS2I]>;
1427 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1428 "cvtps2dq\t{$src, $dst|$dst, $src}",
1430 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1431 Sched<[WriteCvtPS2ILd]>;
1434 // Convert Packed Double FP to Packed DW Integers
// Opcode 0xE6 with the X86cvtp2Int node. Both the 128-bit and the 256-bit
// source forms write a VR128 (v4i32) result, so the memory forms carry an
// "{x}" / "{y}" mnemonic suffix and the register forms get suffixed aliases.
// NOTE(review): the `[(set ...` line opening each pattern list is missing
// from this excerpt; the embedded numbering skips those lines.
1435 let Predicates = [HasAVX, NoVLX] in {
1436 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1437 // register, but the same isn't true when using memory operands instead.
1438 // Provide other assembly rr and rm forms to address this explicitly.
1439 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1440 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1442 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1443 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1446 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1447 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1449 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1450 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
// 256-bit source forms (VEX_L).
1453 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1454 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1456 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1457 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1458 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1459 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1461 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1462 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
// Suffixed spellings for the register forms.
1465 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1466 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1467 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1468 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
// Non-VEX forms; the memory pattern uses memopv2f64.
1470 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1471 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1473 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1474 Sched<[WriteCvtPD2ILd]>;
1475 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1476 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1478 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1479 Sched<[WriteCvtPD2I]>;
1481 // Convert with truncation packed single/double fp to doubleword
1482 // SSE2 packed instructions with XS prefix
// Opcode 0x5B with the X86cvttp2si node. The VEX forms (128-bit and 256-bit)
// are gated on HasAVX and NoVLX; the plain SSE forms follow. The VEX memory
// patterns use loadv4f32 / loadv8f32 and the SSE memory pattern uses
// memopv4f32.
// NOTE(review): the `[(set ...` line opening each pattern list, and one
// modifier line of VCVTTPS2DQYrm, are missing from this excerpt; the
// embedded numbering skips those lines.
1483 let Predicates = [HasAVX, NoVLX] in {
1484 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1485 "cvttps2dq\t{$src, $dst|$dst, $src}",
1487 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1488 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1489 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1490 "cvttps2dq\t{$src, $dst|$dst, $src}",
1492 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
1493 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1494 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1495 "cvttps2dq\t{$src, $dst|$dst, $src}",
1497 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
1498 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1499 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1500 "cvttps2dq\t{$src, $dst|$dst, $src}",
1502 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
1504 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
// Non-VEX forms.
1507 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1508 "cvttps2dq\t{$src, $dst|$dst, $src}",
1510 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1511 Sched<[WriteCvtPS2I]>;
1512 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1513 "cvttps2dq\t{$src, $dst|$dst, $src}",
1515 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
1516 Sched<[WriteCvtPS2ILd]>;
// Truncating packed double -> packed doubleword conversion, opcode 0xE6 with
// the X86cvttp2si node. Both the 128-bit and the 256-bit source forms write a
// VR128 (v4i32) result, so the memory forms carry an "{x}" / "{y}" mnemonic
// suffix and the register forms get suffixed aliases.
// NOTE(review): the `[(set ...` line opening each pattern list is missing
// from this excerpt; the embedded numbering skips those lines.
1518 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1519 // register, but the same isn't true when using memory operands instead.
1520 // Provide other assembly rr and rm forms to address this explicitly.
1521 let Predicates = [HasAVX, NoVLX] in {
1523 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1524 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1526 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1527 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1528 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1529 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1531 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
1532 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
// 256-bit source forms (VEX_L).
1535 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1536 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1538 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
1539 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1540 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1541 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1543 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
1544 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1545 } // Predicates = [HasAVX, NoVLX]
// Suffixed spellings for the register forms.
1547 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1548 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1549 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1550 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
// Generic fp_to_sint from v4f64 to v4i32 selects the 256-bit source forms.
1552 let Predicates = [HasAVX, NoVLX] in {
1553 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
1554 (VCVTTPD2DQYrr VR256:$src)>;
1555 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
1556 (VCVTTPD2DQYrm addr:$src)>;
// Non-VEX forms; the memory pattern uses memopv2f64.
1559 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1560 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1562 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1563 Sched<[WriteCvtPD2I]>;
1564 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1565 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1567 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
1568 Sched<[WriteCvtPD2ILd]>;
1570 // Convert packed single to packed double
// VEX-encoded forms, opcode 0x5A with the PS prefix class.
//   VCVTPS2PDrr / rm   - v2f64 result in VR128; the rm form reads f64mem.
//   VCVTPS2PDYrr / Yrm - v4f64 result in VR256 (VEX_L); the rm form reads
//                        f128mem.
// The memory forms are matched through extending-load fragments
// (extloadv2f32 / extloadv4f32) rather than an explicit extend of a load.
1571 let Predicates = [HasAVX, NoVLX] in {
1572 // SSE2 instructions without OpSize prefix
1573 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1574 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1575 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1576 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1577 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1578 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1579 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1580 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1581 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1582 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1583 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
1584 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1585 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1586 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1587 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1588 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
// Non-VEX forms of the packed single -> packed double conversion, available
// under UseSSE2. Same opcode (0x5A) and patterns as the 128-bit VEX forms;
// the rm form reads f64mem through extloadv2f32.
1591 let Predicates = [UseSSE2] in {
1592 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1593 "cvtps2pd\t{$src, $dst|$dst, $src}",
1594 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1595 PS, Sched<[WriteCvtPS2PD]>;
1596 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1597 "cvtps2pd\t{$src, $dst|$dst, $src}",
1598 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1599 PS, Sched<[WriteCvtPS2PD.Folded]>;
1602 // Convert Packed DW Integers to Packed Double FP
// VEX-encoded forms, opcode 0xE6 (S2SI).
//   VCVTDQ2PDrm / rr   - v2f64 result via X86VSintToFP; the rm form reads
//                        only i64mem and its pattern is built from a loadi64
//                        wrapped in scalar_to_vector.
//   VCVTDQ2PDYrm / Yrr - v4f64 result (VEX_L) via the generic sint_to_fp node;
//                        the rm form reads i128mem.
1603 let Predicates = [HasAVX, NoVLX] in {
// hasSideEffects / mayLoad are stated explicitly for the rm form only.
1604 let hasSideEffects = 0, mayLoad = 1 in
1605 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1606 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1608 (v2f64 (X86VSintToFP
1610 (v2i64 (scalar_to_vector
1611 (loadi64 addr:$src)))))))]>,
1612 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1613 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1614 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1616 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1617 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1618 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1619 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1621 (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
1622 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1624 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1625 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1627 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
1628 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
// Non-VEX forms of the packed dword -> packed double conversion (opcode 0xE6).
// As with the VEX form, the rm variant reads i64mem and states
// hasSideEffects / mayLoad explicitly.
1631 let hasSideEffects = 0, mayLoad = 1 in
1632 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1633 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1635 (v2f64 (X86VSintToFP
1637 (v2i64 (scalar_to_vector
1638 (loadi64 addr:$src)))))))]>,
1639 Sched<[WriteCvtI2PDLd]>;
1640 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1641 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1643 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1644 Sched<[WriteCvtI2PD]>;
// The two patterns below fold an X86vzload64 source (bitcast to v4i32) into
// the memory form of the dword -> double conversion, once for the VEX
// instruction and once for the legacy SSE2 instruction.
1646 // AVX register conversion intrinsics
1647 let Predicates = [HasAVX, NoVLX] in {
1648 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1649 (VCVTDQ2PDrm addr:$src)>;
1650 } // Predicates = [HasAVX, NoVLX]
1652 // SSE2 register conversion intrinsics
1653 let Predicates = [UseSSE2] in {
1654 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1655 (CVTDQ2PDrm addr:$src)>;
1656 } // Predicates = [UseSSE2]
1658 // Convert packed double to packed single
1659 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1660 // register, but the same isn't true when using memory operands instead.
1661 // Provide other assembly rr and rm forms to address this explicitly.
// The rm mnemonics below carry an {x} or {y} suffix for that purpose. All four
// definitions use opcode 0x5A, match X86vfpround and write to VR128.
1662 let Predicates = [HasAVX, NoVLX] in {
// XMM source: v2f64 held in a register (rr) or loaded from f128mem (rm).
1664 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1665 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1666 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1667 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1668 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1669 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1670 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
1671 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
// YMM source (VEX_L): VR256 register (rr) or a v4f64 load from f256mem (rm).
1673 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1674 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1675 [(set VR128:$dst, (X86vfpround VR256:$src))]>,
1676 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1677 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1678 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1679 [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
1680 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1681 } // Predicates = [HasAVX, NoVLX]
// AT&T-variant ("att") aliases that let the register-register forms also be
// written with an explicit x / y width suffix.
// NOTE(review): the 0 argument presumably keeps these aliases from being used
// when printing -- confirm against the InstAlias class definition.
1683 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1684 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1685 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1686 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
// Non-VEX (PDI) forms of the packed double -> packed single conversion.
// The rm pattern uses memopv2f64 where the VEX form uses loadv2f64.
1688 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1689 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1690 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1691 Sched<[WriteCvtPD2PS]>;
1692 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1693 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1694 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
1695 Sched<[WriteCvtPD2PS.Folded]>;
// Select the generic fpround node (v4f64 -> v4f32) to the 256-bit-source
// conversion, for a register source and for a folded load.
1697 let Predicates = [HasAVX, NoVLX] in {
1698 def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
1699 (VCVTPD2PSYrr VR256:$src)>;
1700 def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
1701 (VCVTPD2PSYrm addr:$src)>;
1704 //===----------------------------------------------------------------------===//
1705 // SSE 1 & 2 - Compare Instructions
1706 //===----------------------------------------------------------------------===//
1708 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Each instantiation emits two records, both with opcode 0xC2 and an 8-bit
// immediate $cc operand:
//   rr - compares RC:$src1 with RC:$src2; marked commutable.
//   rm - compares RC:$src1 with the value loaded from $src2 via ld_frag.
// Parameters:
//   RC       - register class of the sources and the result.
//   x86memop - memory operand type used by the rm form.
//   OpNode   - DAG node matched by both patterns.
//   VT       - value type applied to $src1 in the patterns.
//   ld_frag  - load fragment used by the rm pattern.
//   asm      - complete assembly string (mnemonic plus operands).
//   sched    - scheduling information; the rm form uses sched.Folded and
//              sched.ReadAfterFold.
1709 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1710 SDNode OpNode, ValueType VT,
1711 PatFrag ld_frag, string asm,
1712 X86FoldableSchedWrite sched> {
1713 let isCommutable = 1 in
1714 def rr : SIi8<0xC2, MRMSrcReg,
1715 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1716 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
1718 def rm : SIi8<0xC2, MRMSrcMem,
1719 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1720 [(set RC:$dst, (OpNode (VT RC:$src1),
1721 (ld_frag addr:$src2), imm:$cc))]>,
1722 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Scalar compares on FR32 / FR64 operands, matched through X86cmps.
// The VEX forms (VEX_4V) print three register/memory operands plus $cc; the
// legacy forms tie $src1 to $dst and print two.
1725 let isCodeGenOnly = 1 in {
1726 let ExeDomain = SSEPackedSingle in
1727 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1728 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1729 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1730 let ExeDomain = SSEPackedDouble in
1731 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1732 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1733 SchedWriteFCmpSizes.PD.Scl>,
1734 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy two-operand encodings: destination doubles as first source.
1736 let Constraints = "$src1 = $dst" in {
1737 let ExeDomain = SSEPackedSingle in
1738 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1739 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1740 SchedWriteFCmpSizes.PS.Scl>, XS;
1741 let ExeDomain = SSEPackedDouble in
1742 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1743 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1744 SchedWriteFCmpSizes.PD.Scl>, XD;
// sse12_cmp_scalar_int - intrinsic flavour of the scalar compare.
//
// Emits rr_Int and rm_Int records (opcode 0xC2, 8-bit immediate $cc) whose
// operands and result are full VR128 registers and whose patterns match the
// intrinsic Int directly.
// Parameters:
//   memop    - memory operand type used by the rm_Int form.
//   Int      - intrinsic matched by both patterns.
//   asm      - complete assembly string (mnemonic plus operands).
//   sched    - scheduling information; rm_Int uses sched.Folded and
//              sched.ReadAfterFold.
//   mem_cpat - complex pattern that matches the memory source of rm_Int.
1748 multiclass sse12_cmp_scalar_int<Operand memop,
1749 Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1750 ComplexPattern mem_cpat> {
1751 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1752 (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
1753 [(set VR128:$dst, (Int VR128:$src1,
1754 VR128:$src, imm:$cc))]>,
1757 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1758 (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
1759 [(set VR128:$dst, (Int VR128:$src1,
1760 mem_cpat:$src, imm:$cc))]>,
1761 Sched<[sched.Folded, sched.ReadAfterFold]>;
1764 // Aliases to match intrinsics which expect XMM operand(s).
// These reuse the VCMPSS / VCMPSD / CMPSS / CMPSD prefixes; the records they
// produce are distinguished by the rr_Int / rm_Int suffixes of the multiclass.
1765 let ExeDomain = SSEPackedSingle in
1766 defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1767 "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1768 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1769 XS, VEX_4V, VEX_LIG, VEX_WIG;
1770 let ExeDomain = SSEPackedDouble in
1771 defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1772 "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1773 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1774 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy two-operand encodings: destination doubles as first source.
1775 let Constraints = "$src1 = $dst" in {
1776 let ExeDomain = SSEPackedSingle in
1777 defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1778 "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
1779 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1780 let ExeDomain = SSEPackedDouble in
1781 defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1782 "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
1783 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1787 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits rr and rm records that have no register outputs; their patterns set
// EFLAGS from OpNode applied to $src1 and $src2.
// Parameters:
//   opc       - opcode byte.
//   RC        - register class of the sources.
//   OpNode    - DAG node whose result is written to EFLAGS.
//   vt        - value type applied to $src1 in the patterns.
//   x86memop  - memory operand type used by the rm form.
//   ld_frag   - load fragment used by the rm pattern.
//   OpcodeStr - mnemonic; the operand string is appended here.
//   sched     - scheduling information; the rm form uses sched.Folded and
//               sched.ReadAfterFold.
1788 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1789 ValueType vt, X86MemOperand x86memop,
1790 PatFrag ld_frag, string OpcodeStr,
1791 X86FoldableSchedWrite sched> {
1792 let hasSideEffects = 0 in {
1793 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1794 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1795 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1798 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1799 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1800 [(set EFLAGS, (OpNode (vt RC:$src1),
1801 (ld_frag addr:$src2)))]>,
1802 Sched<[sched.Folded, sched.ReadAfterFold]>;
1806 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Emits rr_Int and rm_Int records with the same operand layout and assembly
// string as sse12_ord_cmp. The memory form takes an Operand (memop) and is
// parameterised by a ComplexPattern (mem_cpat) instead of a load fragment.
1807 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1808 ValueType vt, Operand memop,
1809 ComplexPattern mem_cpat, string OpcodeStr,
1810 X86FoldableSchedWrite sched> {
1811 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1812 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1813 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1816 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1817 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1818 [(set EFLAGS, (OpNode (vt RC:$src1),
1820 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Scalar compare-and-set-EFLAGS instructions. Opcode 0x2E is (V)UCOMISS/SD
// and opcode 0x2F is (V)COMISS/SD; PS selects the single-precision and PD the
// double-precision variant. Every record in this scope defines EFLAGS.
1823 let Defs = [EFLAGS] in {
// VEX forms on FR32 / FR64, matched through X86cmp.
1824 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1825 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1826 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1827 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
// The FR32 / FR64 COMIS forms are given an empty Pattern (OpNode is undef),
// so no selection pattern is attached to them.
1828 let Pattern = []<dag> in {
1829 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1830 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1831 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1832 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
// VEX intrinsic forms on VR128, matched through X86ucomi / X86comi.
1835 let isCodeGenOnly = 1 in {
1836 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1837 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1838 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1839 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1841 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1842 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1843 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1844 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
// Legacy (non-VEX) forms, in the same order as the VEX forms above.
1846 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1847 "ucomiss", WriteFCom>, PS;
1848 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1849 "ucomisd", WriteFCom>, PD;
1851 let Pattern = []<dag> in {
1852 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1853 "comiss", WriteFCom>, PS;
1854 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1855 "comisd", WriteFCom>, PD;
1858 let isCodeGenOnly = 1 in {
1859 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1860 sse_load_f32, "ucomiss", WriteFCom>, PS;
1861 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1862 sse_load_f64, "ucomisd", WriteFCom>, PD;
1864 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1865 sse_load_f32, "comiss", WriteFCom>, PS;
1866 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1867 sse_load_f64, "comisd", WriteFCom>, PD;
1869 } // Defs = [EFLAGS]
1871 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits rri and rmi records (opcode 0xC2, 8-bit immediate $cc) whose patterns
// match X86cmpp with result type VT. The rri form is marked commutable.
// Parameters:
//   RC       - register class of the sources and the result.
//   x86memop - memory operand type used by the rmi form.
//   VT       - vector value type of the compare result.
//   asm      - complete assembly string (mnemonic plus operands).
//   sched    - scheduling information; rmi uses sched.Folded and
//              sched.ReadAfterFold.
//   d        - execution domain passed to the instruction class.
//   ld_frag  - load fragment used by the rmi pattern.
1872 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1873 ValueType VT, string asm,
1874 X86FoldableSchedWrite sched,
1875 Domain d, PatFrag ld_frag> {
1876 let isCommutable = 1 in
1877 def rri : PIi8<0xC2, MRMSrcReg,
1878 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1879 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
1881 def rmi : PIi8<0xC2, MRMSrcMem,
1882 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1884 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
1885 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Packed compare instantiations.
// VEX forms: 128-bit (VR128) and 256-bit (VR256, VEX_L) variants for v4f32 /
// v2f64 / v8f32 / v4f64, using the load* fragments for the memory operand.
1888 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1889 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1890 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1891 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1892 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1893 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1894 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1895 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1896 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1897 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1898 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1899 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy two-operand forms: $src1 is tied to $dst and the memory operand is
// matched with the memop* fragments.
1900 let Constraints = "$src1 = $dst" in {
1901 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1902 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1903 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1904 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1905 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1906 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
// CommutableCMPCC - matches an immediate compare predicate whose low three
// bits equal 0, 3, 4 or 7. The patterns that use it swap the two compare
// operands, so it must only accept predicates whose result does not depend on
// operand order.
// NOTE(review): in the CMPPS/CMPPD predicate encoding these values are
// presumably EQ (0), UNORD (3), NEQ (4) and ORD (7) -- confirm against the
// Intel SDM.
1909 def CommutableCMPCC : PatLeaf<(imm), [{
1910 uint64_t Imm = N->getZExtValue() & 0x7;
1911 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1914 // Patterns to select compares with loads in first operand.
// In every pattern below the load is the FIRST compare operand in the DAG but
// becomes the memory (second) operand of the selected instruction, i.e. the
// operands are swapped. CommutableCMPCC restricts the match to predicates
// for which that swap is allowed.
1915 let Predicates = [HasAVX] in {
1916 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
1917 CommutableCMPCC:$cc)),
1918 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
1920 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
1921 CommutableCMPCC:$cc)),
1922 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
1924 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
1925 CommutableCMPCC:$cc)),
1926 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
1928 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
1929 CommutableCMPCC:$cc)),
1930 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
1932 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1933 CommutableCMPCC:$cc)),
1934 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
1936 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1937 CommutableCMPCC:$cc)),
1938 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
// Legacy double-precision forms; the packed pattern uses memopv2f64.
1941 let Predicates = [UseSSE2] in {
1942 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
1943 CommutableCMPCC:$cc)),
1944 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
1946 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1947 CommutableCMPCC:$cc)),
1948 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
// Legacy single-precision forms; the packed pattern uses memopv4f32.
1951 let Predicates = [UseSSE1] in {
1952 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
1953 CommutableCMPCC:$cc)),
1954 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
1956 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1957 CommutableCMPCC:$cc)),
1958 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
1961 //===----------------------------------------------------------------------===//
1962 // SSE 1 & 2 - Shuffle Instructions
1963 //===----------------------------------------------------------------------===//
1965 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits rmi and rri records (opcode 0xC6) whose patterns match X86Shufp on
/// $src1, $src2 and an i8 immediate $src3.
/// Parameters:
///   RC           - register class of the sources and the result.
///   x86memop     - memory operand type used by the rmi form.
///   vt           - vector value type of the result.
///   asm          - complete assembly string (mnemonic plus operands).
///   mem_frag     - load fragment used by the rmi pattern.
///   sched        - scheduling information; rmi uses sched.Folded and
///                  sched.ReadAfterFold.
///   d            - execution domain passed to the instruction class.
///   IsCommutable - value assigned to isCommutable on the rri form only
///                  (defaults to 0).
1966 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
1967 ValueType vt, string asm, PatFrag mem_frag,
1968 X86FoldableSchedWrite sched, Domain d,
1969 bit IsCommutable = 0> {
1970 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
1971 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
1972 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
1973 (i8 imm:$src3))))], d>,
1974 Sched<[sched.Folded, sched.ReadAfterFold]>;
1975 let isCommutable = IsCommutable in
1976 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
1977 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
1978 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
1979 (i8 imm:$src3))))], d>,
// FP shuffle instantiations.
// VEX forms: 128-bit and 256-bit (VEX_L) variants for PS and PD, all with the
// default IsCommutable = 0.
1983 let Predicates = [HasAVX, NoVLX] in {
1984 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
1985 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
1986 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
1987 PS, VEX_4V, VEX_WIG;
1988 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
1989 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
1990 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
1991 PS, VEX_4V, VEX_L, VEX_WIG;
1992 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
1993 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
1994 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
1995 PD, VEX_4V, VEX_WIG;
1996 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
1997 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
1998 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
1999 PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy two-operand forms ($src1 tied to $dst). Only SHUFPD passes
// IsCommutable = 1.
// NOTE(review): the target's commute hook presumably rewrites the immediate
// when SHUFPD operands are swapped -- confirm in X86InstrInfo.
2001 let Constraints = "$src1 = $dst" in {
2002 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2003 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2004 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2005 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2006 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2007 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2010 //===----------------------------------------------------------------------===//
2011 // SSE 1 & 2 - Unpack FP Instructions
2012 //===----------------------------------------------------------------------===//
2014 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits rr and rm records whose patterns apply OpNode to $src1 and $src2
/// with result type vt.
/// Parameters:
///   opc          - opcode byte.
///   OpNode       - DAG node matched by both patterns.
///   vt           - vector value type of the result.
///   mem_frag     - load fragment used by the rm pattern.
///   RC           - register class of the sources and the result.
///   x86memop     - memory operand type used by the rm form.
///   asm          - complete assembly string (mnemonic plus operands).
///   sched        - scheduling information; rm uses sched.Folded and
///                  sched.ReadAfterFold.
///   d            - execution domain passed to the instruction class.
///   IsCommutable - value assigned to isCommutable on the rr form only
///                  (defaults to 0).
2015 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2016 PatFrag mem_frag, RegisterClass RC,
2017 X86MemOperand x86memop, string asm,
2018 X86FoldableSchedWrite sched, Domain d,
2019 bit IsCommutable = 0> {
2020 let isCommutable = IsCommutable in
2021 def rr : PI<opc, MRMSrcReg,
2022 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2024 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2026 def rm : PI<opc, MRMSrcMem,
2027 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2029 (vt (OpNode RC:$src1,
2030 (mem_frag addr:$src2))))], d>,
2031 Sched<[sched.Folded, sched.ReadAfterFold]>;
// FP unpack instantiations. Opcode 0x15 is the "high" unpack (X86Unpckh) and
// opcode 0x14 the "low" unpack (X86Unpckl).
// VEX forms: 128-bit first, then 256-bit (VEX_L). Only VUNPCKHPD passes
// IsCommutable = 1.
2034 let Predicates = [HasAVX, NoVLX] in {
2035 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2036 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2037 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2038 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2039 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2040 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2041 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2042 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2043 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2044 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2045 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2046 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2048 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2049 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2050 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2051 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2052 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2053 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2054 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2055 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2056 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2057 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2058 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2059 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2060 }// Predicates = [HasAVX, NoVLX]
// Legacy two-operand forms ($src1 tied to $dst), using the memop fragment for
// the memory operand. Only UNPCKHPD passes IsCommutable = 1.
2062 let Constraints = "$src1 = $dst" in {
2063 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2064 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2065 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2066 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2067 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2068 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2069 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2070 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2071 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2072 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2073 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2074 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2075 } // Constraints = "$src1 = $dst"
// With AVX1 only, 256-bit integer unpacks are selected to the floating-point
// unpack instructions: v8i32 maps to the PS forms and v4i64 to the PD forms,
// each for a register and a folded-load second operand.
2077 let Predicates = [HasAVX1Only] in {
2078 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2079 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2080 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2081 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2082 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2083 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2084 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2085 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2087 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2088 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2089 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2090 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2091 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2092 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2093 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2094 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2097 let Predicates = [UseSSE2] in {
2098 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
// The pattern matches any non-volatile v2f64 load as the second operand of
// X86Unpckl and selects MOVHPDrm with the same address.
// NOTE(review): this relies on MOVHPD reading only the low 64 bits at the
// address, which is the half X86Unpckl takes from its second operand --
// confirm against the instruction definitions.
2099 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2100 (v2f64 (nonvolatile_load addr:$src2)))),
2101 (MOVHPDrm VR128:$src1, addr:$src2)>;
2104 //===----------------------------------------------------------------------===//
2105 // SSE 1 & 2 - Extract Floating-Point Sign mask
2106 //===----------------------------------------------------------------------===//
2108 /// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single rr record (opcode 0x50) that reads a vector register and
/// writes the X86movmsk result to a general-purpose register.
/// Parameters:
///   RC  - register class of the vector source.
///   vt  - vector value type applied to the source in the pattern.
///   asm - mnemonic; the operand string is appended here.
///   d   - execution domain passed to the instruction class.
2109 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2110 string asm, Domain d> {
2111 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2112 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2113 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2114 Sched<[WriteFMOVMSK]>;
// VEX sign-mask extraction for 128-bit and 256-bit (VEX_L) PS / PD vectors.
2117 let Predicates = [HasAVX] in {
2118 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2119 SSEPackedSingle>, PS, VEX, VEX_WIG;
2120 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2121 SSEPackedDouble>, PD, VEX, VEX_WIG;
2122 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2123 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2124 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2125 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2127 // Also support integer VTs to avoid an int->fp bitcast in the DAG.
// 32-bit element vectors map to the PS forms, 64-bit element vectors to PD.
2128 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2129 (VMOVMSKPSrr VR128:$src)>;
2130 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2131 (VMOVMSKPDrr VR128:$src)>;
2132 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2133 (VMOVMSKPSYrr VR256:$src)>;
2134 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2135 (VMOVMSKPDYrr VR256:$src)>;
// Legacy (non-VEX) sign-mask extraction for 128-bit PS / PD vectors.
2138 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2139 SSEPackedSingle>, PS;
2140 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2141 SSEPackedDouble>, PD;
2143 let Predicates = [UseSSE2] in {
2144 // Also support integer VTs to avoid an int->fp bitcast in the DAG.
2145 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2146 (MOVMSKPSrr VR128:$src)>;
2147 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2148 (MOVMSKPDrr VR128:$src)>;
2151 //===---------------------------------------------------------------------===//
2152 // SSE2 - Packed Integer Logical Instructions
2153 //===---------------------------------------------------------------------===//
2155 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2157 /// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits rr and rm records whose patterns apply OpNode to $src1 and $src2
/// with result type OpVT.
/// Parameters:
///   opc          - opcode byte.
///   OpcodeStr    - mnemonic; one of two operand strings is appended (a
///                  two-operand and a three-operand spelling are listed).
///   OpNode       - DAG node matched by both patterns.
///   OpVT         - vector value type of the result.
///   RC           - register class of the sources and the result.
///   memop_frag   - load fragment used by the rm pattern.
///   x86memop     - memory operand type used by the rm form.
///   sched        - scheduling information; rm uses sched.Folded and
///                  sched.ReadAfterFold.
///   IsCommutable - value assigned to isCommutable on the rr form only.
///   Is2Addr      - NOTE(review): presumably selects the two-operand
///                  assembly string when set; the selecting expression is not
///                  visible in this excerpt -- confirm.
2158 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2159 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2160 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2161 bit IsCommutable, bit Is2Addr> {
2162 let isCommutable = IsCommutable in
2163 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2164 (ins RC:$src1, RC:$src2),
2166 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2167 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2168 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2170 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2171 (ins RC:$src1, x86memop:$src2),
2173 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2174 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2175 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2176 Sched<[sched.Folded, sched.ReadAfterFold]>;
2178 } // ExeDomain = SSEPackedInt
// PDI_binop_all - instantiates PDI_binop_rm three times for one operation:
//   V<NAME>  - 128-bit VEX form (HasAVX), "v"-prefixed mnemonic, Is2Addr = 0.
//   <NAME>   - 128-bit legacy form with $src1 tied to $dst, Is2Addr = 1,
//              using the memop fragment.
//   V<NAME>Y - 256-bit VEX form (HasAVX2, VEX_L), "v"-prefixed mnemonic,
//              Is2Addr = 0.
// Both VEX forms are additionally gated on the predicate prd.
// OpVT128 / OpVT256 give the result types of the 128-bit and 256-bit forms;
// sched supplies sched.XMM and sched.YMM respectively.
2180 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2181 ValueType OpVT128, ValueType OpVT256,
2182 X86SchedWriteWidths sched, bit IsCommutable,
2184 let Predicates = [HasAVX, prd] in
2185 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2186 VR128, load, i128mem, sched.XMM,
2187 IsCommutable, 0>, VEX_4V, VEX_WIG;
2189 let Constraints = "$src1 = $dst" in
2190 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2191 memop, i128mem, sched.XMM, IsCommutable, 1>;
2193 let Predicates = [HasAVX2, prd] in
2194 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2195 OpVT256, VR256, load, i256mem, sched.YMM,
2196 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2199 // These are ordered here for pattern ordering requirements with the fp versions
// Integer logic operations on v2i64 / v4i64. PAND, POR and PXOR are
// commutable; PANDN (X86andnp) is not.
2201 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2202 SchedWriteVecLogic, 1, NoVLX>;
2203 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2204 SchedWriteVecLogic, 1, NoVLX>;
2205 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2206 SchedWriteVecLogic, 1, NoVLX>;
2207 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2208 SchedWriteVecLogic, 0, NoVLX>;
2210 //===----------------------------------------------------------------------===//
2211 // SSE 1 & 2 - Logical Instructions
2212 //===----------------------------------------------------------------------===//
2214 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2216 /// There are no patterns here because isel prefers integer versions for SSE2
2217 /// and later. There are SSE1 v4f32 patterns later.
///
/// For one opcode this instantiates sse12_fp_packed_logical_rm as:
///   V<NAME>PSY / V<NAME>PDY - 256-bit VEX forms (VR256, f256mem, sched.YMM).
///   V<NAME>PS  / V<NAME>PD  - 128-bit VEX forms (VR128, f128mem, sched.XMM).
///   <NAME>PS   / <NAME>PD   - legacy forms with $src1 tied to $dst.
/// The VEX instantiations pass two empty pattern lists followed by 0.
/// NOTE(review): OpNode is not referenced in the lines visible here.
2218 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2219 SDNode OpNode, X86SchedWriteWidths sched> {
2220 let Predicates = [HasAVX, NoVLX] in {
2221 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2222 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2223 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2225 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2226 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2227 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2229 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2230 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2231 [], [], 0>, PS, VEX_4V, VEX_WIG;
2233 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2234 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2235 [], [], 0>, PD, VEX_4V, VEX_WIG;
2238 let Constraints = "$src1 = $dst" in {
2239 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2240 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2243 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2244 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
// Packed FP logical operations (opcodes 0x54 - 0x57).
2249 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2250 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2251 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
// and-not is explicitly marked non-commutable.
2252 let isCommutable = 0 in
2253 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// With AVX2 (and no VLX), select 256-bit and / or / xor / X86andnp on v32i8,
// v16i16 and v8i32 to the VPAND / VPOR / VPXOR / VPANDN Y-register
// instructions. v4i64 is not listed here because the PDI_binop_all
// instantiations of those instructions already use v4i64 as their 256-bit
// pattern type.
2255 let Predicates = [HasAVX2, NoVLX] in {
// Register-register forms.
2256 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2257 (VPANDYrr VR256:$src1, VR256:$src2)>;
2258 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2259 (VPANDYrr VR256:$src1, VR256:$src2)>;
2260 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2261 (VPANDYrr VR256:$src1, VR256:$src2)>;
2263 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2264 (VPORYrr VR256:$src1, VR256:$src2)>;
2265 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2266 (VPORYrr VR256:$src1, VR256:$src2)>;
2267 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2268 (VPORYrr VR256:$src1, VR256:$src2)>;
2270 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2271 (VPXORYrr VR256:$src1, VR256:$src2)>;
2272 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2273 (VPXORYrr VR256:$src1, VR256:$src2)>;
2274 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2275 (VPXORYrr VR256:$src1, VR256:$src2)>;
2277 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2278 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2279 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2280 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2281 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2282 (VPANDNYrr VR256:$src1, VR256:$src2)>;
// Register-memory forms: the second operand is a folded 256-bit load.
2284 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2285 (VPANDYrm VR256:$src1, addr:$src2)>;
2286 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2287 (VPANDYrm VR256:$src1, addr:$src2)>;
2288 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2289 (VPANDYrm VR256:$src1, addr:$src2)>;
2291 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2292 (VPORYrm VR256:$src1, addr:$src2)>;
2293 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2294 (VPORYrm VR256:$src1, addr:$src2)>;
2295 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2296 (VPORYrm VR256:$src1, addr:$src2)>;
2298 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2299 (VPXORYrm VR256:$src1, addr:$src2)>;
2300 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2301 (VPXORYrm VR256:$src1, addr:$src2)>;
2302 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2303 (VPXORYrm VR256:$src1, addr:$src2)>;
2305 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2306 (VPANDNYrm VR256:$src1, addr:$src2)>;
2307 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2308 (VPANDNYrm VR256:$src1, addr:$src2)>;
2309 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2310 (VPANDNYrm VR256:$src1, addr:$src2)>;
2313 // If only AVX1 is supported, we need to handle integer operations with
2314 // floating point instructions since the integer versions aren't available.
// All four 256-bit integer element types (v32i8, v16i16, v8i32, v4i64) are
// listed for each of and / or / xor / X86andnp and selected to the packed
// single FP instruction (VANDPSY / VORPSY / VXORPSY / VANDNPSY).
// Register-register forms come first, then forms that fold the second
// operand from a 256-bit load (loadv*).
// NOTE(review): the closing brace of this `let` block is not visible in this
// excerpt.
2315 let Predicates = [HasAVX1Only] in {
2316 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2317 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2318 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2319 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2320 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2321 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2322 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2323 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2325 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2326 (VORPSYrr VR256:$src1, VR256:$src2)>;
2327 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2328 (VORPSYrr VR256:$src1, VR256:$src2)>;
2329 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2330 (VORPSYrr VR256:$src1, VR256:$src2)>;
2331 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2332 (VORPSYrr VR256:$src1, VR256:$src2)>;
2334 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2335 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2336 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2337 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2338 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2339 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2340 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2341 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2343 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2344 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2345 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2346 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2347 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2348 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2349 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2350 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
// Load-folded (rm) forms of the same four operations.
2352 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2353 (VANDPSYrm VR256:$src1, addr:$src2)>;
2354 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2355 (VANDPSYrm VR256:$src1, addr:$src2)>;
2356 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2357 (VANDPSYrm VR256:$src1, addr:$src2)>;
2358 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2359 (VANDPSYrm VR256:$src1, addr:$src2)>;
2361 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2362 (VORPSYrm VR256:$src1, addr:$src2)>;
2363 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2364 (VORPSYrm VR256:$src1, addr:$src2)>;
2365 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2366 (VORPSYrm VR256:$src1, addr:$src2)>;
2367 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2368 (VORPSYrm VR256:$src1, addr:$src2)>;
2370 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2371 (VXORPSYrm VR256:$src1, addr:$src2)>;
2372 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2373 (VXORPSYrm VR256:$src1, addr:$src2)>;
2374 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2375 (VXORPSYrm VR256:$src1, addr:$src2)>;
2376 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2377 (VXORPSYrm VR256:$src1, addr:$src2)>;
2379 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2380 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2381 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2382 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2383 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2384 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2385 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2386 (VANDNPSYrm VR256:$src1, addr:$src2)>;
// 128-bit integer logic patterns for VEX-encoded targets ([HasAVX, NoVLX]).
// The v16i8, v8i16 and v4i32 forms of and / or / xor / X86andnp are mapped to
// VPAND / VPOR / VPXOR / VPANDN, first register-register and then with the
// second operand folded from a load (loadv*).
// NOTE(review): no v2i64 patterns appear here -- presumably that type is
// matched elsewhere; confirm. The closing brace of this `let` is not visible
// in this excerpt.
2389 let Predicates = [HasAVX, NoVLX] in {
2390 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2391 (VPANDrr VR128:$src1, VR128:$src2)>;
2392 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2393 (VPANDrr VR128:$src1, VR128:$src2)>;
2394 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2395 (VPANDrr VR128:$src1, VR128:$src2)>;
2397 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2398 (VPORrr VR128:$src1, VR128:$src2)>;
2399 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2400 (VPORrr VR128:$src1, VR128:$src2)>;
2401 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2402 (VPORrr VR128:$src1, VR128:$src2)>;
2404 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2405 (VPXORrr VR128:$src1, VR128:$src2)>;
2406 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2407 (VPXORrr VR128:$src1, VR128:$src2)>;
2408 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2409 (VPXORrr VR128:$src1, VR128:$src2)>;
2411 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2412 (VPANDNrr VR128:$src1, VR128:$src2)>;
2413 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2414 (VPANDNrr VR128:$src1, VR128:$src2)>;
2415 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2416 (VPANDNrr VR128:$src1, VR128:$src2)>;
// Load-folded (rm) forms of the same four operations.
2418 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2419 (VPANDrm VR128:$src1, addr:$src2)>;
2420 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2421 (VPANDrm VR128:$src1, addr:$src2)>;
2422 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2423 (VPANDrm VR128:$src1, addr:$src2)>;
2425 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2426 (VPORrm VR128:$src1, addr:$src2)>;
2427 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2428 (VPORrm VR128:$src1, addr:$src2)>;
2429 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2430 (VPORrm VR128:$src1, addr:$src2)>;
2432 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2433 (VPXORrm VR128:$src1, addr:$src2)>;
2434 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2435 (VPXORrm VR128:$src1, addr:$src2)>;
2436 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2437 (VPXORrm VR128:$src1, addr:$src2)>;
2439 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2440 (VPANDNrm VR128:$src1, addr:$src2)>;
2441 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2442 (VPANDNrm VR128:$src1, addr:$src2)>;
2443 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2444 (VPANDNrm VR128:$src1, addr:$src2)>;
// 128-bit integer logic patterns for legacy SSE2 ([UseSSE2]). The v16i8,
// v8i16 and v4i32 forms of and / or / xor / X86andnp are mapped to
// PAND / POR / PXOR / PANDN. The load-folded forms use the memopv* fragments
// rather than the loadv* fragments used by the VEX patterns.
// NOTE(review): presumably memop* adds an alignment requirement for legacy
// SSE memory operands -- confirm against the fragment definitions. The
// closing brace of this `let` is not visible in this excerpt.
2447 let Predicates = [UseSSE2] in {
2448 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2449 (PANDrr VR128:$src1, VR128:$src2)>;
2450 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2451 (PANDrr VR128:$src1, VR128:$src2)>;
2452 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2453 (PANDrr VR128:$src1, VR128:$src2)>;
2455 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2456 (PORrr VR128:$src1, VR128:$src2)>;
2457 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2458 (PORrr VR128:$src1, VR128:$src2)>;
2459 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2460 (PORrr VR128:$src1, VR128:$src2)>;
2462 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2463 (PXORrr VR128:$src1, VR128:$src2)>;
2464 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2465 (PXORrr VR128:$src1, VR128:$src2)>;
2466 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2467 (PXORrr VR128:$src1, VR128:$src2)>;
2469 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2470 (PANDNrr VR128:$src1, VR128:$src2)>;
2471 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2472 (PANDNrr VR128:$src1, VR128:$src2)>;
2473 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2474 (PANDNrr VR128:$src1, VR128:$src2)>;
// Load-folded (rm) forms of the same four operations.
2476 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2477 (PANDrm VR128:$src1, addr:$src2)>;
2478 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2479 (PANDrm VR128:$src1, addr:$src2)>;
2480 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2481 (PANDrm VR128:$src1, addr:$src2)>;
2483 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2484 (PORrm VR128:$src1, addr:$src2)>;
2485 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2486 (PORrm VR128:$src1, addr:$src2)>;
2487 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2488 (PORrm VR128:$src1, addr:$src2)>;
2490 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2491 (PXORrm VR128:$src1, addr:$src2)>;
2492 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2493 (PXORrm VR128:$src1, addr:$src2)>;
2494 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2495 (PXORrm VR128:$src1, addr:$src2)>;
2497 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2498 (PANDNrm VR128:$src1, addr:$src2)>;
2499 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2500 (PANDNrm VR128:$src1, addr:$src2)>;
2501 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2502 (PANDNrm VR128:$src1, addr:$src2)>;
2505 // Patterns for packed operations when we don't have integer type available.
// Maps the X86fand / X86for / X86fxor / X86fandn nodes on v4f32 to
// ANDPS / ORPS / XORPS / ANDNPS, in register-register form and with the
// second operand folded through memopv4f32.
// NOTE(review): no `let Predicates` wrapper is visible around these patterns
// in this excerpt -- confirm whether one is intended.
2506 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2507 (ANDPSrr VR128:$src1, VR128:$src2)>;
2508 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2509 (ORPSrr VR128:$src1, VR128:$src2)>;
2510 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2511 (XORPSrr VR128:$src1, VR128:$src2)>;
2512 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2513 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2515 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2516 (ANDPSrm VR128:$src1, addr:$src2)>;
2517 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2518 (ORPSrm VR128:$src1, addr:$src2)>;
2519 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2520 (XORPSrm VR128:$src1, addr:$src2)>;
2521 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2522 (ANDNPSrm VR128:$src1, addr:$src2)>;
2524 //===----------------------------------------------------------------------===//
2525 // SSE 1 & 2 - Arithmetic Instructions
2526 //===----------------------------------------------------------------------===//
2528 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
2531 /// In addition, we also have a special variant of the scalar form here to
2532 /// represent the associated intrinsic operation. This form is unlike the
2533 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2534 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2536 /// These three forms can each be reg+reg or reg+mem.
2539 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below.
/// basic_sse12_fp_binop_p - packed (PS / PD) forms of a binary FP operation,
/// all instantiated through sse12_fp_packed.
///   opc       - opcode byte shared by every variant.
///   OpcodeStr - mnemonic stem; "ps" or "pd" is appended per variant.
///   OpNode    - SDNode passed through to sse12_fp_packed.
///   sched     - scheduling info; the PS/PD and XMM/YMM members are picked
///               per variant.
/// NOTE(review): the final argument/encoding lines of the two legacy defms
/// and the closing braces are not visible in this excerpt.
2541 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2542 SDNode OpNode, X86SchedWriteSizes sched> {
// VEX-encoded variants: 128-bit (VR128) and 256-bit (VR256, VEX_L), using
// the loadv* fragments and a trailing 0 argument.
2543 let Predicates = [HasAVX, NoVLX] in {
2544 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2545 VR128, v4f32, f128mem, loadv4f32,
2546 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2547 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2548 VR128, v2f64, f128mem, loadv2f64,
2549 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2551 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2552 OpNode, VR256, v8f32, f256mem, loadv8f32,
2553 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2554 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2555 OpNode, VR256, v4f64, f256mem, loadv4f64,
2556 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE variants: destination tied to the first source, memop* fragments.
2559 let Constraints = "$src1 = $dst" in {
2560 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2561 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2563 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2564 v2f64, f128mem, memopv2f64, SSEPackedDouble,
/// basic_sse12_fp_binop_s - scalar (SS / SD) forms of a binary FP operation
/// on FR32 / FR64, instantiated through sse12_fp_scalar.
///   opc       - opcode byte shared by every variant.
///   OpcodeStr - mnemonic stem; "ss" or "sd" is appended per variant.
///   OpNode    - SDNode passed through to sse12_fp_scalar.
///   sched     - scheduling info; PS.Scl / PD.Scl are used.
/// NOTE(review): the final argument/encoding lines of the two legacy defms
/// and the closing braces are not visible in this excerpt.
2569 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2570 X86SchedWriteSizes sched> {
// VEX-encoded variants. The trailing 0 is presumably Is2Addr = 0 (the
// three-operand assembly string) -- confirm against sse12_fp_scalar.
2571 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2572 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2573 XS, VEX_4V, VEX_LIG, VEX_WIG;
2574 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2575 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2576 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy SSE variants: destination tied to the first source.
2578 let Constraints = "$src1 = $dst" in {
2579 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2580 OpNode, FR32, f32mem, SSEPackedSingle,
2582 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2583 OpNode, FR64, f64mem, SSEPackedDouble,
/// basic_sse12_fp_binop_s_int - intrinsic-style scalar forms of a binary FP
/// operation. Unlike basic_sse12_fp_binop_s these operate on whole VR128
/// vectors (v4f32 / v2f64) and take their memory operand through
/// ssmem/sse_load_f32 or sdmem/sse_load_f64.
///   opc       - opcode byte shared by every variant.
///   OpcodeStr - mnemonic stem; also used to build the "ss"/"sd" asm name.
///   OpNode    - pattern operator passed through to sse12_fp_scalar_int.
///   sched     - scheduling info; PS.Scl / PD.Scl are used.
/// NOTE(review): the closing braces are not visible in this excerpt.
2588 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2589 SDPatternOperator OpNode,
2590 X86SchedWriteSizes sched> {
// VEX-encoded variants pass Is2Addr = 0 explicitly.
2591 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2592 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2593 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2594 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2595 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2596 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy SSE variants rely on the default Is2Addr = 1 and tie $src1 to $dst.
2598 let Constraints = "$src1 = $dst" in {
2599 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2600 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2601 SSEPackedSingle, sched.PS.Scl>, XS;
2602 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2603 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2604 SSEPackedDouble, sched.PD.Scl>, XD;
2608 // Binary Arithmetic instructions
// Each operation instantiates the packed, scalar and intrinsic-scalar
// multiclasses with the same opcode and mnemonic stem. For add / mul / sub /
// div the intrinsic-scalar form passes null_frag instead of an SDNode
// (presumably so that no pattern is attached there; the scalar_math_patterns
// instantiations later in the file refer to the *_Int instructions by name).
// max / min pass X86fmaxs / X86fmins to the intrinsic-scalar form.
2609 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
2610 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
2611 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2612 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
2613 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
2614 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// Everything from here on is declared non-commutable.
// NOTE(review): the closing brace of this `let` is not visible in this
// excerpt; presumably it ends before the isCodeGenOnly block -- confirm.
2615 let isCommutable = 0 in {
2616 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2617 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2618 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2619 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2620 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2621 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2622 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2623 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2624 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2625 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2626 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2627 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
// MAXC / MINC reuse the max / min opcodes and mnemonics but are selected
// from the X86fmaxc / X86fminc nodes; they are marked isCodeGenOnly and have
// no intrinsic-scalar form.
2630 let isCodeGenOnly = 1 in {
2631 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2632 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2633 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2634 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2637 // Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
2640 // (1) a scalar fp operation followed by a blend
//
2642 // The effect is that the backend no longer emits unnecessary vector
2643 // insert instructions immediately after SSE scalar fp instructions
2644 // like addss or mulss.
//
2646 // For example, given the following code:
2647 // __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
2652 // Previously we generated:
2653 // addss %xmm0, %xmm1
2654 // movss %xmm1, %xmm0
//
// We now generate:
2657 // addss %xmm1, %xmm0
//
2659 // (2) a vector packed single/double fp operation followed by a vector insert
//
2661 // The effect is that the backend converts the packed fp instruction
2662 // followed by a vector insert into a single SSE scalar fp instruction.
//
2664 // For example, given the following code:
2665 // __m128 foo(__m128 A, __m128 B) {
2666 // __m128 C = A + B;
2667 // return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
2670 // Previously we generated:
2671 // addps %xmm0, %xmm1
2672 // movss %xmm1, %xmm0
//
// We now generate:
2675 // addss %xmm1, %xmm0
//
2677 // TODO: Some canonicalization in lowering would simplify the number of
2678 // patterns we have to try to match.
/// scalar_math_patterns - patterns that fold "extract element 0, apply Op,
/// re-insert via Move" into a single <OpcPrefix>rr_Int / <OpcPrefix>rm_Int
/// instruction, plus the same patterns for the "V"-prefixed instructions.
///   Op            - scalar SDNode being matched.
///   OpcPrefix     - instruction name stem; "rr_Int"/"rm_Int" is appended and
///                   the result is !cast to an Instruction.
///   Move          - the insert node wrapping the result.
///   VT / EltTy    - vector type and its element type.
///   RC            - scalar register class of the non-vector operand; it is
///                   copied into VR128 for the register form.
///   ld_frag       - load fragment used by the memory form.
///   BasePredicate - predicate guarding the non-VEX patterns.
/// NOTE(review): the last source-pattern line of each register form and the
/// closing braces are not visible in this excerpt.
2679 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2680 ValueType VT, ValueType EltTy,
2681 RegisterClass RC, PatFrag ld_frag,
2682 Predicate BasePredicate> {
2683 let Predicates = [BasePredicate] in {
2684 // extracted scalar math op with insert via movss/movsd
2685 def : Pat<(VT (Move (VT VR128:$dst),
2686 (VT (scalar_to_vector
2687 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2689 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2690 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
// Same shape with the second operand loaded through ld_frag.
2691 def : Pat<(VT (Move (VT VR128:$dst),
2692 (VT (scalar_to_vector
2693 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2694 (ld_frag addr:$src)))))),
2695 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2698 // Repeat for AVX versions of the instructions.
2699 let Predicates = [UseAVX] in {
2700 // extracted scalar math op with insert via movss/movsd
2701 def : Pat<(VT (Move (VT VR128:$dst),
2702 (VT (scalar_to_vector
2703 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2705 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2706 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2707 def : Pat<(VT (Move (VT VR128:$dst),
2708 (VT (scalar_to_vector
2709 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2710 (ld_frag addr:$src)))))),
2711 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
// Instantiate scalar_math_patterns for add / sub / mul / div:
// single precision (X86Movss, v4f32/f32, FR32, loadf32) under UseSSE1 and
// double precision (X86Movsd, v2f64/f64, FR64, loadf64) under UseSSE2.
2715 defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2716 defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2717 defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2718 defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2720 defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2721 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2722 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2723 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2726 /// In addition, we also have a special variant of the scalar form here to
2727 /// represent the associated intrinsic operation. This form is unlike the
2728 /// plain scalar form, in that it takes an entire vector (instead of a
2729 /// scalar) and leaves the top elements undefined.
2731 /// And, we have a special variant form for a full-vector intrinsic form.
2733 /// sse_fp_unop_s - SSE1 unops in scalar form
2734 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2735 /// the HW instructions are 2 operand / destructive.
///   opc / OpcodeStr - opcode byte and full mnemonic.
///   RC / ScalarVT   - scalar register class and value type.
///   x86memop        - memory operand of the plain `m` form.
///   intmemop        - memory operand of the `m_Int` form.
///   OpNode / d      - SDNode matched by `r`/`m`, and the execution domain.
///   sched / target  - scheduling info and the guarding predicate.
/// NOTE(review): some trailing lines of the `r` and `r_Int` defs and the
/// closing braces are not visible in this excerpt.
2736 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2737 ValueType ScalarVT, X86MemOperand x86memop,
2738 Operand intmemop, SDNode OpNode, Domain d,
2739 X86FoldableSchedWrite sched, Predicate target> {
// Plain scalar forms on RC with selection patterns attached.
2740 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2741 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2742 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2743 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
// The load-folding form is additionally restricted to OptForSize.
2746 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2747 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2748 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2749 Sched<[sched.Folded]>,
2750 Requires<[target, OptForSize]>;
// Intrinsic (_Int) forms on VR128 with $src1 tied to $dst; they carry an
// empty pattern list, and only $src2 appears in the assembly string.
2753 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2754 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2755 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2758 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2759 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2760 Sched<[sched.Folded, sched.ReadAfterFold]>;
/// sse_fp_unop_s_intr - selection patterns that map the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions of the enclosing defm.
///   vt       - type of the IMPLICIT_DEF passed as the tied first operand of
///              the memory form.
///   int_cpat - complex pattern matching the memory operand.
///   target   - guarding predicate (the memory form also needs OptForSize).
/// RC and Suffix are not referenced in the lines visible here.
/// NOTE(review): parts of the explanatory comment and the closing braces are
/// not visible in this excerpt.
2765 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2766 ComplexPattern int_cpat, Intrinsic Intr,
2767 Predicate target, string Suffix> {
2768 let Predicates = [target] in {
2769 // These are unary operations, but they are modeled as having 2 source operands
2770 // because the high elements of the destination are unchanged in SSE.
2771 def : Pat<(Intr VR128:$src),
2772 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2774 // We don't want to fold scalar loads into these instructions unless
2775 // optimizing for size. This is because the folded instruction will have a
2776 // partial register update, while the unfolded sequence will not, e.g.
2778 // rcpss %xmm0, %xmm0
2779 // which has a clobber before the rcp, vs.
2781 let Predicates = [target, OptForSize] in {
2782 def : Pat<(Intr int_cpat:$src2),
2783 (!cast<Instruction>(NAME#m_Int)
2784 (vt (IMPLICIT_DEF)), addr:$src2)>;
/// avx_fp_unop_s_intr - selection patterns that map the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions of the enclosing defm.
/// The memory form is only enabled together with OptForSize and passes an
/// IMPLICIT_DEF of type vt as its first operand.
/// NOTE(review): the second operand of the register pattern and the closing
/// braces are not visible in this excerpt.
2788 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
2789 Intrinsic Intr, Predicate target> {
2790 let Predicates = [target] in {
2791 def : Pat<(Intr VR128:$src),
2792 (!cast<Instruction>(NAME#r_Int) VR128:$src,
2795 let Predicates = [target, OptForSize] in {
2796 def : Pat<(Intr int_cpat:$src2),
2797 (!cast<Instruction>(NAME#m_Int)
2798 (vt (IMPLICIT_DEF)), addr:$src2)>;
/// avx_fp_unop_s - scalar unops in three-operand form: every variant takes
/// $src1 and $src2 and prints both in the assembly string.
///   opc / OpcodeStr - opcode byte and full mnemonic.
///   RC / ScalarVT   - scalar register class and value type.
///   x86memop        - memory operand of the plain `m` form.
///   intmemop        - memory operand of the `m_Int` form.
///   OpNode / d      - SDNode matched by the trailing patterns, and the
///                     execution domain.
///   sched / target  - scheduling info and the guarding predicate.
/// The instruction defs carry empty pattern lists; selection is done by the
/// separate Pat records at the end, which pass IMPLICIT_DEF as $src1.
/// NOTE(review): the final line of the load pattern and the closing braces
/// are not visible in this excerpt.
2802 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2803 ValueType ScalarVT, X86MemOperand x86memop,
2804 Operand intmemop, SDNode OpNode, Domain d,
2805 X86FoldableSchedWrite sched, Predicate target> {
2806 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2807 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2808 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2809 [], d>, Sched<[sched]>;
2811 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2812 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2813 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Intrinsic (_Int) forms on VR128; no tie constraint is declared here.
2815 let hasSideEffects = 0, ExeDomain = d in {
2816 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2817 (ins VR128:$src1, VR128:$src2),
2818 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2819 []>, Sched<[sched]>;
2821 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2822 (ins VR128:$src1, intmemop:$src2),
2823 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2824 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2827 // We don't want to fold scalar loads into these instructions unless
2828 // optimizing for size. This is because the folded instruction will have a
2829 // partial register update, while the unfolded sequence will not, e.g.
2830 // vmovss mem, %xmm0
2831 // vrcpss %xmm0, %xmm0, %xmm0
2832 // which has a clobber before the rcp, vs.
2833 // vrcpss mem, %xmm0, %xmm0
2834 // TODO: In theory, we could fold the load, and avoid the stall caused by
2835 // the partial register store, either in BreakFalseDeps or with smarter RA.
2836 let Predicates = [target] in {
2837 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
2838 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2840 let Predicates = [target, OptForSize] in {
2841 def : Pat<(ScalarVT (OpNode (load addr:$src))),
2842 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2847 /// sse1_fp_unop_p - SSE1 unops in packed form.
///   opc / OpcodeStr - opcode byte and mnemonic stem ("ps" is appended, and
///                     "v" is prepended for the VEX variants).
///   OpNode          - SDNode matched on v4f32 / v8f32.
///   sched           - scheduling info; XMM / YMM members and their .Folded
///                     variants are used.
///   prds            - predicate list guarding the VEX variants.
/// NOTE(review): the predicate/scheduling lines of the legacy PSr def and the
/// closing braces are not visible in this excerpt.
2848 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
2849 X86SchedWriteWidths sched, list<Predicate> prds> {
// VEX-encoded 128-bit and 256-bit (VEX_L) variants, register and load forms.
2850 let Predicates = prds in {
2851 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2852 !strconcat("v", OpcodeStr,
2853 "ps\t{$src, $dst|$dst, $src}"),
2854 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2855 VEX, Sched<[sched.XMM]>, VEX_WIG;
2856 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2857 !strconcat("v", OpcodeStr,
2858 "ps\t{$src, $dst|$dst, $src}"),
2859 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2860 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2861 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2862 !strconcat("v", OpcodeStr,
2863 "ps\t{$src, $dst|$dst, $src}"),
2864 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2865 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2866 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2867 !strconcat("v", OpcodeStr,
2868 "ps\t{$src, $dst|$dst, $src}"),
2869 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2870 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE variants; the load form uses memopv4f32.
2873 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2874 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2875 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2877 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2878 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2879 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2880 Sched<[sched.XMM.Folded]>;
2883 /// sse2_fp_unop_p - SSE2 unops in vector forms.
///   opc / OpcodeStr - opcode byte and mnemonic stem ("pd" is appended, and
///                     "v" is prepended for the VEX variants).
///   OpNode          - SDNode matched on v2f64 / v4f64.
///   sched           - scheduling info; XMM / YMM members and their .Folded
///                     variants are used.
/// Unlike sse1_fp_unop_p, the VEX predicate list is fixed to [HasAVX, NoVLX].
/// NOTE(review): the predicate/scheduling lines of the legacy PDr def and the
/// closing braces are not visible in this excerpt.
2884 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2885 SDNode OpNode, X86SchedWriteWidths sched> {
// VEX-encoded 128-bit and 256-bit (VEX_L) variants, register and load forms.
2886 let Predicates = [HasAVX, NoVLX] in {
2887 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2888 !strconcat("v", OpcodeStr,
2889 "pd\t{$src, $dst|$dst, $src}"),
2890 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2891 VEX, Sched<[sched.XMM]>, VEX_WIG;
2892 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2893 !strconcat("v", OpcodeStr,
2894 "pd\t{$src, $dst|$dst, $src}"),
2895 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2896 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2897 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2898 !strconcat("v", OpcodeStr,
2899 "pd\t{$src, $dst|$dst, $src}"),
2900 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2901 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2902 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2903 !strconcat("v", OpcodeStr,
2904 "pd\t{$src, $dst|$dst, $src}"),
2905 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2906 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 variants; the load form uses memopv2f64.
2909 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2910 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2911 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2913 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2914 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2915 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2916 Sched<[sched.XMM.Folded]>;
/// sse1_fp_unop_s_intr - attaches intrinsic selection patterns to the SS and
/// V<NAME>SS scalar unops. The intrinsic is looked up by name as
/// int_x86_sse_<OpcodeStr>_ss and handed to sse_fp_unop_s_intr (legacy) and
/// avx_fp_unop_s_intr (VEX).
/// NOTE(review): the trailing argument lines of both defms and the closing
/// brace are not visible in this excerpt; opc, OpNode and sched are not
/// referenced in the visible lines.
2919 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
2920 X86SchedWriteWidths sched, Predicate AVXTarget> {
2921 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2922 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2924 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2925 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2927 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
/// sse1_fp_unop_s - single-precision scalar unop: a legacy SS form via
/// sse_fp_unop_s (guarded by UseSSE1) and a VEX V<NAME>SS form via
/// avx_fp_unop_s (guarded by AVXTarget). Both use FR32/f32, f32mem/ssmem,
/// SSEPackedSingle and sched.Scl.
/// NOTE(review): the closing brace is not visible in this excerpt.
2930 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2931 X86SchedWriteWidths sched, Predicate AVXTarget> {
2932 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
2933 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
2934 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
2935 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
2936 XS, VEX_4V, VEX_LIG, VEX_WIG;
/// sse2_fp_unop_s - double-precision scalar unop: a legacy SD form via
/// sse_fp_unop_s (guarded by UseSSE2) and a VEX V<NAME>SD form via
/// avx_fp_unop_s (guarded by AVXTarget). Both use FR64/f64, f64mem/sdmem,
/// SSEPackedDouble and sched.Scl.
/// NOTE(review): the closing brace is not visible in this excerpt.
2939 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2940 X86SchedWriteWidths sched, Predicate AVXTarget> {
2941 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
2942 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
2943 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
2944 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2945 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Square root: scalar and packed forms in both single and double precision.
// The VEX scalar forms are guarded by UseAVX and the VEX packed single form
// by [HasAVX, NoVLX].
2949 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
2950 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
2951 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
2952 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
2954 // Reciprocal approximations. Note that these typically require refinement
2955 // in order to obtain suitable precision.
// Single precision only; these also pull in the intrinsic patterns via
// sse1_fp_unop_s_intr. The VEX forms are guarded by HasAVX / [HasAVX], not
// by UseAVX / [HasAVX, NoVLX] as for SQRT.
// NOTE(review): presumably intentional -- confirm.
2956 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
2957 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
2958 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
2959 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
2960 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
2961 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
2963 // There is no f64 version of the reciprocal approximation instructions.
/// scalar_unary_math_patterns - folds "extract element 0 of $src, apply
/// OpNode, insert into $dst via Move" into <OpcPrefix>r_Int, and repeats the
/// pattern for the "V"-prefixed instruction under UseAVX.
///   OpNode        - scalar SDNode being matched.
///   OpcPrefix     - instruction name stem; "r_Int" is appended.
///   Move          - the insert node wrapping the result.
///   VT            - vector type of $dst and $src.
///   BasePredicate - predicate guarding the non-VEX pattern.
/// NOTE(review): the closing braces are not visible in this excerpt.
2965 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
2966 ValueType VT, Predicate BasePredicate> {
2967 let Predicates = [BasePredicate] in {
2968 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
2969 (OpNode (extractelt VT:$src, 0))))),
2970 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
2973 // Repeat for AVX versions of the instructions.
2974 let Predicates = [UseAVX] in {
2975 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
2976 (OpNode (extractelt VT:$src, 0))))),
2977 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
// Instantiate the unary fold for fsqrt: SQRTSS on v4f32 (X86Movss, UseSSE1)
// and SQRTSD on v2f64 (X86Movsd, UseSSE2).
2981 defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
2982 defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
/// scalar_unary_math_intr_patterns - folds "Move($dst, Intr($src))" into
/// <OpcPrefix>r_Int, and repeats the pattern for the "V"-prefixed instruction.
///   Intr          - intrinsic applied to the whole vector $src.
///   OpcPrefix     - instruction name stem; "r_Int" is appended.
///   Move          - the insert node wrapping the result.
///   VT            - vector type of $dst and $src.
///   BasePredicate - predicate guarding the non-VEX pattern.
/// The VEX pattern is guarded by HasAVX here, whereas
/// scalar_unary_math_patterns uses UseAVX.
/// NOTE(review): presumably intentional -- confirm. The closing braces are
/// not visible in this excerpt.
2984 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
2985 SDNode Move, ValueType VT,
2986 Predicate BasePredicate> {
2987 let Predicates = [BasePredicate] in {
2988 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
2989 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
2992 // Repeat for AVX versions of the instructions.
2993 let Predicates = [HasAVX] in {
2994 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
2995 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
// Instantiate the intrinsic fold for rcp_ss and rsqrt_ss with X86Movss.
// NOTE(review): the continuation line of each defm (remaining template
// arguments and terminator) is not visible in this excerpt.
2999 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3001 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3005 //===----------------------------------------------------------------------===//
3006 // SSE 1 & 2 - Non-temporal stores
3007 //===----------------------------------------------------------------------===//
3009 let AddedComplexity = 400 in { // Prefer non-temporal versions
// VEX-encoded non-temporal vector stores ([HasAVX, NoVLX]). Each def selects
// from alignednontemporalstore of one vector type: v4f32 / v2f64 / v2i64 for
// the 128-bit forms and v8f32 / v4f64 / v4i64 for the 256-bit (VEX_L) forms.
// NOTE(review): the closing braces of the nested `let` blocks are not visible
// in this excerpt.
3010 let Predicates = [HasAVX, NoVLX] in {
// 128-bit floating-point stores.
3011 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3012 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3013 (ins f128mem:$dst, VR128:$src),
3014 "movntps\t{$src, $dst|$dst, $src}",
3015 [(alignednontemporalstore (v4f32 VR128:$src),
3016 addr:$dst)]>, VEX, VEX_WIG;
3017 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3018 (ins f128mem:$dst, VR128:$src),
3019 "movntpd\t{$src, $dst|$dst, $src}",
3020 [(alignednontemporalstore (v2f64 VR128:$src),
3021 addr:$dst)]>, VEX, VEX_WIG;
// 256-bit floating-point stores.
3024 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3025 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3026 (ins f256mem:$dst, VR256:$src),
3027 "movntps\t{$src, $dst|$dst, $src}",
3028 [(alignednontemporalstore (v8f32 VR256:$src),
3029 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3030 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3031 (ins f256mem:$dst, VR256:$src),
3032 "movntpd\t{$src, $dst|$dst, $src}",
3033 [(alignednontemporalstore (v4f64 VR256:$src),
3034 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
// Integer-domain stores; these use the i128mem / i256mem operands and carry
// their scheduling info per def.
3037 let ExeDomain = SSEPackedInt in {
3038 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3039 (ins i128mem:$dst, VR128:$src),
3040 "movntdq\t{$src, $dst|$dst, $src}",
3041 [(alignednontemporalstore (v2i64 VR128:$src),
3042 addr:$dst)]>, VEX, VEX_WIG,
3043 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3044 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3045 (ins i256mem:$dst, VR256:$src),
3046 "movntdq\t{$src, $dst|$dst, $src}",
3047 [(alignednontemporalstore (v4i64 VR256:$src),
3048 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3049 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
// Legacy SSE non-temporal floating-point stores: movntps selects from an
// aligned non-temporal store of v4f32, movntpd from one of v2f64.
// NOTE(review): the closing brace of this `let` is not visible in this
// excerpt.
3053 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3054 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3055 "movntps\t{$src, $dst|$dst, $src}",
3056 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3057 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3058 "movntpd\t{$src, $dst|$dst, $src}",
3059 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
// Legacy (non-VEX) 128-bit integer non-temporal store, opcode 0xE7, matched
// from an aligned non-temporal store of a v2i64 value in VR128.
// The destination uses the integer memory operand (i128mem), consistent with
// the SSEPackedInt execution domain and with the VEX form VMOVNTDQmr.
3062 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3063 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3064 "movntdq\t{$src, $dst|$dst, $src}",
3065 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
// Scalar non-temporal stores from general-purpose registers (opcode 0xC3).
// MOVNTImr stores a GR32, MOVNTI_64mr (RI class) stores a GR64; both are
// matched from nontemporalstore and require HasSSE2.
3067 let SchedRW = [WriteStoreNT] in {
3068 // There is no AVX form for instructions below this point
3069 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3070 "movnti{l}\t{$src, $dst|$dst, $src}",
3071 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3072 PS, Requires<[HasSSE2]>;
3073 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3074 "movnti{q}\t{$src, $dst|$dst, $src}",
3075 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3076 PS, Requires<[HasSSE2]>;
3077 } // SchedRW = [WriteStoreNT]
// Additional selection patterns: the MOVNTDQ definitions only carry a
// 64-bit-element pattern, so the remaining integer vector types are mapped
// onto the same instructions here.
// AVX: 256-bit v8i32/v16i16/v32i8 -> VMOVNTDQYmr, 128-bit v4i32/v8i16/v16i8
// -> VMOVNTDQmr.
3079 let Predicates = [HasAVX, NoVLX] in {
3080 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3081 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3082 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3083 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3084 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3085 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3087 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3088 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3089 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3090 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3091 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3092 (VMOVNTDQmr addr:$dst, VR128:$src)>;
// SSE2 (UseSSE2): 128-bit v4i32/v8i16/v16i8 -> MOVNTDQmr.
3095 let Predicates = [UseSSE2] in {
3096 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3097 (MOVNTDQmr addr:$dst, VR128:$src)>;
3098 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3099 (MOVNTDQmr addr:$dst, VR128:$src)>;
3100 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3101 (MOVNTDQmr addr:$dst, VR128:$src)>;
3104 } // AddedComplexity
3106 //===----------------------------------------------------------------------===//
3107 // SSE 1 & 2 - Prefetch and memory fence
3108 //===----------------------------------------------------------------------===//
3110 // Prefetch intrinsic.
// Four prefetch flavours share opcode 0x18 and differ in the ModRM format
// (MRM1m/MRM2m/MRM3m/MRM0m). Each is selected from a `prefetch` node whose
// third operand is the constant 3, 2, 1 or 0 respectively and whose fourth
// operand is the constant 1; the second operand is any immediate.
// NOTE(review): presumably operand 3 is the locality hint and operand 4 the
// cache type -- confirm against the `prefetch` node definition.
3111 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3112 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3113 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3114 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3115 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3116 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3117 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3118 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3119 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3122 // FIXME: How should flush instruction be modeled?
// CLFLUSH (opcode 0xAE, format MRM7m) is selected from the
// int_x86_sse2_clflush intrinsic and requires HasSSE2. It is scheduled as
// WriteLoad for now, pending the FIXME above.
3123 let SchedRW = [WriteLoad] in {
3125 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3126 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3127 PS, Requires<[HasSSE2]>;
// PAUSE takes no operands, is selected from int_x86_sse2_pause and is
// scheduled as WriteNop. No feature predicate is attached to the def, in line
// with the backward-compatibility note below.
3130 let SchedRW = [WriteNop] in {
3131 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3132 // was introduced with SSE2, it's backward compatible.
3133 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3134 "pause", [(int_x86_sse2_pause)]>, OBXS;
// Memory fences. All three share opcode 0xAE and differ in their fixed ModRM
// byte format (MRM_F8 / MRM_E8 / MRM_F0). Each is selected from its matching
// intrinsic; SFENCE requires HasSSE1, LFENCE HasSSE2 and MFENCE HasMFence.
3137 let SchedRW = [WriteFence] in {
3138 // Load, store, and memory fence
3139 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3140 // to include any 64-bit target.
3141 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3142 PS, Requires<[HasSSE1]>;
3143 def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3144 PS, Requires<[HasSSE2]>;
3145 def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3146 PS, Requires<[HasMFence]>;
// The target-specific X86MFence node is also selected to MFENCE.
3149 def : Pat<(X86MFence), (MFENCE)>;
3151 //===----------------------------------------------------------------------===//
3152 // SSE 1 & 2 - Load/Store MXCSR register
3153 //===----------------------------------------------------------------------===//
// Load/store of the MXCSR control/status register through a 32-bit memory
// operand. All four defs use opcode 0xAE; the load forms use format MRM2m and
// the store forms MRM3m. They are selected from int_x86_sse_ldmxcsr /
// int_x86_sse_stmxcsr and are explicitly marked hasSideEffects.
// VEX-encoded forms:
3155 let mayLoad=1, hasSideEffects=1 in
3156 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3157 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3158 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3159 let mayStore=1, hasSideEffects=1 in
3160 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3161 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3162 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
// Legacy (non-VEX) forms, in the TB opcode map:
3164 let mayLoad=1, hasSideEffects=1 in
3165 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3166 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3167 TB, Sched<[WriteLDMXCSR]>;
3168 let mayStore=1, hasSideEffects=1 in
3169 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3170 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3171 TB, Sched<[WriteSTMXCSR]>;
3173 //===---------------------------------------------------------------------===//
3174 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3175 //===---------------------------------------------------------------------===//
3177 let ExeDomain = SSEPackedInt in { // SSE integer instructions
// VEX register-to-register moves, opcode 0x6F (MRMSrcReg), for 128-bit
// (VR128) and 256-bit (VR256, VEX_L) registers. They carry no selection
// pattern ([]) and are marked as having no side effects.
3179 let hasSideEffects = 0 in {
3180 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3181 "movdqa\t{$src, $dst|$dst, $src}", []>,
3182 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3183 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3184 "movdqu\t{$src, $dst|$dst, $src}", []>,
3185 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3186 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3187 "movdqa\t{$src, $dst|$dst, $src}", []>,
3188 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3189 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3190 "movdqu\t{$src, $dst|$dst, $src}", []>,
3191 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
// "_REV" variants: the same register moves expressed with the store opcode
// 0x7F (MRMDestReg). They are isCodeGenOnly but ForceDisassemble is set, and
// FoldGenData names the corresponding 0x6F definition.
3195 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3196 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3197 "movdqa\t{$src, $dst|$dst, $src}", []>,
3198 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3199 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3200 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3201 "movdqa\t{$src, $dst|$dst, $src}", []>,
3202 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3203 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3204 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3205 "movdqu\t{$src, $dst|$dst, $src}", []>,
3206 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3207 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3208 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3209 "movdqu\t{$src, $dst|$dst, $src}", []>,
3210 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3211 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
// VEX load forms, opcode 0x6F (MRMSrcMem). Marked foldable as a load and
// rematerializable; guarded by HasAVX and NoVLX.
// Only the 128-bit forms carry a pattern here (aligned / unaligned v2i64
// load); the 256-bit forms have an empty pattern list.
// The unaligned forms derive from plain `I` and therefore spell out the full
// "vmovdqu" mnemonic and the XS prefix themselves.
3214 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3215 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3216 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3217 "movdqa\t{$src, $dst|$dst, $src}",
3218 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3219 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3220 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3221 "movdqa\t{$src, $dst|$dst, $src}", []>,
3222 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3223 VEX, VEX_L, VEX_WIG;
3224 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3225 "vmovdqu\t{$src, $dst|$dst, $src}",
3226 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3227 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3229 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3230 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3231 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3232 XS, VEX, VEX_L, VEX_WIG;
// VEX store forms, opcode 0x7F (MRMDestMem), guarded by HasAVX and NoVLX.
// The 128-bit forms match an aligned / unaligned store of v2i64; the 256-bit
// forms carry no pattern. As with the loads, the unaligned forms derive from
// plain `I` and add the "v" mnemonic prefix and XS explicitly.
3235 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3236 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3237 (ins i128mem:$dst, VR128:$src),
3238 "movdqa\t{$src, $dst|$dst, $src}",
3239 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3240 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3241 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3242 (ins i256mem:$dst, VR256:$src),
3243 "movdqa\t{$src, $dst|$dst, $src}", []>,
3244 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3245 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3246 "vmovdqu\t{$src, $dst|$dst, $src}",
3247 [(store (v2i64 VR128:$src), addr:$dst)]>,
3248 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3249 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3250 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3251 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
// Legacy SSE2 MOVDQA / MOVDQU, 128-bit only.
// Register-to-register forms (opcode 0x6F, MRMSrcReg), no patterns. The
// unaligned variant derives from `I` and adds XS and Requires<[UseSSE2]>
// itself.
3254 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3255 let hasSideEffects = 0 in {
3256 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3257 "movdqa\t{$src, $dst|$dst, $src}", []>;
3259 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3260 "movdqu\t{$src, $dst|$dst, $src}", []>,
3261 XS, Requires<[UseSSE2]>;
// "_REV" variants using opcode 0x7F (MRMDestReg); isCodeGenOnly with
// ForceDisassemble set. FoldGenData names the corresponding 0x6F definition.
3265 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3266 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3267 "movdqa\t{$src, $dst|$dst, $src}", []>,
3268 FoldGenData<"MOVDQArr">;
3270 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3271 "movdqu\t{$src, $dst|$dst, $src}", []>,
3272 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
// Load forms (opcode 0x6F, MRMSrcMem). The v2i64 load patterns are commented
// out in the pattern lists, so these defs carry no selection pattern.
3276 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3277 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3278 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3279 "movdqa\t{$src, $dst|$dst, $src}",
3280 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3281 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3282 "movdqu\t{$src, $dst|$dst, $src}",
3283 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3284 XS, Requires<[UseSSE2]>;
// Store forms (opcode 0x7F, MRMDestMem); the store patterns are likewise
// commented out.
3287 let mayStore = 1, hasSideEffects = 0,
3288 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3289 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3290 "movdqa\t{$src, $dst|$dst, $src}",
3291 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3292 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3293 "movdqu\t{$src, $dst|$dst, $src}",
3294 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3295 XS, Requires<[UseSSE2]>;
3298 } // ExeDomain = SSEPackedInt
// Assembler aliases: the ".s"-suffixed mnemonics map to the "_REV"
// definitions of the register-to-register moves.
// NOTE(review): the trailing 0 argument presumably stops the alias from being
// used when printing -- confirm against the InstAlias class.
3300 // Reversed version with ".s" suffix for GAS compatibility.
3301 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3302 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3303 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3304 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3305 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3306 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3307 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3308 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3310 // Reversed version with ".s" suffix for GAS compatibility.
3311 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3312 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3313 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3314 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
// The VMOVDQ load/store definitions only carry v2i64 patterns. These extra
// patterns route the other 128-bit integer vector types (v4i32, v8i16, v16i8)
// to the same instructions: aligned accesses to VMOVDQArm / VMOVDQAmr,
// unaligned ones to VMOVDQUrm / VMOVDQUmr.
3316 let Predicates = [HasAVX, NoVLX] in {
3317 // Additional patterns for other integer sizes.
3318 def : Pat<(alignedloadv4i32 addr:$src),
3319 (VMOVDQArm addr:$src)>;
3320 def : Pat<(alignedloadv8i16 addr:$src),
3321 (VMOVDQArm addr:$src)>;
3322 def : Pat<(alignedloadv16i8 addr:$src),
3323 (VMOVDQArm addr:$src)>;
3324 def : Pat<(loadv4i32 addr:$src),
3325 (VMOVDQUrm addr:$src)>;
3326 def : Pat<(loadv8i16 addr:$src),
3327 (VMOVDQUrm addr:$src)>;
3328 def : Pat<(loadv16i8 addr:$src),
3329 (VMOVDQUrm addr:$src)>;
3331 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3332 (VMOVDQAmr addr:$dst, VR128:$src)>;
3333 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3334 (VMOVDQAmr addr:$dst, VR128:$src)>;
3335 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3336 (VMOVDQAmr addr:$dst, VR128:$src)>;
3337 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3338 (VMOVDQUmr addr:$dst, VR128:$src)>;
3339 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3340 (VMOVDQUmr addr:$dst, VR128:$src)>;
3341 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3342 (VMOVDQUmr addr:$dst, VR128:$src)>;
3345 //===---------------------------------------------------------------------===//
3346 // SSE2 - Packed Integer Arithmetic Instructions
3347 //===---------------------------------------------------------------------===//
3349 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3351 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
//
// Parameters:
//   opc        - opcode byte shared by the rr and rm forms.
//   OpcodeStr  - mnemonic used to build the assembly string.
//   OpNode     - DAG node matched by both patterns.
//   DstVT      - value type of the result.
//   SrcVT      - value type applied to $src1.
//   RC         - register class of $dst, $src1 and (in rr) $src2.
//   memop_frag - load fragment used for the memory operand in rm.
//   x86memop   - memory operand type of $src2 in rm.
//   sched      - scheduling info; rm uses sched.Folded / sched.ReadAfterFold.
//   Is2Addr    - defaults to 1. NOTE(review): presumably selects between the
//                two-operand and three-operand assembly strings built below;
//                the selecting expression is not visible in this listing.
3352 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3353 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3354 PatFrag memop_frag, X86MemOperand x86memop,
3355 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
// Register-register form; marked commutable.
3356 let isCommutable = 1 in
3357 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3358 (ins RC:$src1, RC:$src2),
3360 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3361 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3362 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
// Register-memory form; the second source is loaded through memop_frag.
3364 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3365 (ins RC:$src1, x86memop:$src2),
3367 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3368 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3369 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3370 (memop_frag addr:$src2))))]>,
3371 Sched<[sched.Folded, sched.ReadAfterFold]>;
3373 } // ExeDomain = SSEPackedInt
// Packed integer arithmetic, instantiated through PDI_binop_all (defined
// outside this section). Arguments as used here: opcode, mnemonic, DAG node,
// 128-bit value type, 256-bit value type, scheduling class, a 0/1 flag, and a
// predicate (NoVLX_Or_NoBWI for byte/word element types, NoVLX for
// dword/qword).
// NOTE(review): the 0/1 flag is 1 for add/mul/min/max/avg and 0 for every
// subtract, so it is presumably the commutativity flag -- confirm against
// PDI_binop_all.
//
// Wrapping and saturating addition.
3375 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3376 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3377 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3378 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3379 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3380 SchedWriteVecALU, 1, NoVLX>;
3381 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3382 SchedWriteVecALU, 1, NoVLX>;
3383 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3384 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3385 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3386 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3387 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3388 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3389 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3390 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// 16-bit multiplies (low half, unsigned high half, signed high half); these
// use the SchedWriteVecIMul scheduling class.
3391 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3392 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3393 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3394 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3395 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3396 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
// Wrapping and saturating subtraction (flag is 0).
3397 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3398 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3399 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3400 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3401 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3402 SchedWriteVecALU, 0, NoVLX>;
3403 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3404 SchedWriteVecALU, 0, NoVLX>;
3405 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3406 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3407 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3408 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3409 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3410 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3411 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3412 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
// Min/max (unsigned bytes, signed words) and averages.
3413 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3414 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3415 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3416 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3417 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3418 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3419 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3420 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3421 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3422 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3423 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3424 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// PMULUDQ is matched from the target node X86pmuludq on 64-bit element types.
3425 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3426 SchedWriteVecIMul, 1, NoVLX>;
// PMADDWD and PSADBW have different source and result element types, so they
// are instantiated through PDI_binop_rm2 (result type first, source type
// second). For each instruction there is a 128-bit AVX form, a 256-bit AVX2
// form and a legacy SSE form; the legacy form ties $src1 to $dst and uses
// `memop` where the VEX forms use `load`.
//
// PMADDWD (opcode 0xF5): v8i16 sources -> v4i32 result (v16i16 -> v8i32 for
// the 256-bit form).
3428 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3429 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3430 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3433 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3434 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3435 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3436 0>, VEX_4V, VEX_L, VEX_WIG;
3437 let Constraints = "$src1 = $dst" in
3438 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3439 memop, i128mem, SchedWriteVecIMul.XMM>;
// PSADBW (opcode 0xF6): v16i8 sources -> v2i64 result (v32i8 -> v4i64 for the
// 256-bit form); uses the dedicated SchedWritePSADBW scheduling class.
3441 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3442 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3443 load, i128mem, SchedWritePSADBW.XMM, 0>,
3445 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3446 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3447 load, i256mem, SchedWritePSADBW.YMM, 0>,
3448 VEX_4V, VEX_L, VEX_WIG;
3449 let Constraints = "$src1 = $dst" in
3450 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3451 memop, i128mem, SchedWritePSADBW.XMM>;
3453 //===---------------------------------------------------------------------===//
3454 // SSE2 - Packed Integer Logical Instructions
3455 //===---------------------------------------------------------------------===//
// PDI_binop_rmi - binary operator whose second source is either a 128-bit
// register (rr), a 128-bit memory operand (rm) or an 8-bit immediate (ri).
//
// Parameters:
//   opc      - opcode of the rr / rm forms.
//   opc2     - opcode of the immediate (ri) form.
//   ImmForm  - instruction format of the immediate form.
//   OpcodeStr- mnemonic used to build the assembly strings.
//   OpNode   - DAG node matched by rr / rm.
//   OpNode2  - DAG node matched by ri.
//   RC       - register class of $dst and $src1.
//   sched    - scheduling info; rm uses sched.Folded / sched.ReadAfterFold.
//   schedImm - scheduling info for the immediate form (its use is not visible
//              in this listing).
//   DstVT    - result value type.
//   SrcVT    - value type of the 128-bit second source.
//   ld_frag  - load fragment for the rm form.
//   Is2Addr  - defaults to 1. NOTE(review): presumably selects between the
//              two- and three-operand assembly strings below.
3457 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3458 string OpcodeStr, SDNode OpNode,
3459 SDNode OpNode2, RegisterClass RC,
3460 X86FoldableSchedWrite sched,
3461 X86FoldableSchedWrite schedImm,
3462 ValueType DstVT, ValueType SrcVT,
3463 PatFrag ld_frag, bit Is2Addr = 1> {
3464 // src2 is always 128-bit
3465 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3466 (ins RC:$src1, VR128:$src2),
3468 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3469 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3470 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
// Memory form: $src2 is an i128mem operand regardless of RC.
3472 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3473 (ins RC:$src1, i128mem:$src2),
3475 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3476 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3477 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3478 (SrcVT (ld_frag addr:$src2)))))]>,
3479 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Immediate form: uses opc2 / ImmForm and matches OpNode2 with an i8 immediate.
3480 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3481 (ins RC:$src1, u8imm:$src2),
3483 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3484 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3485 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
// PDI_binop_rmi_all - instantiates PDI_binop_rmi three times for one
// operation:
//   V#NAME   - 128-bit VEX form (VR128, DstVT128), under HasAVX and prd.
//   V#NAME#Y - 256-bit VEX form (VR256, DstVT256, VEX_L), under HasAVX2 and prd.
//   NAME     - legacy SSE form with $src1 tied to $dst.
// The VEX forms prepend "v" to OpcodeStr, use `load` as the load fragment and
// pass 0 for Is2Addr. SrcVT is shared by all three forms.
3489 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3490 string OpcodeStr, SDNode OpNode,
3491 SDNode OpNode2, ValueType DstVT128,
3492 ValueType DstVT256, ValueType SrcVT,
3493 X86SchedWriteWidths sched,
3494 X86SchedWriteWidths schedImm, Predicate prd> {
3495 let Predicates = [HasAVX, prd] in
3496 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3497 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3498 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3499 let Predicates = [HasAVX2, prd] in
3500 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3501 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3502 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3504 let Constraints = "$src1 = $dst" in
3505 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3506 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
// PDI_binop_ri - immediate-only binary operator: a single `ri` definition
// taking a register ($src1) and an 8-bit immediate ($src2).
//   opc / ImmForm - opcode and instruction format.
//   OpcodeStr     - mnemonic used to build the assembly strings.
//   OpNode        - DAG node matched with (i8 imm).
//   RC / VT       - register class and result value type.
//   sched         - scheduling info (its use is not visible in this listing).
//   Is2Addr       - defaults to 1. NOTE(review): presumably selects between
//                   the two- and three-operand assembly strings below.
3510 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3511 SDNode OpNode, RegisterClass RC, ValueType VT,
3512 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3513 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3515 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3516 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3517 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
// PDI_binop_ri_all - instantiates PDI_binop_ri for the 128-bit VEX form
// (V#NAME, v16i8, HasAVX), the 256-bit VEX form (V#NAME#Y, v32i8, HasAVX2)
// and the legacy SSE form (NAME, v16i8, $src1 tied to $dst). Both VEX forms
// are additionally guarded by NoVLX_Or_NoBWI and pass 0 for Is2Addr.
3521 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3522 SDNode OpNode, X86SchedWriteWidths sched> {
3523 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3524 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3525 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3526 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3527 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3528 VR256, v32i8, sched.YMM, 0>,
3529 VEX_4V, VEX_L, VEX_WIG;
3530 let Constraints = "$src1 = $dst" in
3531 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
// Packed shifts. Each PDI_binop_rmi_all instantiation passes: the opcode of
// the register/memory-count form, the opcode and ModRM format of the
// immediate form, the mnemonic, the variable-count node and the
// immediate-count node, the 128- and 256-bit result types, the type of the
// 128-bit count operand, two scheduling classes and the gating predicate.
3535 let ExeDomain = SSEPackedInt in {
// Logical left shifts (immediate format MRM6r).
3536 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3537 v8i16, v16i16, v8i16, SchedWriteVecShift,
3538 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3539 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3540 v4i32, v8i32, v4i32, SchedWriteVecShift,
3541 SchedWriteVecShiftImm, NoVLX>;
3542 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3543 v2i64, v4i64, v2i64, SchedWriteVecShift,
3544 SchedWriteVecShiftImm, NoVLX>;
// Logical right shifts (immediate format MRM2r).
3546 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3547 v8i16, v16i16, v8i16, SchedWriteVecShift,
3548 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3549 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3550 v4i32, v8i32, v4i32, SchedWriteVecShift,
3551 SchedWriteVecShiftImm, NoVLX>;
3552 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3553 v2i64, v4i64, v2i64, SchedWriteVecShift,
3554 SchedWriteVecShiftImm, NoVLX>;
// Arithmetic right shifts (immediate format MRM4r); only word and dword
// variants are defined here.
3556 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3557 v8i16, v16i16, v8i16, SchedWriteVecShift,
3558 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3559 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3560 v4i32, v8i32, v4i32, SchedWriteVecShift,
3561 SchedWriteVecShiftImm, NoVLX>;
// Whole-register byte shifts: immediate-only, opcode 0x73 with formats
// MRM7r (left) and MRM3r (right), via PDI_binop_ri_all.
3563 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3565 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3567 } // ExeDomain = SSEPackedInt
3569 //===---------------------------------------------------------------------===//
3570 // SSE2 - Packed Integer Comparison Instructions
3571 //===---------------------------------------------------------------------===//
// Packed integer compares for byte, word and dword elements, instantiated
// through PDI_binop_all with TruePredicate as the extra predicate.
// Equality compares (X86pcmpeq) pass 1 for the 0/1 flag, greater-than
// compares (X86pcmpgt) pass 0.
// NOTE(review): the flag is presumably commutativity -- confirm against
// PDI_binop_all.
3573 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3574 SchedWriteVecALU, 1, TruePredicate>;
3575 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3576 SchedWriteVecALU, 1, TruePredicate>;
3577 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3578 SchedWriteVecALU, 1, TruePredicate>;
3579 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3580 SchedWriteVecALU, 0, TruePredicate>;
3581 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3582 SchedWriteVecALU, 0, TruePredicate>;
3583 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3584 SchedWriteVecALU, 0, TruePredicate>;
3586 //===---------------------------------------------------------------------===//
3587 // SSE2 - Packed Integer Shuffle Instructions
3588 //===---------------------------------------------------------------------===//
3590 let ExeDomain = SSEPackedInt in {
// sse2_pshuffle - immediate-controlled shuffle (opcode 0x70) with a single
// vector source taken from a register (ri) or from memory (mi), plus an 8-bit
// immediate.
//   OpcodeStr     - mnemonic; the VEX forms prepend "v".
//   vt128 / vt256 - result value types for the 128- and 256-bit forms.
//   OpNode        - DAG node matched with the source and (i8 imm).
//   sched         - scheduling info; memory forms use the .Folded variant.
//   prd           - extra predicate combined with HasAVX / HasAVX2 below.
// Emits V#NAME#ri/mi (128-bit VEX), V#NAME#Yri/Ymi (256-bit VEX, VEX_L) and
// ri/mi (legacy SSE2, which uses `memop` where the VEX forms use `load`).
3591 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3592 SDNode OpNode, X86SchedWriteWidths sched,
// 128-bit VEX forms.
3594 let Predicates = [HasAVX, prd] in {
3595 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3596 (ins VR128:$src1, u8imm:$src2),
3597 !strconcat("v", OpcodeStr,
3598 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3600 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3601 VEX, Sched<[sched.XMM]>, VEX_WIG;
3602 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3603 (ins i128mem:$src1, u8imm:$src2),
3604 !strconcat("v", OpcodeStr,
3605 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3607 (vt128 (OpNode (load addr:$src1),
3608 (i8 imm:$src2))))]>, VEX,
3609 Sched<[sched.XMM.Folded]>, VEX_WIG;
// 256-bit VEX forms.
3612 let Predicates = [HasAVX2, prd] in {
3613 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3614 (ins VR256:$src1, u8imm:$src2),
3615 !strconcat("v", OpcodeStr,
3616 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3618 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
3619 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3620 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3621 (ins i256mem:$src1, u8imm:$src2),
3622 !strconcat("v", OpcodeStr,
3623 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3625 (vt256 (OpNode (load addr:$src1),
3626 (i8 imm:$src2))))]>, VEX, VEX_L,
3627 Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 forms.
3630 let Predicates = [UseSSE2] in {
3631 def ri : Ii8<0x70, MRMSrcReg,
3632 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3633 !strconcat(OpcodeStr,
3634 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3636 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3638 def mi : Ii8<0x70, MRMSrcMem,
3639 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3640 !strconcat(OpcodeStr,
3641 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3643 (vt128 (OpNode (memop addr:$src1),
3644 (i8 imm:$src2))))]>,
3645 Sched<[sched.XMM.Folded]>;
3648 } // ExeDomain = SSEPackedInt
// The three shuffles share opcode 0x70 inside sse2_pshuffle and are told
// apart by the prefix class appended here: PD for PSHUFD, XS for PSHUFHW and
// XD for PSHUFLW. The dword shuffle is gated on NoVLX, the word shuffles on
// NoVLX_Or_NoBWI.
3650 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3651 SchedWriteShuffle, NoVLX>, PD;
3652 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3653 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3654 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3655 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3657 //===---------------------------------------------------------------------===//
3658 // Packed Integer Pack Instructions (SSE & AVX)
3659 //===---------------------------------------------------------------------===//
3661 let ExeDomain = SSEPackedInt in {
// sse2_pack - pack instruction built on the PDI class: two sources of type
// ArgVT produce one result of type OutVT.
//   opc / OpcodeStr - opcode and mnemonic.
//   OutVT / ArgVT   - result and source value types.
//   OpNode          - DAG node matched by both forms.
//   RC / x86memop   - register class and memory operand type.
//   sched           - scheduling info; rm uses sched.Folded / ReadAfterFold.
//   ld_frag         - load fragment for the rm form.
//   Is2Addr         - defaults to 1. NOTE(review): presumably selects between
//                     the two- and three-operand assembly strings below.
3662 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3663 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3664 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3665 PatFrag ld_frag, bit Is2Addr = 1> {
3666 def rr : PDI<opc, MRMSrcReg,
3667 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3669 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3670 !strconcat(OpcodeStr,
3671 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3673 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3675 def rm : PDI<opc, MRMSrcMem,
3676 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3678 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3679 !strconcat(OpcodeStr,
3680 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3682 (OutVT (OpNode (ArgVT RC:$src1),
3683 (ld_frag addr:$src2))))]>,
3684 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sse4_pack - same shape and parameters as sse2_pack, but the rr / rm
// definitions derive from the SS48I instruction class instead of PDI. It is
// used below for the (V)PACKUSDW instantiations.
3687 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3688 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3689 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3690 PatFrag ld_frag, bit Is2Addr = 1> {
3691 def rr : SS48I<opc, MRMSrcReg,
3692 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3694 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3695 !strconcat(OpcodeStr,
3696 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3698 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3700 def rm : SS48I<opc, MRMSrcMem,
3701 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3703 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3704 !strconcat(OpcodeStr,
3705 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3707 (OutVT (OpNode (ArgVT RC:$src1),
3708 (ld_frag addr:$src2))))]>,
3709 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Pack instantiations. X86Packss is used for the PACKSS* instructions and
// X86Packus for PACKUS*. The word->byte and dword->word (signed) forms use
// sse2_pack; PACKUSDW (opcode 0x2B) uses sse4_pack.
//
// 128-bit VEX forms (load fragment `load`, Is2Addr = 0).
3712 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3713 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3714 i128mem, SchedWriteShuffle.XMM, load, 0>,
3716 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3717 i128mem, SchedWriteShuffle.XMM, load, 0>,
3720 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3721 i128mem, SchedWriteShuffle.XMM, load, 0>,
3723 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3724 i128mem, SchedWriteShuffle.XMM, load, 0>,
// 256-bit VEX forms (VR256, i256mem, YMM scheduling class).
3728 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3729 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3730 i256mem, SchedWriteShuffle.YMM, load, 0>,
3731 VEX_4V, VEX_L, VEX_WIG;
3732 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3733 i256mem, SchedWriteShuffle.YMM, load, 0>,
3734 VEX_4V, VEX_L, VEX_WIG;
3736 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3737 i256mem, SchedWriteShuffle.YMM, load, 0>,
3738 VEX_4V, VEX_L, VEX_WIG;
3739 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3740 i256mem, SchedWriteShuffle.YMM, load, 0>,
// Legacy SSE forms: $src1 is tied to $dst, the load fragment is `memop` and
// Is2Addr keeps its default.
3744 let Constraints = "$src1 = $dst" in {
3745 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3746 i128mem, SchedWriteShuffle.XMM, memop>;
3747 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3748 i128mem, SchedWriteShuffle.XMM, memop>;
3750 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3751 i128mem, SchedWriteShuffle.XMM, memop>;
3753 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3754 i128mem, SchedWriteShuffle.XMM, memop>;
3756 } // ExeDomain = SSEPackedInt
3758 //===---------------------------------------------------------------------===//
3759 // SSE2 - Packed Integer Unpack Instructions
3760 //===---------------------------------------------------------------------===//
3762 let ExeDomain = SSEPackedInt in {
// sse2_unpack - unpack/interleave instruction with register (rr) and memory
// (rm) second-source forms; source and result share the value type `vt`.
//   opc / OpcodeStr - opcode and mnemonic.
//   vt              - value type of the result.
//   OpNode          - DAG node matched by both forms.
//   RC / x86memop   - register class and memory operand type.
//   sched           - scheduling info; rm uses sched.Folded / ReadAfterFold.
//   ld_frag         - load fragment for the rm form.
// NOTE(review): the final parameter line is not visible in this listing; the
// two alternative assembly strings suggest an Is2Addr-style flag as in the
// sibling multiclasses -- confirm.
3763 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3764 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3765 X86FoldableSchedWrite sched, PatFrag ld_frag,
3767 def rr : PDI<opc, MRMSrcReg,
3768 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3770 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3771 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3772 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3774 def rm : PDI<opc, MRMSrcMem,
3775 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3777 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3778 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3779 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3780 Sched<[sched.Folded, sched.ReadAfterFold]>;
// 128-bit "v"-prefixed byte and word unpacks (low and high halves), enabled
// only under [HasAVX, NoVLX_Or_NoBWI]. They operate on VR128/i128mem and use
// the plain `load` fragment for the memory operand.
3783 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3784 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3785 i128mem, SchedWriteShuffle.XMM, load, 0>,
3787 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3788 i128mem, SchedWriteShuffle.XMM, load, 0>,
3790 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3791 i128mem, SchedWriteShuffle.XMM, load, 0>,
3793 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3794 i128mem, SchedWriteShuffle.XMM, load, 0>,
// 128-bit "v"-prefixed doubleword and quadword unpacks (low and high halves),
// enabled only under [HasAVX, NoVLX].
3798 let Predicates = [HasAVX, NoVLX] in {
3799 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3800 i128mem, SchedWriteShuffle.XMM, load, 0>,
3802 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3803 i128mem, SchedWriteShuffle.XMM, load, 0>,
3805 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3806 i128mem, SchedWriteShuffle.XMM, load, 0>,
3808 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3809 i128mem, SchedWriteShuffle.XMM, load, 0>,
// 256-bit byte and word unpacks on VR256/i256mem, enabled only under
// [HasAVX2, NoVLX_Or_NoBWI]. Same opcodes and mnemonics as the 128-bit forms,
// distinguished by the VEX_L marker and the YMM scheduling class.
3813 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3814 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3815 i256mem, SchedWriteShuffle.YMM, load, 0>,
3816 VEX_4V, VEX_L, VEX_WIG;
3817 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3818 i256mem, SchedWriteShuffle.YMM, load, 0>,
3819 VEX_4V, VEX_L, VEX_WIG;
3820 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3821 i256mem, SchedWriteShuffle.YMM, load, 0>,
3822 VEX_4V, VEX_L, VEX_WIG;
3823 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3824 i256mem, SchedWriteShuffle.YMM, load, 0>,
3825 VEX_4V, VEX_L, VEX_WIG;
// 256-bit doubleword and quadword unpacks on VR256/i256mem, enabled only under
// [HasAVX2, NoVLX].
3828 let Predicates = [HasAVX2, NoVLX] in {
3829 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3830 i256mem, SchedWriteShuffle.YMM, load, 0>,
3831 VEX_4V, VEX_L, VEX_WIG;
3832 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3833 i256mem, SchedWriteShuffle.YMM, load, 0>,
3834 VEX_4V, VEX_L, VEX_WIG;
3835 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3836 i256mem, SchedWriteShuffle.YMM, load, 0>,
3837 VEX_4V, VEX_L, VEX_WIG;
3838 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3839 i256mem, SchedWriteShuffle.YMM, load, 0>,
3840 VEX_4V, VEX_L, VEX_WIG;
// Non-VEX SSE2 unpacks. $src1 is tied to $dst, and the memory operand goes
// through the `memop` fragment instead of the plain `load` used by the
// "v"-prefixed forms. No trailing 0 argument is passed, so the multiclass
// default applies.
3843 let Constraints = "$src1 = $dst" in {
// Low-half unpacks.
3844 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3845 i128mem, SchedWriteShuffle.XMM, memop>;
3846 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3847 i128mem, SchedWriteShuffle.XMM, memop>;
3848 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3849 i128mem, SchedWriteShuffle.XMM, memop>;
3850 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3851 i128mem, SchedWriteShuffle.XMM, memop>;
// High-half unpacks.
3853 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3854 i128mem, SchedWriteShuffle.XMM, memop>;
3855 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3856 i128mem, SchedWriteShuffle.XMM, memop>;
3857 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3858 i128mem, SchedWriteShuffle.XMM, memop>;
3859 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3860 i128mem, SchedWriteShuffle.XMM, memop>;
3862 } // ExeDomain = SSEPackedInt
3864 //===---------------------------------------------------------------------===//
3865 // SSE2 - Packed Integer Extract and Insert
3866 //===---------------------------------------------------------------------===//
3868 let ExeDomain = SSEPackedInt in {
// sse2_pinsrw - Word insert into a 128-bit vector (opcode 0xC4) via X86pinsrw.
// $src1 is the vector, $src2 the value to insert (a GPR in rr, a 16-bit memory
// operand in rm), and $src3 an 8-bit immediate.
// Two assembly spellings are listed: "pinsrw" with $dst doubling as the first
// source, and "vpinsrw" with a separate $src1.
// NOTE(review): the line choosing between the two spellings is not visible
// here; presumably it keys off Is2Addr -- confirm.
3869 multiclass sse2_pinsrw<bit Is2Addr = 1> {
// Register form: value comes from a 32- or 64-bit GPR.
3870 def rr : Ii8<0xC4, MRMSrcReg,
3871 (outs VR128:$dst), (ins VR128:$src1,
3872 GR32orGR64:$src2, u8imm:$src3),
3874 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3875 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3877 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
3878 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
// Memory form: value is read with an extending 16-bit load.
3879 def rm : Ii8<0xC4, MRMSrcMem,
3880 (outs VR128:$dst), (ins VR128:$src1,
3881 i16mem:$src2, u8imm:$src3),
3883 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3884 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3886 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3888 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// Word extract from a v8i16 register into a 32- or 64-bit GPR (opcode 0xC5),
// selected through X86pextrw. The brace-less `let` below applies only to the
// VEX form that immediately follows it.
3892 let Predicates = [HasAVX, NoBWI] in
3893 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3894 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3895 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3896 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3898 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
// Non-VEX form with the same operands and pattern.
3899 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3900 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3901 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3902 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3904 Sched<[WriteVecExtract]>;
// VEX word insert: instantiated with Is2Addr = 0 and a separate first source
// (VEX_4V).
3907 let Predicates = [HasAVX, NoBWI] in
3908 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
// SSE2 word insert: default Is2Addr, with $src1 tied to $dst.
3910 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3911 defm PINSRW : sse2_pinsrw, PD;
3913 } // ExeDomain = SSEPackedInt
3915 //===---------------------------------------------------------------------===//
3916 // SSE2 - Packed Mask Creation
3917 //===---------------------------------------------------------------------===//
// Byte-mask extraction (opcode 0xD7): X86movmsk of a byte vector is written to
// a 32- or 64-bit GPR.
3919 let ExeDomain = SSEPackedInt in {
// VEX 128-bit form (v16i8 source).
// NOTE(review): the (ins ...) operand line of this def is not visible here.
3921 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3923 "pmovmskb\t{$src, $dst|$dst, $src}",
3924 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3925 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
// VEX 256-bit form (v32i8 source), additionally gated on HasAVX2.
3927 let Predicates = [HasAVX2] in {
3928 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3930 "pmovmskb\t{$src, $dst|$dst, $src}",
3931 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3932 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
// Non-VEX 128-bit form.
3935 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3936 "pmovmskb\t{$src, $dst|$dst, $src}",
3937 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3938 Sched<[WriteVecMOVMSK]>;
3940 } // ExeDomain = SSEPackedInt
3942 //===---------------------------------------------------------------------===//
3943 // SSE2 - Conditional Store
3944 //===---------------------------------------------------------------------===//
// Masked store of an XMM register (opcode 0xF7), selected from the
// int_x86_sse2_maskmov_dqu intrinsic. The instructions have no explicit
// outputs; the address register is an implicit use: EDI outside 64-bit mode,
// RDI in 64-bit mode. Each mode gets a VEX and a non-VEX definition.
3946 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// VEX form, 32-bit addressing.
3947 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
3948 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
3949 (ins VR128:$src, VR128:$mask),
3950 "maskmovdqu\t{$mask, $src|$src, $mask}",
3951 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
// VEX form, 64-bit addressing.
3953 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
3954 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
3955 (ins VR128:$src, VR128:$mask),
3956 "maskmovdqu\t{$mask, $src|$src, $mask}",
3957 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
// Non-VEX form, 32-bit addressing.
3960 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
3961 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
3962 "maskmovdqu\t{$mask, $src|$src, $mask}",
3963 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
// Non-VEX form, 64-bit addressing.
3964 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
3965 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
3966 "maskmovdqu\t{$mask, $src|$src, $mask}",
3967 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
3969 } // ExeDomain = SSEPackedInt
3971 //===---------------------------------------------------------------------===//
3972 // SSE2 - Move Doubleword/Quadword
3973 //===---------------------------------------------------------------------===//
3975 //===---------------------------------------------------------------------===//
3976 // Move Int Doubleword to Packed Double Int
// Moves of a 32-bit or 64-bit integer (from a GPR or from memory) into an XMM
// register, all on opcode 0x6E. The patterned forms select scalar_to_vector of
// the source; the VEX definitions come first, then the non-VEX ones.
3978 let ExeDomain = SSEPackedInt in {
// VEX: GR32 -> XMM.
3979 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
3980 "movd\t{$src, $dst|$dst, $src}",
3982 (v4i32 (scalar_to_vector GR32:$src)))]>,
3983 VEX, Sched<[WriteVecMoveFromGpr]>;
// VEX: 32-bit load -> XMM.
3984 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
3985 "movd\t{$src, $dst|$dst, $src}",
3987 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
3988 VEX, Sched<[WriteVecLoad]>;
// VEX: GR64 -> XMM.
3989 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
3990 "movq\t{$src, $dst|$dst, $src}",
3992 (v2i64 (scalar_to_vector GR64:$src)))]>,
3993 VEX, Sched<[WriteVecMoveFromGpr]>;
// VEX: 64-bit load form with no selection pattern; it is codegen-only but
// forced into the disassembler tables, and marked as a side-effect-free load.
3994 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
3995 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
3996 "movq\t{$src, $dst|$dst, $src}", []>,
3997 VEX, Sched<[WriteVecLoad]>;
// VEX: bitcast GR64 -> FR64 (codegen-only).
3998 let isCodeGenOnly = 1 in
3999 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4000 "movq\t{$src, $dst|$dst, $src}",
4001 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4002 VEX, Sched<[WriteVecMoveFromGpr]>;
// Non-VEX counterparts of the five definitions above, in the same order.
4004 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4005 "movd\t{$src, $dst|$dst, $src}",
4007 (v4i32 (scalar_to_vector GR32:$src)))]>,
4008 Sched<[WriteVecMoveFromGpr]>;
4009 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4010 "movd\t{$src, $dst|$dst, $src}",
4012 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4013 Sched<[WriteVecLoad]>;
4014 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4015 "movq\t{$src, $dst|$dst, $src}",
4017 (v2i64 (scalar_to_vector GR64:$src)))]>,
4018 Sched<[WriteVecMoveFromGpr]>;
4019 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4020 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4021 "movq\t{$src, $dst|$dst, $src}", []>,
4022 Sched<[WriteVecLoad]>;
4023 let isCodeGenOnly = 1 in
4024 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4025 "movq\t{$src, $dst|$dst, $src}",
4026 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4027 Sched<[WriteVecMoveFromGpr]>;
4028 } // ExeDomain = SSEPackedInt
4030 //===---------------------------------------------------------------------===//
4031 // Move Int Doubleword to Single Scalar
// Bitcast GR32 -> FR32 using movd (opcode 0x6E). Both definitions are
// codegen-only; the first is the VEX form, the second the non-VEX form.
4033 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4034 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4035 "movd\t{$src, $dst|$dst, $src}",
4036 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4037 VEX, Sched<[WriteVecMoveFromGpr]>;
4039 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4040 "movd\t{$src, $dst|$dst, $src}",
4041 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4042 Sched<[WriteVecMoveToGpr]>;
4044 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4046 //===---------------------------------------------------------------------===//
4047 // Move Packed Doubleword Int first element to Doubleword Int
// movd out of an XMM register (opcode 0x7E): an i32 element extracted from a
// v4i32 value is either written to GR32 (rr) or stored to memory (mr). The
// store patterns extract element 0.
4049 let ExeDomain = SSEPackedInt in {
// VEX forms.
4050 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4051 "movd\t{$src, $dst|$dst, $src}",
4052 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4054 Sched<[WriteVecMoveToGpr]>;
4055 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4056 (ins i32mem:$dst, VR128:$src),
4057 "movd\t{$src, $dst|$dst, $src}",
4058 [(store (i32 (extractelt (v4i32 VR128:$src),
4059 (iPTR 0))), addr:$dst)]>,
4060 VEX, Sched<[WriteVecStore]>;
// Non-VEX forms.
4061 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4062 "movd\t{$src, $dst|$dst, $src}",
4063 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4065 Sched<[WriteVecMoveToGpr]>;
4066 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4067 "movd\t{$src, $dst|$dst, $src}",
4068 [(store (i32 (extractelt (v4i32 VR128:$src),
4069 (iPTR 0))), addr:$dst)]>,
4070 Sched<[WriteVecStore]>;
4071 } // ExeDomain = SSEPackedInt
4073 //===---------------------------------------------------------------------===//
4074 // Move Packed Quadword Int first element to Quadword Int
// movq out of an XMM register (opcode 0x7E): an i64 element extracted from a
// v2i64 value is written to GR64 (rr). The mr forms carry no selection pattern.
4076 let ExeDomain = SSEPackedInt in {
// Register destinations share the WriteVecMoveToGpr scheduling class.
4077 let SchedRW = [WriteVecMoveToGpr] in {
4078 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4079 "movq\t{$src, $dst|$dst, $src}",
4080 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4084 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4085 "movq\t{$src, $dst|$dst, $src}",
4086 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
// Memory-destination forms: codegen-only but forced into the disassembler
// tables, and marked as side-effect-free stores.
4090 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4091 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4092 (ins i64mem:$dst, VR128:$src),
4093 "movq\t{$src, $dst|$dst, $src}", []>,
4094 VEX, Sched<[WriteVecStore]>;
4095 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4096 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4097 "movq\t{$src, $dst|$dst, $src}", []>,
4098 Sched<[WriteVecStore]>;
4099 } // ExeDomain = SSEPackedInt
4101 //===---------------------------------------------------------------------===//
4102 // Bitcast FR64 <-> GR64
// Bitcast FR64 -> GR64 using movq (opcode 0x7E). Both definitions are
// codegen-only; the first is the VEX form, the second the non-VEX form.
4104 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4105 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4106 "movq\t{$src, $dst|$dst, $src}",
4107 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4108 VEX, Sched<[WriteVecMoveToGpr]>;
4110 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4111 "movq\t{$src, $dst|$dst, $src}",
4112 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4113 Sched<[WriteVecMoveToGpr]>;
4114 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4116 //===---------------------------------------------------------------------===//
4117 // Move Scalar Single to Doubleword Int
// Bitcast FR32 -> GR32 using movd (opcode 0x7E). Both definitions are
// codegen-only; the first is the VEX form, the second the non-VEX form.
4119 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4120 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4121 "movd\t{$src, $dst|$dst, $src}",
4122 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4123 VEX, Sched<[WriteVecMoveToGpr]>;
4124 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4125 "movd\t{$src, $dst|$dst, $src}",
4126 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4127 Sched<[WriteVecMoveToGpr]>;
4128 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// AVX selection patterns that map zero-extending scalar-to-vector moves and
// zero-extending 32-bit vector loads onto the VEX movd/movq definitions.
4130 let Predicates = [UseAVX] in {
// X86vzmovl of a scalar_to_vector from a GPR is a single vmovd/vmovq.
4131 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4132 (VMOVDI2PDIrr GR32:$src)>;
4134 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4135 (VMOV64toPQIrr GR64:$src)>;
4137 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4138 // These instructions also write zeros in the high part of a 256-bit register.
4139 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4140 (VMOVDI2PDIrm addr:$src)>;
4141 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4142 (VMOVDI2PDIrm addr:$src)>;
// The 256-bit result reuses the 128-bit load, placed in the sub_xmm subregister.
4143 def : Pat<(v8i32 (X86vzload32 addr:$src)),
4144 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
// SSE2 selection patterns that map zero-extending scalar-to-vector moves and
// zero-extending 32-bit vector loads onto the non-VEX movd/movq definitions.
4147 let Predicates = [UseSSE2] in {
4148 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4149 (MOVDI2PDIrr GR32:$src)>;
4151 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4152 (MOV64toPQIrr GR64:$src)>;
4153 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4154 (MOVDI2PDIrm addr:$src)>;
4155 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4156 (MOVDI2PDIrm addr:$src)>;
4159 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4160 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
// the "movd" aliases below for the 64-bit GPR <-> XMM moves. The final 0
// argument of each InstAlias is left as written.
4162 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4163 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4164 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4165 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4166 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4167 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4168 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4169 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4170 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4172 //===---------------------------------------------------------------------===//
4173 // SSE2 - Move Quadword
4174 //===---------------------------------------------------------------------===//
4176 //===---------------------------------------------------------------------===//
4177 // Move Quadword Int to Packed Quadword Int
// 64-bit load into an XMM register (opcode 0x7E with the XS prefix), selected
// as scalar_to_vector of an i64 load. VEX form requires UseAVX; the non-VEX
// form requires UseSSE2.
4180 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4181 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4182 "vmovq\t{$src, $dst|$dst, $src}",
4184 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4185 VEX, Requires<[UseAVX]>, VEX_WIG;
4186 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4187 "movq\t{$src, $dst|$dst, $src}",
4189 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4190 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4191 } // ExeDomain, SchedRW
4193 //===---------------------------------------------------------------------===//
4194 // Move Packed Quadword Int to Quadword Int
// 64-bit store from an XMM register (opcode 0xD6): stores element 0 of a v2i64
// value. The first definition is the VEX form, the second the non-VEX form.
4196 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4197 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4198 "movq\t{$src, $dst|$dst, $src}",
4199 [(store (i64 (extractelt (v2i64 VR128:$src),
4200 (iPTR 0))), addr:$dst)]>,
4202 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4203 "movq\t{$src, $dst|$dst, $src}",
4204 [(store (i64 (extractelt (v2i64 VR128:$src),
4205 (iPTR 0))), addr:$dst)]>;
4206 } // ExeDomain, SchedRW
4208 // For disassembler only
// Register-to-register encodings of opcode 0xD6. They have no selection
// pattern and are codegen-only, but are forced into the disassembler tables.
4209 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4210 SchedRW = [SchedWriteVecLogic.XMM] in {
4211 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4212 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4213 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4214 "movq\t{$src, $dst|$dst, $src}", []>;
// The ".s" mnemonics are aliases for the two definitions above.
4217 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4218 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4219 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4220 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
// Patterns for zero-extending 64-bit vector loads (X86vzload64) and 64-bit
// extract-stores (X86vextractstore64), first for AVX and then for SSE2.
4222 let Predicates = [UseAVX] in {
4223 def : Pat<(v2i64 (X86vzload64 addr:$src)),
4224 (VMOVQI2PQIrm addr:$src)>;
// The 256-bit result reuses the 128-bit load, placed in the sub_xmm subregister.
4225 def : Pat<(v4i64 (X86vzload64 addr:$src)),
4226 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4228 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4229 (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4232 let Predicates = [UseSSE2] in {
4233 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4235 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4236 (MOVPQI2QImr addr:$dst, VR128:$src)>;
4239 //===---------------------------------------------------------------------===//
4240 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4241 // IA32 document. movq xmm1, xmm2 does clear the high bits.
// XMM-to-XMM movq (opcode 0x7E with the XS prefix), selected for X86vzmovl on
// v2i64. VEX form requires UseAVX; the non-VEX form requires UseSSE2.
4243 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4244 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4245 "vmovq\t{$src, $dst|$dst, $src}",
4246 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4247 XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4248 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4249 "movq\t{$src, $dst|$dst, $src}",
4250 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4251 XS, Requires<[UseSSE2]>;
4252 } // ExeDomain, SchedRW
// Extra X86vzmovl patterns that reuse the XMM-to-XMM movq definitions for
// types other than v2i64.
// v2f64, AVX.
4254 let Predicates = [UseAVX] in {
4255 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4256 (VMOVZPQILo2PQIrr VR128:$src)>;
// v2f64, SSE2.
4258 let Predicates = [UseSSE2] in {
4259 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4260 (MOVZPQILo2PQIrr VR128:$src)>;
// 256-bit types, AVX: apply the 128-bit instruction to the sub_xmm subregister
// of the source and wrap the result with SUBREG_TO_REG.
4263 let Predicates = [UseAVX] in {
4264 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4265 (SUBREG_TO_REG (i32 0),
4266 (v2f64 (VMOVZPQILo2PQIrr
4267 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4269 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4270 (SUBREG_TO_REG (i32 0),
4271 (v2i64 (VMOVZPQILo2PQIrr
4272 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4276 //===---------------------------------------------------------------------===//
4277 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4278 //===---------------------------------------------------------------------===//
// sse3_replicate_sfp - Single-source instructions selected from OpNode.
// Produces a register form (rr) and a memory form (rm); the rm form reads its
// operand through mem_frag and is scheduled as sched.Folded.
4280 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4281 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4282 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
// Register form.
4283 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4284 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4285 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
// Memory form.
4287 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4288 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4289 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4290 Sched<[sched.Folded]>;
// MOVSHDUP (opcode 0x16) and MOVSLDUP (opcode 0x12) instantiations.
// VEX forms, 128-bit (v4f32) and 256-bit (v8f32), under [HasAVX, NoVLX]; they
// use the loadv4f32/loadv8f32 fragments.
4293 let Predicates = [HasAVX, NoVLX] in {
4294 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4295 v4f32, VR128, loadv4f32, f128mem,
4296 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4297 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4298 v4f32, VR128, loadv4f32, f128mem,
4299 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4300 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4301 v8f32, VR256, loadv8f32, f256mem,
4302 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4303 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4304 v8f32, VR256, loadv8f32, f256mem,
4305 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
// Non-VEX 128-bit forms; they use the memopv4f32 fragment.
4307 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4308 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4309 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4310 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
// Integer-typed (v4i32/v8i32) X86Movshdup/X86Movsldup nodes are selected to
// the same instructions defined for the floating-point types.
// AVX: 128-bit and 256-bit, memory operands matched with `load`.
4312 let Predicates = [HasAVX, NoVLX] in {
4313 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4314 (VMOVSHDUPrr VR128:$src)>;
4315 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4316 (VMOVSHDUPrm addr:$src)>;
4317 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4318 (VMOVSLDUPrr VR128:$src)>;
4319 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4320 (VMOVSLDUPrm addr:$src)>;
4321 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4322 (VMOVSHDUPYrr VR256:$src)>;
4323 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4324 (VMOVSHDUPYrm addr:$src)>;
4325 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4326 (VMOVSLDUPYrr VR256:$src)>;
4327 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4328 (VMOVSLDUPYrm addr:$src)>;
// SSE3: 128-bit only, memory operands matched with `memop`.
4331 let Predicates = [UseSSE3] in {
4332 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4333 (MOVSHDUPrr VR128:$src)>;
4334 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4335 (MOVSHDUPrm addr:$src)>;
4336 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4337 (MOVSLDUPrr VR128:$src)>;
4338 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4339 (MOVSLDUPrm addr:$src)>;
4342 //===---------------------------------------------------------------------===//
4343 // SSE3 - Replicate Double FP - MOVDDUP
4344 //===---------------------------------------------------------------------===//
// sse3_replicate_dfp - 128-bit MOVDDUP (opcode 0x12).
// The rr form selects (v2f64 (X86Movddup $src)). The rm form takes a 64-bit
// memory operand (f64mem) and its pattern is built from scalar_to_vector of a
// loadf64; it is scheduled as sched.XMM.Folded.
// NOTE(review): the opening lines of the rm pattern are not visible here.
4346 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4347 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4348 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4349 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4351 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4352 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4355 (scalar_to_vector (loadf64 addr:$src)))))]>,
4356 Sched<[sched.XMM.Folded]>;
4359 // FIXME: Merge with above classes when there are patterns for the ymm version
// sse3_replicate_dfp_y - 256-bit MOVDDUP (opcode 0x12) on VR256. The rm form
// takes a full 256-bit memory operand and selects X86Movddup of a loadv4f64.
4360 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4361 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4362 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4363 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4365 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4366 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4368 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4369 Sched<[sched.YMM.Folded]>;
// MOVDDUP instantiations: VEX 128-bit and 256-bit forms under
// [HasAVX, NoVLX], followed by the non-VEX 128-bit form.
4372 let Predicates = [HasAVX, NoVLX] in {
4373 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4375 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4376 VEX, VEX_L, VEX_WIG;
4379 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
// Fold a v2f64 load (non-volatile full load, or zero-extending 64-bit load)
// feeding X86Movddup into the memory form of MOVDDUP.
// NOTE(review): Requires<[HasAVX]> on the two AVX patterns looks redundant
// with the enclosing Predicates list -- confirm which one takes effect.
4382 let Predicates = [HasAVX, NoVLX] in {
4383 def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
4384 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4385 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4386 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4389 let Predicates = [UseSSE3] in {
4390 // No need for aligned memory as this only loads 64-bits.
4391 def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
4392 (MOVDDUPrm addr:$src)>;
4393 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4394 (MOVDDUPrm addr:$src)>;
4397 //===---------------------------------------------------------------------===//
4398 // SSE3 - Move Unaligned Integer
4399 //===---------------------------------------------------------------------===//
// LDDQU (opcode 0xF0): memory-only loads selected from the ldu_dq intrinsics.
// VEX 128-bit and 256-bit forms under HasAVX, then the non-VEX 128-bit form.
4401 let Predicates = [HasAVX] in {
4402 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4403 "vlddqu\t{$src, $dst|$dst, $src}",
4404 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4405 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4406 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4407 "vlddqu\t{$src, $dst|$dst, $src}",
4408 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4409 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4412 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4413 "lddqu\t{$src, $dst|$dst, $src}",
4414 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4415 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4417 //===---------------------------------------------------------------------===//
4418 // SSE3 - Arithmetic
4419 //===---------------------------------------------------------------------===//
// sse3_addsub - Two-source ADDSUB instructions (opcode 0xD0) selected from
// X86Addsub. Produces a register-register (rr) and a register-memory (rm)
// definition; rm reads $src2 through ld_frag. Each definition lists a
// two-operand and a three-operand assembly string.
// NOTE(review): the line selecting between the two strings is not visible
// here; presumably it keys off Is2Addr -- confirm.
4421 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4422 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4423 PatFrag ld_frag, bit Is2Addr = 1> {
// Register-register form.
4424 def rr : I<0xD0, MRMSrcReg,
4425 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4427 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4428 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4429 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
// Register-memory form, scheduled as a folded load.
4431 def rm : I<0xD0, MRMSrcMem,
4432 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4434 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4435 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4436 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4437 Sched<[sched.Folded, sched.ReadAfterFold]>;
// ADDSUB instantiations. The single-precision forms use the XD prefix and the
// double-precision forms use PD.
// VEX forms (128-bit and 256-bit), instantiated with Is2Addr = 0.
4440 let Predicates = [HasAVX] in {
4441 let ExeDomain = SSEPackedSingle in {
4442 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4443 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4444 XD, VEX_4V, VEX_WIG;
4445 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4446 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4447 XD, VEX_4V, VEX_L, VEX_WIG;
4449 let ExeDomain = SSEPackedDouble in {
4450 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4451 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4452 PD, VEX_4V, VEX_WIG;
4453 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4454 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4455 PD, VEX_4V, VEX_L, VEX_WIG;
// Non-VEX SSE3 forms: $src1 is tied to $dst and memory operands use the memop
// fragments.
4458 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4459 let ExeDomain = SSEPackedSingle in
4460 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4461 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4462 let ExeDomain = SSEPackedDouble in
4463 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4464 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4467 //===---------------------------------------------------------------------===//
4468 // SSE3 Instructions
4469 //===---------------------------------------------------------------------===//
// S3D_Int - Two-source instructions built on the S3DI format class.
// Produces rr and rm definitions selecting (vt (OpNode $src1, $src2)); rm
// reads $src2 through ld_frag. Each definition lists a two-operand and a
// three-operand assembly string.
// NOTE(review): the end of the parameter list and the selector between the two
// strings are not visible here -- presumably an Is2Addr bit; confirm.
4472 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4473 X86MemOperand x86memop, SDNode OpNode,
4474 X86FoldableSchedWrite sched, PatFrag ld_frag,
// Register-register form.
4476 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4478 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4479 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4480 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
// Register-memory form, scheduled as a folded load.
4483 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4485 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4486 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4487 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4488 Sched<[sched.Folded, sched.ReadAfterFold]>;
// S3_Int - Same shape as S3D_Int, but built on the S3I format class.
// Produces rr and rm definitions selecting (vt (OpNode $src1, $src2)); rm
// reads $src2 through ld_frag.
// NOTE(review): the end of the parameter list and the selector between the two
// asm strings are not visible here -- presumably an Is2Addr bit; confirm.
4490 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4491 X86MemOperand x86memop, SDNode OpNode,
4492 X86FoldableSchedWrite sched, PatFrag ld_frag,
// Register-register form.
4494 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4496 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4497 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4498 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
// Register-memory form, scheduled as a folded load.
4501 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4503 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4504 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4505 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4506 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Horizontal add (opcode 0x7C, X86fhadd) and horizontal subtract (opcode 0x7D,
// X86fhsub). Single-precision forms instantiate S3D_Int; double-precision
// forms instantiate S3_Int.
// VEX forms, 128-bit and 256-bit, with the extra trailing 0 argument.
4509 let Predicates = [HasAVX] in {
4510 let ExeDomain = SSEPackedSingle in {
4511 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4512 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4513 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4514 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4515 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4516 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4517 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4518 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4520 let ExeDomain = SSEPackedDouble in {
4521 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4522 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4523 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4524 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4525 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4526 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4527 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4528 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
// Non-VEX forms: $src1 is tied to $dst and memory operands use the memop
// fragments.
4532 let Constraints = "$src1 = $dst" in {
4533 let ExeDomain = SSEPackedSingle in {
4534 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4535 WriteFHAdd, memopv4f32>;
4536 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4537 WriteFHAdd, memopv4f32>;
4539 let ExeDomain = SSEPackedDouble in {
4540 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4541 WriteFHAdd, memopv2f64>;
4542 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4543 WriteFHAdd, memopv2f64>;
4547 //===---------------------------------------------------------------------===//
4548 // SSSE3 - Packed Absolute Instructions
4549 //===---------------------------------------------------------------------===//
4551 /// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Defines a 128-bit register form (rr) and memory form (rm) selecting
/// (vt (OpNode src)); rm reads its operand through ld_frag and is scheduled as
/// sched.XMM.Folded.
// NOTE(review): the (ins ...) operand lines of both defs are not visible here.
4552 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4553 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4554 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4556 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4557 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4560 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4562 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4564 (vt (OpNode (ld_frag addr:$src))))]>,
4565 Sched<[sched.XMM.Folded]>;
4568 /// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit counterpart of the unary multiclass: defines Yrr/Yrm on VR256.
/// The Yrm pattern always uses the plain `load` fragment (there is no
/// ld_frag parameter) and is scheduled as sched.YMM.Folded.
4569 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4570 SDNode OpNode, X86SchedWriteWidths sched> {
4571 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4573 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4574 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4577 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4579 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4581 (vt (OpNode (load addr:$src))))]>,
4582 Sched<[sched.YMM.Folded]>;
// Packed absolute value instantiations (opcodes 0x1C/0x1D/0x1E for
// byte/word/dword), all selecting the generic `abs` node.
// The byte/word VEX forms are gated on NoVLX_Or_NoBWI and the dword forms on
// NoVLX. NOTE(review): presumably so the AVX-512 definitions are preferred
// when those features are present -- confirm.
4585 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4586 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4587 load>, VEX, VEX_WIG;
4588 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4589 load>, VEX, VEX_WIG;
4591 let Predicates = [HasAVX, NoVLX] in {
4592 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4593 load>, VEX, VEX_WIG;
// 256-bit AVX2 forms: same defm names, the multiclass adds the Yrr/Yrm suffix.
4595 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4596 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4597 VEX, VEX_L, VEX_WIG;
4598 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4599 VEX, VEX_L, VEX_WIG;
4601 let Predicates = [HasAVX2, NoVLX] in {
4602 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4603 VEX, VEX_L, VEX_WIG;
// Legacy SSSE3 encodings (the remaining template arguments of each defm are
// not visible in this listing).
4606 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4608 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4610 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4613 //===---------------------------------------------------------------------===//
4614 // SSSE3 - Packed Binary Operator Instructions
4615 //===---------------------------------------------------------------------===//
4617 /// SS3I_binop_rm - Simple SSSE3 bin op
/// Defines a register-register form (rr, marked commutable by default) and a
/// register-memory form (rm) over register class RC. The pattern result is
/// typed DstVT while the first source is typed OpVT, so the two may differ.
/// The rm form reads $src2 through memop_frag.
/// Two assembly strings are provided: a 2-operand one and a 3-operand one.
/// NOTE(review): presumably chosen by Is2Addr; the selecting line is not
/// visible in this listing -- confirm.
4618 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4619 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4620 PatFrag memop_frag, X86MemOperand x86memop,
4621 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4622 let isCommutable = 1 in
4623 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4624 (ins RC:$src1, RC:$src2),
4626 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4627 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4628 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4630 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4631 (ins RC:$src1, x86memop:$src2),
4633 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4634 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4636 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4637 Sched<[sched.Folded, sched.ReadAfterFold]>;
4640 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// Intrinsic-based variant: the patterns match the 128-bit intrinsic IntId128
/// directly on VR128 operands. The rm form takes an i128mem operand and reads
/// it through ld_frag.
4641 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4642 Intrinsic IntId128, X86FoldableSchedWrite sched,
4643 PatFrag ld_frag, bit Is2Addr = 1> {
4644 let isCommutable = 1 in
4645 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4646 (ins VR128:$src1, VR128:$src2),
4648 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4649 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4650 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4652 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4653 (ins VR128:$src1, i128mem:$src2),
4655 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4656 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4658 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4659 Sched<[sched.Folded, sched.ReadAfterFold]>;
/// SS3I_binop_rm_int_y - 256-bit intrinsic-based SSSE3-style bin op.
/// Defines Yrr/Yrm on VR256 matching IntId256. Only the 3-operand assembly
/// string is provided, and the Yrm pattern always uses the plain `load`
/// fragment on an i256mem operand.
4662 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4664 X86FoldableSchedWrite sched> {
4665 let isCommutable = 1 in
4666 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4667 (ins VR256:$src1, VR256:$src2),
4668 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4669 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4671 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4672 (ins VR256:$src1, i256mem:$src2),
4673 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4675 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4676 Sched<[sched.Folded, sched.ReadAfterFold]>;
// 128-bit VEX forms of pshufb/pmaddubsw/pmulhrsw, gated on NoVLX_Or_NoBWI.
// ImmT = NoImm marks them as taking no immediate, and the final template
// argument 0 is Is2Addr (three-operand assembly syntax).
// isCommutable = 0 overrides the multiclass default for the defs inside the
// inner let. NOTE(review): VPMULHRSW appears to sit after that inner scope
// (the closing brace is not visible in this listing) -- confirm.
4679 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4680 let isCommutable = 0 in {
4681 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4682 VR128, load, i128mem,
4683 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4684 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4685 v16i8, VR128, load, i128mem,
4686 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4688 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4689 VR128, load, i128mem,
4690 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
// 128-bit VEX forms of the horizontal add/sub and sign instructions. These
// are gated on HasAVX only. ImmT = NoImm marks them as taking no immediate;
// isCommutable = 0 overrides the multiclass default, and the final template
// argument 0 is Is2Addr (three-operand assembly syntax).
// Every instruction in this group ignores VEX.W, so each carries VEX_WIG
// (VPHSUBD previously lacked it, unlike all of its siblings).
4693 let ImmT = NoImm, Predicates = [HasAVX] in {
4694 let isCommutable = 0 in {
4695 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4697 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4698 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4700 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4701 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4703 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4704 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4706 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4707 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4708 int_x86_ssse3_psign_b_128,
4709 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4710 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4711 int_x86_ssse3_psign_w_128,
4712 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4713 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4714 int_x86_ssse3_psign_d_128,
4715 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4716 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4717 int_x86_ssse3_phadd_sw_128,
4718 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4719 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4720 int_x86_ssse3_phsub_sw_128,
4721 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
// 256-bit AVX2 forms of pshufb/pmaddubsw/pmulhrsw on VR256/i256mem, gated on
// NoVLX_Or_NoBWI. They use the YMM scheduling classes and add VEX_L.
4725 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4726 let isCommutable = 0 in {
4727 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4728 VR256, load, i256mem,
4729 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4730 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4731 v32i8, VR256, load, i256mem,
4732 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4734 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4735 VR256, load, i256mem,
4736 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
// 256-bit AVX2 forms of the horizontal add/sub and sign instructions, gated
// on HasAVX2 only. The generic-node forms use SS3I_binop_rm on VR256; the
// sign and saturating forms match the int_x86_avx2_* intrinsics through
// SS3I_binop_rm_int_y.
// Every instruction in this group ignores VEX.W, so each carries VEX_WIG
// (VPHSUBDY previously lacked it, unlike all of its siblings).
4739 let ImmT = NoImm, Predicates = [HasAVX2] in {
4740 let isCommutable = 0 in {
4741 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4742 VR256, load, i256mem,
4743 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4744 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4746 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4747 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4748 VR256, load, i256mem,
4749 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4750 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4752 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4753 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4754 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4755 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4756 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4757 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4758 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4759 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4760 int_x86_avx2_phadd_sw,
4761 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4762 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4763 int_x86_avx2_phsub_sw,
4764 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSSE3 encodings of the binary operators. They are two-address
// ("$src1 = $dst"), use the default Is2Addr = 1, and read memory operands
// through the `memop` fragment.
4768 // None of these have i8 immediate fields.
4769 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4770 let isCommutable = 0 in {
4771 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4772 memop, i128mem, SchedWritePHAdd.XMM>;
4773 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4774 memop, i128mem, SchedWritePHAdd.XMM>;
4775 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4776 memop, i128mem, SchedWritePHAdd.XMM>;
4777 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4778 memop, i128mem, SchedWritePHAdd.XMM>;
4779 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4780 SchedWriteVecALU.XMM, memop>;
4781 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4782 SchedWriteVecALU.XMM, memop>;
4783 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4784 SchedWriteVecALU.XMM, memop>;
4785 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4786 memop, i128mem, SchedWriteVarShuffle.XMM>;
4787 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4788 int_x86_ssse3_phadd_sw_128,
4789 SchedWritePHAdd.XMM, memop>;
4790 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4791 int_x86_ssse3_phsub_sw_128,
4792 SchedWritePHAdd.XMM, memop>;
4793 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4794 v16i8, VR128, memop, i128mem,
4795 SchedWriteVecIMul.XMM>;
4797 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4798 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4801 //===---------------------------------------------------------------------===//
4802 // SSSE3 - Packed Align Instruction Patterns
4803 //===---------------------------------------------------------------------===//
/// ssse3_palignr - PALIGNR (opcode 0x0F via SS3AI) with a u8 immediate.
/// Defines rri (register source) and rmi (memory source read through
/// memop_frag). Both patterns select X86PAlignr on ($src1, $src2, imm $src3)
/// typed as VT. Two assembly strings are provided for the 2- and 3-operand
/// syntaxes. NOTE(review): presumably chosen by Is2Addr; the selecting line
/// is not visible in this listing -- confirm.
4805 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4806 PatFrag memop_frag, X86MemOperand x86memop,
4807 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4808 let hasSideEffects = 0 in {
4809 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4810 (ins RC:$src1, RC:$src2, u8imm:$src3),
4812 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4814 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4815 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
4818 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4819 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4821 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4823 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4824 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4825 (memop_frag addr:$src2),
4826 (i8 imm:$src3))))]>,
4827 Sched<[sched.Folded, sched.ReadAfterFold]>;
// PALIGNR instantiations: 128-bit VEX (HasAVX), 256-bit VEX (HasAVX2, VEX_L)
// and the legacy two-address SSSE3 form (UseSSSE3, "$src1 = $dst", memop).
// The VEX forms pass Is2Addr = 0 and use the plain `load` fragment.
4831 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4832 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4833 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4834 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4835 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4836 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4837 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4838 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4839 SchedWriteShuffle.XMM>;
4841 //===---------------------------------------------------------------------===//
4842 // SSE3 - Thread synchronization
4843 //===---------------------------------------------------------------------===//
// MONITOR and MWAIT take no explicit operands; their register inputs are
// modelled as implicit uses. All three defs require HasSSE3 and are scheduled
// as WriteSystem.
4845 let SchedRW = [WriteSystem] in {
// 32-bit mode: implicit uses are EAX, ECX and EDX.
4846 let Uses = [EAX, ECX, EDX] in
4847 def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4848 TB, Requires<[HasSSE3, Not64BitMode]>;
// 64-bit mode: the first implicit use is RAX instead of EAX.
4849 let Uses = [RAX, ECX, EDX] in
4850 def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4851 TB, Requires<[HasSSE3, In64BitMode]>;
// MWAIT is selected from the int_x86_sse3_mwait intrinsic on ECX and EAX.
4853 let Uses = [ECX, EAX] in
4854 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
4855 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
// Assembler aliases that accept mwait/monitor written with explicit register
// operands and map them onto the operand-less instruction definitions. The
// register names differ between 32-bit and 64-bit mode.
4858 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4859 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4861 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4862 Requires<[Not64BitMode]>;
4863 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4864 Requires<[In64BitMode]>;
4866 //===----------------------------------------------------------------------===//
4867 // SSE4.1 - Packed Move with Sign/Zero Extend
4868 // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4869 //===----------------------------------------------------------------------===//
/// SS41I_pmovx_rrrm - Register (rr) and memory (rm) forms of a packed
/// sign/zero-extend move. Both defs have empty pattern lists. The input
/// register class (InRC) may differ from the output class (OutRC), and the
/// memory operand type is MemOp.
4871 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4872 RegisterClass OutRC, RegisterClass InRC,
4873 X86FoldableSchedWrite sched> {
4874 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4875 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4878 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4879 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4880 Sched<[sched.Folded]>;
/// SS41I_pmovx_rm_all - Instantiates one extend opcode in three flavours:
///   NAME     - legacy form, VR128 -> VR128, memory operand MemOp
///   V#NAME   - "v"-prefixed form under [HasAVX, prd], VR128 -> VR128
///   V#NAME#Y - "v"-prefixed form under [HasAVX2, prd], VR128 -> VR256,
///              memory operand MemYOp, VEX_L, scheduled as WriteShuffle256
4883 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4884 X86MemOperand MemOp, X86MemOperand MemYOp,
4886 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4887 SchedWriteShuffle.XMM>;
4888 let Predicates = [HasAVX, prd] in
4889 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4890 VR128, VR128, SchedWriteShuffle.XMM>,
4892 let Predicates = [HasAVX2, prd] in
4893 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4894 VR256, VR128, WriteShuffle256>,
4895 VEX, VEX_L, VEX_WIG;
/// SS41I_pmovx_rm - Creates both the sign-extend (PMOVSX#NAME, "pmovsx" +
/// OpcodeStr) and zero-extend (PMOVZX#NAME, "pmovzx" + OpcodeStr) families for
/// one element-size pair. The zero-extend opcode is the sign-extend opcode
/// plus 0x10.
4898 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4899 X86MemOperand MemYOp, Predicate prd> {
4900 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4901 MemOp, MemYOp, prd>;
4902 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4903 !strconcat("pmovzx", OpcodeStr),
4904 MemOp, MemYOp, prd>;
// Instantiate every source/destination element-size pair. The two memory
// operand arguments are MemOp (128-bit destination forms) and MemYOp (256-bit
// destination forms). Only the byte->word pair is gated on NoVLX_Or_NoBWI;
// the rest use NoVLX.
4907 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4908 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4909 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4911 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4912 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4914 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
/// SS41I_pmovx_avx2_patterns - Selection patterns for the 256-bit (Y)
/// extend instructions whose names start with OpcPrefix.
///   ExtTy   - "s" or "z"; concatenated with "extloadvi8/16/32" to pick the
///             extending-load fragment.
///   ExtOp   - extend node used when the source vector supplies exactly as
///             many elements as the result (BW, WD, DQ).
///   InVecOp - in-vector extend node used when only some of the source
///             elements are consumed (BD, BQ, WQ).
4917 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4918 SDNode ExtOp, SDNode InVecOp> {
4919 // Register-Register patterns
4920 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4921 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4922 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4924 let Predicates = [HasAVX2, NoVLX] in {
4925 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4926 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4927 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4928 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4930 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4931 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4932 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4933 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4935 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4936 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4939 // Simple Register-Memory patterns
4940 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4941 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4942 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4944 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
4945 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4948 let Predicates = [HasAVX2, NoVLX] in {
4949 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4950 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4951 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4952 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4954 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4955 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4956 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4957 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4959 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
4960 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4963 // AVX2 Register-Memory patterns
// These fold an explicit full-vector load, a scalar load wrapped in
// scalar_to_vector, or an X86vzload64 into the memory form.
4964 let Predicates = [HasAVX2, NoVLX] in {
4965 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
4966 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4968 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4969 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4970 def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4971 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4973 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
4974 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4976 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
4977 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4978 def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4979 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4981 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4982 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4983 def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
4984 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
// Instantiate for sign extension ("s") and zero extension ("z").
4988 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
4989 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
4991 // SSE4.1/AVX patterns.
// Selection patterns for the 128-bit extend instructions whose names start
// with OpcPrefix. ExtTy ("s"/"z") picks the extending-load fragment, and
// ExtOp is the extend node applied to a VR128 source or a folded load.
4992 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
// Register-register patterns.
4994 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4995 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
4996 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
4998 let Predicates = [HasAVX, NoVLX] in {
4999 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5000 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5001 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5002 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5004 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5005 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5006 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5007 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5009 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5010 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
// Extending-load patterns (ExtTy#"extloadvi*").
5012 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5013 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5014 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5016 let Predicates = [HasAVX, NoVLX] in {
5017 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5018 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5019 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5020 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5022 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5023 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5024 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5025 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5027 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5028 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// Load-folding patterns: scalar loads wrapped in scalar_to_vector (integer
// and floating-point typed), X86vzload*, and full-vector loads.
5030 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5031 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5032 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5033 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5034 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5035 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5036 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5037 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5038 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5040 let Predicates = [HasAVX, NoVLX] in {
5041 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5042 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5043 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5044 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5045 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5046 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5048 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5049 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5050 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5051 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5053 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5054 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5055 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5056 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5057 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5058 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5059 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5060 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5062 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5063 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5064 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5065 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5066 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5067 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5069 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5070 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5071 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5072 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5073 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5074 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5075 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5076 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// VEX-prefixed instantiations use the in-vector extend nodes.
5080 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5081 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
// Legacy SSE4.1 instantiations, additionally wrapped in UseSSE41.
5083 let Predicates = [UseSSE41] in {
5084 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5085 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5088 //===----------------------------------------------------------------------===//
5089 // SSE4.1 - Extract Instructions
5090 //===----------------------------------------------------------------------===//
5092 /// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// rr writes the X86pextrb result to a GR32orGR64 register; mr truncates the
/// X86pextrb result to i8 and stores it to an i8mem destination.
5093 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5094 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5095 (ins VR128:$src1, u8imm:$src2),
5096 !strconcat(OpcodeStr,
5097 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5098 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5100 Sched<[WriteVecExtract]>;
5101 let hasSideEffects = 0, mayStore = 1 in
5102 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5103 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5104 !strconcat(OpcodeStr,
5105 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5106 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5107 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VEX form is gated on [HasAVX, NoBWI]; the legacy form follows.
5110 let Predicates = [HasAVX, NoBWI] in
5111 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5113 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5116 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
/// rr_REV is a pattern-less MRMDestReg register form marked isCodeGenOnly and
/// ForceDisassemble, and tied to NAME#rr through FoldGenData.
/// NOTE(review): presumably NAME#rr (a different encoding of the register
/// form) is defined elsewhere in the file -- confirm.
/// mr truncates the X86pextrw result to i16 and stores it to i16mem.
5117 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5118 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5119 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5120 (ins VR128:$src1, u8imm:$src2),
5121 !strconcat(OpcodeStr,
5122 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5123 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5125 let hasSideEffects = 0, mayStore = 1 in
5126 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5127 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5128 !strconcat(OpcodeStr,
5129 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5130 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5131 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5134 let Predicates = [HasAVX, NoBWI] in
5135 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5137 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5140 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
/// Both forms select the generic extractelt node on a v4i32 source; rr writes
/// a GR32 register and mr stores to i32mem.
5141 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5142 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5143 (ins VR128:$src1, u8imm:$src2),
5144 !strconcat(OpcodeStr,
5145 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5147 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5148 Sched<[WriteVecExtract]>;
5149 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5150 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5151 !strconcat(OpcodeStr,
5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5153 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5154 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VPEXTRD shares opcode 0x16 with VPEXTRQ, which adds VEX_W, so it is not
// marked VEX_WIG.
5157 let Predicates = [HasAVX, NoDQI] in
5158 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5160 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5162 /// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
/// Both forms select the generic extractelt node on a v2i64 source; rr writes
/// a GR64 register and mr stores to i64mem.
5163 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5164 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5165 (ins VR128:$src1, u8imm:$src2),
5166 !strconcat(OpcodeStr,
5167 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5169 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5170 Sched<[WriteVecExtract]>;
5171 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5172 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5173 !strconcat(OpcodeStr,
5174 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5175 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5176 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// Same opcode (0x16) as the 32-bit extract; the 64-bit forms add VEX_W (VEX)
// or REX_W (legacy).
5179 let Predicates = [HasAVX, NoDQI] in
5180 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5182 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5184 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// The v4f32 source is bitcast to v4i32 (bc_v4i32) before extractelt, so the
/// patterns extract the raw 32-bit element. rr writes a GR32orGR64 register
/// and mr stores to f32mem.
5186 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5187 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5188 (ins VR128:$src1, u8imm:$src2),
5189 !strconcat(OpcodeStr,
5190 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5191 [(set GR32orGR64:$dst,
5192 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5193 Sched<[WriteVecExtract]>;
5194 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5195 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5196 !strconcat(OpcodeStr,
5197 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5198 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5199 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5202 let ExeDomain = SSEPackedSingle in {
5203 let Predicates = [UseAVX] in
5204 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5205 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5208 //===----------------------------------------------------------------------===//
5209 // SSE4.1 - Insert Instructions
5210 //===----------------------------------------------------------------------===//
/// SS41I_insert8 - SSE 4.1 insert of an 8-bit value into a VR128 vector.
/// rr takes the value from a GR32orGR64 register; rm takes it from i8mem via
/// extloadi8. Both select X86pinsrb with the element index in $src3.
/// Two assembly strings are provided for the 2- and 3-operand syntaxes.
/// NOTE(review): presumably chosen by Is2Addr; the selecting line is not
/// visible in this listing -- confirm.
5212 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5213 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5214 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5216 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5218 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5220 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5221 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5222 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5223 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5225 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5227 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5229 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5230 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// VEX form passes Is2Addr = 0; the legacy form ties $src1 to $dst.
5233 let Predicates = [HasAVX, NoBWI] in
5234 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5235 let Constraints = "$src1 = $dst" in
5236 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
/// SS41I_insert32 - SSE 4.1 insert of a 32-bit value into a v4i32 vector.
/// rr takes the value from GR32; rm loads it from i32mem with loadi32. Both
/// select the generic insertelt node with the element index in $src3.
5238 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5239 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5240 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5242 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5244 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5246 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5247 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5248 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5249 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5251 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5253 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5255 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5256 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// VPINSRD shares opcode 0x22 with VPINSRQ, which adds VEX_W, so it is not
// marked VEX_WIG.
5259 let Predicates = [HasAVX, NoDQI] in
5260 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5261 let Constraints = "$src1 = $dst" in
5262 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
/// SS41I_insert64 - SSE 4.1 insert of a 64-bit value into a v2i64 vector.
/// rr takes the value from GR64; rm loads it from i64mem with loadi64. Both
/// select the generic insertelt node with the element index in $src3.
5264 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5265 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5266 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5268 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5270 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5272 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5273 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5274 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5275 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5277 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5279 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5281 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5282 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// Same opcode (0x22) as the 32-bit insert; the 64-bit forms add VEX_W (VEX)
// or REX_W (legacy).
5285 let Predicates = [HasAVX, NoDQI] in
5286 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5287 let Constraints = "$src1 = $dst" in
5288 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5290 // insertps has a few different modes, there's the first two here below which
5291 // are optimized inserts that won't zero arbitrary elements in the destination
5292 // vector. The next one matches the intrinsic and could zero arbitrary elements
5293 // in the target vector.
// Both forms select X86insertps. The rr form is marked isCommutable; the rm
// form wraps a loadf32 in scalar_to_vector to build the v4f32 second operand.
5294 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5295 let isCommutable = 1 in
5296 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5297 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5299 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5301 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5303 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
5304 Sched<[SchedWriteFShuffle.XMM]>;
5305 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5306 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5308 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5310 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5312 (X86insertps VR128:$src1,
5313 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5315 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5318 let ExeDomain = SSEPackedSingle in {
5319 let Predicates = [UseAVX] in
5320 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5322 let Constraints = "$src1 = $dst" in
5323 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5326 //===----------------------------------------------------------------------===//
5327 // SSE4.1 - Round Instructions
5328 //===----------------------------------------------------------------------===//
/// sse41_fp_unop_p - Packed unary FP operation taking an immediate.
/// Defines a register form (r) and a memory form (m). Both patterns apply
/// OpNode to the source and the immediate $src2 and type the result as VT;
/// the memory form reads its source through mem_frag.
5330 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5331 X86MemOperand x86memop, RegisterClass RC,
5332 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5333 X86FoldableSchedWrite sched> {
5334 // Register form: source in RC:$src1, immediate control in $src2.
5335 // Vector intrinsic operation, reg
5336 def r : SS4AIi8<opc, MRMSrcReg,
5337 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5338 !strconcat(OpcodeStr,
5339 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5340 [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
5343 // Vector intrinsic operation, mem
5344 def m : SS4AIi8<opc, MRMSrcMem,
5345 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5346 !strconcat(OpcodeStr,
5347 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5349 (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
5350 Sched<[sched.Folded]>;
// avx_fp_unop_rm - three-operand scalar forms on FR32/FR64 registers.
// Defines SSr/SSm (opcode opcss, "ss" suffix) and SDr/SDm (opcode opcsd, "sd"
// suffix). Each takes $src1, $src2 (register or memory) and an immediate
// $src3. The defs carry empty pattern lists and are marked hasSideEffects = 0.
5353 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5354 string OpcodeStr, X86FoldableSchedWrite sched> {
5355 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5356 def SSr : SS4AIi8<opcss, MRMSrcReg,
5357 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5358 !strconcat(OpcodeStr,
5359 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5360 []>, Sched<[sched]>;
5363 def SSm : SS4AIi8<opcss, MRMSrcMem,
5364 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5365 !strconcat(OpcodeStr,
5366 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5367 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5368 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5370 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5371 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5372 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5373 !strconcat(OpcodeStr,
5374 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5375 []>, Sched<[sched]>;
5378 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5379 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5380 !strconcat(OpcodeStr,
5381 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5382 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5383 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_unop_s - two-operand scalar forms on FR32/FR64 registers.
// Defines SSr/SSm and SDr/SDm, each with a single source ($src1, register or
// memory) and an immediate $src2. As with the three-operand AVX variant, the
// defs have empty pattern lists and hasSideEffects = 0.
5386 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5387 string OpcodeStr, X86FoldableSchedWrite sched> {
5388 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5389 def SSr : SS4AIi8<opcss, MRMSrcReg,
5390 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5391 !strconcat(OpcodeStr,
5392 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5393 []>, Sched<[sched]>;
5396 def SSm : SS4AIi8<opcss, MRMSrcMem,
5397 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5398 !strconcat(OpcodeStr,
5399 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5400 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5401 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5403 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5404 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5405 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5406 !strconcat(OpcodeStr,
5407 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5408 []>, Sched<[sched]>;
5411 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5412 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5413 !strconcat(OpcodeStr,
5414 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5415 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5416 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_binop_s - scalar "_Int" forms that operate on whole VR128 values.
// Defines SSr_Int/SSm_Int (result type VT32) and SDr_Int/SDm_Int (result type
// VT64). Each selects OpNode applied to $src1, $src2 and the immediate $src3.
// The memory forms take ssmem/sdmem operands matched through
// sse_load_f32/sse_load_f64. All four defs are isCodeGenOnly = 1 and list both
// a two-operand and a three-operand assembly string.
5419 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5420 string OpcodeStr, X86FoldableSchedWrite sched,
5421 ValueType VT32, ValueType VT64,
5422 SDNode OpNode, bit Is2Addr = 1> {
5423 let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
5424 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5425 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5427 !strconcat(OpcodeStr,
5428 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5429 !strconcat(OpcodeStr,
5430 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5431 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
5434 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5435 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5437 !strconcat(OpcodeStr,
5438 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5439 !strconcat(OpcodeStr,
5440 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5442 (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
5443 Sched<[sched.Folded, sched.ReadAfterFold]>;
5444 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5446 let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
5447 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5448 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5450 !strconcat(OpcodeStr,
5451 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5452 !strconcat(OpcodeStr,
5453 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5454 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
5457 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5458 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5460 !strconcat(OpcodeStr,
5461 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5462 !strconcat(OpcodeStr,
5463 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5465 (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
5466 Sched<[sched.Folded, sched.ReadAfterFold]>;
5467 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5470 // FP round - roundss, roundps, roundsd, roundpd
//
// AVX encodings. The packed 128-bit and 256-bit forms are guarded by
// [HasAVX, NoVLX] and select X86VRndScale using the load* fragments.
5471 let Predicates = [HasAVX, NoVLX] in {
5472 let ExeDomain = SSEPackedSingle in {
5474 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5475 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
5477 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5478 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
5479 VEX, VEX_L, VEX_WIG;
5482 let ExeDomain = SSEPackedDouble in {
5483 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5484 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
5486 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5487 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
5488 VEX, VEX_L, VEX_WIG;
// Scalar AVX forms. VROUND is instantiated twice with the same opcodes: once
// for the VR128 "_Int" variants (X86RndScales, Is2Addr = 0) and once for the
// pattern-less FR32/FR64 variants.
5491 let Predicates = [UseAVX] in {
5492 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5493 v4f32, v2f64, X86RndScales, 0>,
5494 VEX_4V, VEX_LIG, VEX_WIG;
5495 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5496 VEX_4V, VEX_LIG, VEX_WIG;
// Select X86VRndScale on scalar FP registers to VROUNDSSr/VROUNDSDr. The
// instruction's first source operand is supplied as IMPLICIT_DEF.
5499 let Predicates = [UseAVX] in {
5500 def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
5501 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
5502 def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
5503 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
// The load-folding forms (VROUNDSSm/VROUNDSDm) are only selected when
// OptForSize also holds.
5506 let Predicates = [UseAVX, OptForSize] in {
5507 def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
5508 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
5509 def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
5510 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
// Legacy SSE4.1 encodings. The packed forms use the memop* fragments for the
// memory operand rather than the load* fragments used by the AVX forms.
5513 let ExeDomain = SSEPackedSingle in
5514 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5515 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
5516 let ExeDomain = SSEPackedDouble in
5517 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5518 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
// Scalar FR32/FR64 forms (no patterns attached in the multiclass).
5520 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
// Scalar VR128 "_Int" forms; $src1 is tied to $dst.
5522 let Constraints = "$src1 = $dst" in
5523 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5524 v4f32, v2f64, X86RndScales>;
// Select X86VRndScale on scalar FP registers to ROUNDSSr/ROUNDSDr.
5526 let Predicates = [UseSSE41] in {
5527 def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
5528 (ROUNDSSr FR32:$src1, imm:$src2)>;
5529 def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
5530 (ROUNDSDr FR64:$src1, imm:$src2)>;
// The load-folding forms (ROUNDSSm/ROUNDSDm) are only selected when
// OptForSize also holds.
5533 let Predicates = [UseSSE41, OptForSize] in {
5534 def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
5535 (ROUNDSSm addr:$src1, imm:$src2)>;
5536 def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
5537 (ROUNDSDm addr:$src1, imm:$src2)>;
5540 //===----------------------------------------------------------------------===//
5541 // SSE4.1 - Packed Bit Test
5542 //===----------------------------------------------------------------------===//
5544 // ptest instruction: X86ISelLowering lowers to this, primarily from
5545 // the Intel intrinsic that corresponds to it.
//
// AVX forms for 128-bit (VR128) and 256-bit (VR256) operands. They have no
// register outputs; each pattern sets EFLAGS from X86ptest of $src1 and $src2.
5546 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5547 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5548 "vptest\t{$src2, $src1|$src1, $src2}",
5549 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5550 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5551 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5552 "vptest\t{$src2, $src1|$src1, $src2}",
5553 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5554 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5557 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5558 "vptest\t{$src2, $src1|$src1, $src2}",
5559 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5560 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5561 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5562 "vptest\t{$src2, $src1|$src1, $src2}",
5563 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5564 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5565 VEX, VEX_L, VEX_WIG;
// Legacy SSE4.1 ptest, 128-bit only. Same shape as the AVX forms: no register
// outputs, EFLAGS set from X86ptest. The memory form uses memopv2i64 where the
// AVX form uses loadv2i64.
5568 let Defs = [EFLAGS] in {
5569 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5570 "ptest\t{$src2, $src1|$src1, $src2}",
5571 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5572 Sched<[SchedWriteVecTest.XMM]>;
5573 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5574 "ptest\t{$src2, $src1|$src1, $src2}",
5575 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5576 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5579 // The bit test instructions below are AVX only
//
// avx_bittest - rr/rm forms with no register outputs. Each pattern sets EFLAGS
// from X86testp of $src1 and $src2, where $src2 is either a register of type
// vt or a memory operand matched through mem_frag. Both defs are VEX-encoded.
5580 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5581 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5582 X86FoldableSchedWrite sched> {
5583 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5584 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5585 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5586 Sched<[sched]>, VEX;
5587 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5588 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5589 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5590 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
// vtestps/vtestpd for 128-bit and 256-bit vectors. The 256-bit (Y) variants
// additionally carry VEX_L.
5593 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5594 let ExeDomain = SSEPackedSingle in {
5595 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5596 SchedWriteFTest.XMM>;
5597 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5598 SchedWriteFTest.YMM>, VEX_L;
5600 let ExeDomain = SSEPackedDouble in {
5601 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5602 SchedWriteFTest.XMM>;
5603 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5604 SchedWriteFTest.YMM>, VEX_L;
5608 //===----------------------------------------------------------------------===//
5609 // SSE4.1 - Misc Instructions
5610 //===----------------------------------------------------------------------===//
// popcnt for 16-, 32- and 64-bit general-purpose registers, each with a
// register-source (rr) and memory-source (rm) form. Every pattern selects
// ctpop and lists EFLAGS as an implicit result. The 64-bit forms use the RI
// class; the 16/32-bit forms use I with OpSize16/OpSize32.
5612 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5613 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5614 "popcnt{w}\t{$src, $dst|$dst, $src}",
5615 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5616 Sched<[WritePOPCNT]>, OpSize16, XS;
5617 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5618 "popcnt{w}\t{$src, $dst|$dst, $src}",
5619 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5620 (implicit EFLAGS)]>,
5621 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5623 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5624 "popcnt{l}\t{$src, $dst|$dst, $src}",
5625 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5626 Sched<[WritePOPCNT]>, OpSize32, XS;
5628 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5629 "popcnt{l}\t{$src, $dst|$dst, $src}",
5630 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5631 (implicit EFLAGS)]>,
5632 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5634 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5635 "popcnt{q}\t{$src, $dst|$dst, $src}",
5636 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5637 Sched<[WritePOPCNT]>, XS;
5638 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5639 "popcnt{q}\t{$src, $dst|$dst, $src}",
5640 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5641 (implicit EFLAGS)]>,
5642 Sched<[WritePOPCNT.Folded]>, XS;
5645 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// The rr form applies OpNode to a v8i16 register; the rm form applies it to a
// value loaded through ld_frag. Both produce a v8i16 result in VR128.
5646 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5647 SDNode OpNode, PatFrag ld_frag,
5648 X86FoldableSchedWrite Sched> {
5649 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5651 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5652 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5654 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5656 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5658 (v8i16 (OpNode (ld_frag addr:$src))))]>,
5659 Sched<[Sched.Folded]>;
5662 // PHMIN has the same profile as PSAD, thus we use the same scheduling
5663 // model, although the naming is misleading.
// NOTE(review): VPHMINPOSUW below is given the WritePHMINPOS scheduling class,
// so the PSAD remark above may be stale - confirm.
5664 let Predicates = [HasAVX] in
5665 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5667 WritePHMINPOS>, VEX, VEX_WIG;
5668 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5672 /// SS48I_binop_rm - Simple SSE41 binary operator.
// The rr form takes both sources in registers of class RC and is marked
// commutable; the rm form takes $src2 from memory through memop_frag. Both
// select OpNode with result type OpVT and list a two-operand and a
// three-operand assembly string.
5673 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5674 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5675 X86MemOperand x86memop, X86FoldableSchedWrite sched,
5677 let isCommutable = 1 in
5678 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5679 (ins RC:$src1, RC:$src2),
5681 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5682 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5683 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5685 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5686 (ins RC:$src1, x86memop:$src2),
5688 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5689 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5691 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5692 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX 128-bit integer min/max and pmuldq. The 32-bit element forms and
// vpmuldq are guarded by [HasAVX, NoVLX]; the 8- and 16-bit element forms by
// [HasAVX, NoVLX_Or_NoBWI]. All pass Is2Addr = 0 and use the `load` fragment.
5695 let Predicates = [HasAVX, NoVLX] in {
5696 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5697 load, i128mem, SchedWriteVecALU.XMM, 0>,
5699 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5700 load, i128mem, SchedWriteVecALU.XMM, 0>,
5702 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5703 load, i128mem, SchedWriteVecALU.XMM, 0>,
5705 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5706 load, i128mem, SchedWriteVecALU.XMM, 0>,
5708 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5709 load, i128mem, SchedWriteVecIMul.XMM, 0>,
5712 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5713 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5714 load, i128mem, SchedWriteVecALU.XMM, 0>,
5716 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5717 load, i128mem, SchedWriteVecALU.XMM, 0>,
5719 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5720 load, i128mem, SchedWriteVecALU.XMM, 0>,
5722 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5723 load, i128mem, SchedWriteVecALU.XMM, 0>,
// AVX2 256-bit counterparts of the 128-bit min/max and pmuldq definitions:
// same opcodes and mnemonics, VR256/i256mem operands, YMM scheduling classes,
// and VEX_L added to the encoding. The predicate split mirrors the 128-bit
// case (NoVLX for 32-bit elements, NoVLX_Or_NoBWI for 8/16-bit elements).
5727 let Predicates = [HasAVX2, NoVLX] in {
5728 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5729 load, i256mem, SchedWriteVecALU.YMM, 0>,
5730 VEX_4V, VEX_L, VEX_WIG;
5731 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5732 load, i256mem, SchedWriteVecALU.YMM, 0>,
5733 VEX_4V, VEX_L, VEX_WIG;
5734 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5735 load, i256mem, SchedWriteVecALU.YMM, 0>,
5736 VEX_4V, VEX_L, VEX_WIG;
5737 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5738 load, i256mem, SchedWriteVecALU.YMM, 0>,
5739 VEX_4V, VEX_L, VEX_WIG;
5740 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5741 load, i256mem, SchedWriteVecIMul.YMM, 0>,
5742 VEX_4V, VEX_L, VEX_WIG;
5744 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5745 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5746 load, i256mem, SchedWriteVecALU.YMM, 0>,
5747 VEX_4V, VEX_L, VEX_WIG;
5748 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5749 load, i256mem, SchedWriteVecALU.YMM, 0>,
5750 VEX_4V, VEX_L, VEX_WIG;
5751 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5752 load, i256mem, SchedWriteVecALU.YMM, 0>,
5753 VEX_4V, VEX_L, VEX_WIG;
5754 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5755 load, i256mem, SchedWriteVecALU.YMM, 0>,
5756 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 integer min/max and pmuldq. These are two-address forms
// ($src1 tied to $dst, Is2Addr = 1) and use the `memop` fragment for the
// memory operand.
5759 let Constraints = "$src1 = $dst" in {
5760 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5761 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5762 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5763 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5764 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5765 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5766 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5767 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5768 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5769 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5770 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5771 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5772 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5773 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5774 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5775 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5776 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5777 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
// AVX/AVX2 vpmulld and vpcmpeqq. vpmulld is additionally guarded by NoVLX,
// whereas vpcmpeqq is guarded only by HasAVX/HasAVX2. vpmulld uses the
// dedicated SchedWritePMULLD scheduling classes.
5780 let Predicates = [HasAVX, NoVLX] in
5781 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5782 load, i128mem, SchedWritePMULLD.XMM, 0>,
5784 let Predicates = [HasAVX] in
5785 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5786 load, i128mem, SchedWriteVecALU.XMM, 0>,
5789 let Predicates = [HasAVX2, NoVLX] in
5790 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5791 load, i256mem, SchedWritePMULLD.YMM, 0>,
5792 VEX_4V, VEX_L, VEX_WIG;
5793 let Predicates = [HasAVX2] in
5794 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5795 load, i256mem, SchedWriteVecALU.YMM, 0>,
5796 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 pmulld and pcmpeqq: two-address forms with $src1 tied to $dst.
5798 let Constraints = "$src1 = $dst" in {
5799 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5800 memop, i128mem, SchedWritePMULLD.XMM, 1>;
5801 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5802 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5805 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// This variant selects an Intrinsic (IntId) directly. The rri form takes both
// sources in registers and is marked commutable by default (instantiations may
// override this with an enclosing `let isCommutable = 0`); the rmi form takes
// $src2 from memory through memop_frag.
5806 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5807 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5808 X86MemOperand x86memop, bit Is2Addr,
5809 X86FoldableSchedWrite sched> {
5810 let isCommutable = 1 in
5811 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5812 (ins RC:$src1, RC:$src2, u8imm:$src3),
5814 !strconcat(OpcodeStr,
5815 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5816 !strconcat(OpcodeStr,
5817 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5818 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
5820 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5821 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5823 !strconcat(OpcodeStr,
5824 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5825 !strconcat(OpcodeStr,
5826 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5828 (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
5829 Sched<[sched.Folded, sched.ReadAfterFold]>;
5832 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
// Same shape as the intrinsic-based variant, but selects an SDNode (OpNode)
// and wraps the result in the explicit value type OpVT.
5833 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5834 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5835 X86MemOperand x86memop, bit Is2Addr,
5836 X86FoldableSchedWrite sched> {
5837 let isCommutable = 1 in
5838 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5839 (ins RC:$src1, RC:$src2, u8imm:$src3),
5841 !strconcat(OpcodeStr,
5842 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5843 !strconcat(OpcodeStr,
5844 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5845 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
5847 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5848 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5850 !strconcat(OpcodeStr,
5851 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5852 !strconcat(OpcodeStr,
5853 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5855 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
5856 Sched<[sched.Folded, sched.ReadAfterFold]>;
// BlendCommuteImmN - immediate transforms used when the two blend sources are
// swapped. Each masks the immediate to its low N bits (2, 4 or 8) and then
// inverts those bits by XOR-ing with the same all-ones mask.
5859 def BlendCommuteImm2 : SDNodeXForm<imm, [{
5860 uint8_t Imm = N->getZExtValue() & 0x03;
5861 return getI8Imm(Imm ^ 0x03, SDLoc(N));
5864 def BlendCommuteImm4 : SDNodeXForm<imm, [{
5865 uint8_t Imm = N->getZExtValue() & 0x0f;
5866 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5869 def BlendCommuteImm8 : SDNodeXForm<imm, [{
5870 uint8_t Imm = N->getZExtValue() & 0xff;
5871 return getI8Imm(Imm ^ 0xff, SDLoc(N));
// The BlendScaleImm* transforms widen a blend immediate for an instruction
// with narrower elements. Each loop visits the input bit positions and ORs a
// run of set bits (0x3 or 0xf) into NewImm at the scaled position for element
// i - presumably only when bit i of Imm is set; confirm against the full
// definition.
5874 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
5875 def BlendScaleImm4 : SDNodeXForm<imm, [{
5876 uint8_t Imm = N->getZExtValue();
5878 for (unsigned i = 0; i != 4; ++i) {
5880 NewImm |= 0x3 << (i * 2);
5882 return getI8Imm(NewImm, SDLoc(N));
5885 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
5886 def BlendScaleImm2 : SDNodeXForm<imm, [{
5887 uint8_t Imm = N->getZExtValue();
5889 for (unsigned i = 0; i != 2; ++i) {
5891 NewImm |= 0xf << (i * 4);
5893 return getI8Imm(NewImm, SDLoc(N));
5896 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
5897 def BlendScaleImm2to4 : SDNodeXForm<imm, [{
5898 uint8_t Imm = N->getZExtValue();
5900 for (unsigned i = 0; i != 2; ++i) {
5902 NewImm |= 0x3 << (i * 2);
5904 return getI8Imm(NewImm, SDLoc(N));
// The BlendScaleCommuteImm* transforms perform the same widening as the
// BlendScaleImm* transforms and then invert the widened mask before returning
// it (XOR with 0xff for the 8-bit results, 0xf for the 4-bit result), for use
// when the blend sources are swapped.
5907 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
5908 def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
5909 uint8_t Imm = N->getZExtValue();
5911 for (unsigned i = 0; i != 4; ++i) {
5913 NewImm |= 0x3 << (i * 2);
5915 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5918 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
5919 def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
5920 uint8_t Imm = N->getZExtValue();
5922 for (unsigned i = 0; i != 2; ++i) {
5924 NewImm |= 0xf << (i * 4);
5926 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5929 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
5930 def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
5931 uint8_t Imm = N->getZExtValue();
5933 for (unsigned i = 0; i != 2; ++i) {
5935 NewImm |= 0x3 << (i * 2);
5937 return getI8Imm(NewImm ^ 0xf, SDLoc(N));
// AVX forms of mpsadbw, dpps and dppd, selected from their intrinsics.
// vmpsadbw is explicitly marked non-commutable, overriding the multiclass
// default. All are three-operand forms (Is2Addr = 0).
5940 let Predicates = [HasAVX] in {
5941 let isCommutable = 0 in {
5942 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
5943 VR128, load, i128mem, 0,
5944 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
5947 let ExeDomain = SSEPackedSingle in
5948 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
5949 VR128, load, f128mem, 0,
5950 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
5951 let ExeDomain = SSEPackedDouble in
5952 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
5953 VR128, load, f128mem, 0,
5954 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
5955 let ExeDomain = SSEPackedSingle in
5956 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
5957 VR256, load, i256mem, 0,
5958 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
// 256-bit vmpsadbw requires AVX2 and uses the AVX2 intrinsic.
5961 let Predicates = [HasAVX2] in {
5962 let isCommutable = 0 in {
5963 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
5964 VR256, load, i256mem, 0,
5965 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 forms of mpsadbw, dpps and dppd: two-address ($src1 tied to
// $dst, Is2Addr = 1), using the `memop` fragment. mpsadbw is again marked
// non-commutable.
5969 let Constraints = "$src1 = $dst" in {
5970 let isCommutable = 0 in {
5971 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
5972 VR128, memop, i128mem, 1,
5973 SchedWriteMPSAD.XMM>;
5976 let ExeDomain = SSEPackedSingle in
5977 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
5978 VR128, memop, f128mem, 1,
5979 SchedWriteDPPS.XMM>;
5980 let ExeDomain = SSEPackedDouble in
5981 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
5982 VR128, memop, f128mem, 1,
5983 SchedWriteDPPD.XMM>;
5986 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
// Unlike the plain binop multiclasses, this one sets the execution domain (d)
// and the two-address constraint itself, based on Is2Addr. It defines rri and
// rmi forms selecting OpNode with result type OpVT, plus an extra pattern for
// the case where the load appears as the first blend source: the operands are
// swapped into the rmi form and the immediate is rewritten with commuteXForm.
5987 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5988 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5989 X86MemOperand x86memop, bit Is2Addr, Domain d,
5990 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
5991 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
5992 let isCommutable = 1 in
5993 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5994 (ins RC:$src1, RC:$src2, u8imm:$src3),
5996 !strconcat(OpcodeStr,
5997 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5998 !strconcat(OpcodeStr,
5999 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6000 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
6002 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6003 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6005 !strconcat(OpcodeStr,
6006 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6007 !strconcat(OpcodeStr,
6008 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6010 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
6011 Sched<[sched.Folded, sched.ReadAfterFold]>;
6014 // Pattern to commute if load is in first source.
6015 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
6016 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6017 (commuteXForm imm:$src3))>;
// AVX immediate blends. The commute transform passed to each instantiation
// matches the element count of the vector type: BlendCommuteImm2 for v2f64,
// BlendCommuteImm4 for v4f32/v4f64, BlendCommuteImm8 for v8f32/v8i16.
6020 let Predicates = [HasAVX] in {
6021 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6022 VR128, load, f128mem, 0, SSEPackedSingle,
6023 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6025 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6026 VR256, load, f256mem, 0, SSEPackedSingle,
6027 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6028 VEX_4V, VEX_L, VEX_WIG;
6029 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6030 VR128, load, f128mem, 0, SSEPackedDouble,
6031 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6033 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6034 VR256, load, f256mem, 0, SSEPackedDouble,
6035 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6036 VEX_4V, VEX_L, VEX_WIG;
6037 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6038 VR128, load, i128mem, 0, SSEPackedInt,
6039 SchedWriteBlend.XMM, BlendCommuteImm8>,
// The 256-bit vpblendw (v16i16) requires AVX2 and still uses an 8-bit commute
// transform.
6043 let Predicates = [HasAVX2] in {
6044 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6045 VR256, load, i256mem, 0, SSEPackedInt,
6046 SchedWriteBlend.YMM, BlendCommuteImm8>,
6047 VEX_4V, VEX_L, VEX_WIG;
6050 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6051 // ExecutionDomainFixPass will cleanup domains later on.
//
// Each element type gets three patterns: register/register, load in the
// second source, and load in the first source (operands swapped, immediate
// rewritten with the matching commute transform).
6052 let Predicates = [HasAVX1Only] in {
// 256-bit v4i64 blends use vblendpd with the immediate unchanged.
6053 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
6054 (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
6055 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
6056 (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
6057 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
6058 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
6060 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6061 // it from becoming movsd via commuting under optsize.
6062 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
6063 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
6064 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
6065 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
6066 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
6067 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
// 256-bit v8i32 blends use vblendps with the immediate unchanged.
6069 def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
6070 (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
6071 def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
6072 (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
6073 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
6074 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
6076 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6077 // it from becoming movss via commuting under optsize.
6078 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
6079 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
6080 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
6081 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
6082 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
6083 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
// Legacy SSE4.1 immediate blends: two-address forms (Is2Addr = 1) using the
// memop fragment, with the commute transform sized to the element count.
6086 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6087 VR128, memop, f128mem, 1, SSEPackedSingle,
6088 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6089 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6090 VR128, memop, f128mem, 1, SSEPackedDouble,
6091 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6092 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6093 VR128, memop, i128mem, 1, SSEPackedInt,
6094 SchedWriteBlend.XMM, BlendCommuteImm8>;
6096 let Predicates = [UseSSE41] in {
6097 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6098 // it from becoming movsd/movss via commuting under optsize.
// v2i64 blends: the 2-bit immediate is scaled to pblendw's 8-bit immediate.
6099 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
6100 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
6101 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
6102 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
6103 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
6104 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
// v4i32 blends: the 4-bit immediate is scaled to pblendw's 8-bit immediate.
6106 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
6107 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
6108 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
6109 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
6110 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
6111 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
6114 // For insertion into the zero index (low half) of a 256-bit vector, it is
6115 // more efficient to generate a blend with immediate instead of an insert*128.
//
// The 128-bit value is first placed in a 256-bit register via INSERT_SUBREG on
// an IMPLICIT_DEF. Register forms blend with immediates 0x3 (pd) / 0xf (ps);
// load forms put the inserted value in the first source and use 0xc / 0xf0.
// Presumably these masks take the low lanes from the inserted subvector and
// the high lanes from the other source - confirm against the blend immediate
// semantics.
6116 let Predicates = [HasAVX] in {
6117 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6118 (VBLENDPDYrri VR256:$src1,
6119 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6120 VR128:$src2, sub_xmm), 0x3)>;
6121 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6122 (VBLENDPSYrri VR256:$src1,
6123 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6124 VR128:$src2, sub_xmm), 0xf)>;
6126 def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6127 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6128 VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6129 def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6130 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6131 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6134 /// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
/// Defines a register form (rr) and a memory form (rm). Each has one result
/// ($dst) and three sources; in rm the second source is a memory operand read
/// through mem_frag. Both selection patterns hand the sources to OpNode in
/// reverse order: (OpNode $src3, $src2, $src1).
6135 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6136 X86MemOperand x86memop, ValueType VT,
6137 PatFrag mem_frag, SDNode OpNode,
6138 X86FoldableSchedWrite sched> {
6139 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6140 (ins RC:$src1, RC:$src2, RC:$src3),
6141 !strconcat(OpcodeStr,
6142 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6143 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6144 SSEPackedInt>, TAPD, VEX_4V,
6147 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6148 (ins RC:$src1, x86memop:$src2, RC:$src3),
6149 !strconcat(OpcodeStr,
6150 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6152 (OpNode RC:$src3, (mem_frag addr:$src2),
6153 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6154 Sched<[sched.Folded, sched.ReadAfterFold,
6156 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6159 sched.ReadAfterFold]>;
// AVX variable blends built from SS41I_quaternary_avx, all selecting X86Blendv.
// vblendvpd (0x4B) and vblendvps (0x4A) get 128-bit and 256-bit (VEX_L) forms
// under HasAVX, each in its own execution domain. vpblendvb (0x4C) gets a
// 128-bit form under HasAVX and a 256-bit form under HasAVX2.
6162 let Predicates = [HasAVX] in {
6163 let ExeDomain = SSEPackedDouble in {
6164 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6165 v2f64, loadv2f64, X86Blendv,
6166 SchedWriteFVarBlend.XMM>;
6167 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6168 v4f64, loadv4f64, X86Blendv,
6169 SchedWriteFVarBlend.YMM>, VEX_L;
6170 } // ExeDomain = SSEPackedDouble
6171 let ExeDomain = SSEPackedSingle in {
6172 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6173 v4f32, loadv4f32, X86Blendv,
6174 SchedWriteFVarBlend.XMM>;
6175 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6176 v8f32, loadv8f32, X86Blendv,
6177 SchedWriteFVarBlend.YMM>, VEX_L;
6178 } // ExeDomain = SSEPackedSingle
6179 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6180 v16i8, loadv16i8, X86Blendv,
6181 SchedWriteVarBlend.XMM>;
6184 let Predicates = [HasAVX2] in {
6185 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6186 v32i8, loadv32i8, X86Blendv,
6187 SchedWriteVarBlend.YMM>, VEX_L;
// Select X86Blendv on 32-bit and 64-bit integer element vectors with the
// floating-point variable blends: v4i32/v8i32 use vblendvps and v2i64/v4i64 use
// vblendvpd. The mask is the first X86Blendv operand but the last instruction
// operand, and $src1/$src2 swap places, matching the reversed operand order
// used by the instruction definitions.
6190 let Predicates = [HasAVX] in {
6191 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6192 (v4i32 VR128:$src2))),
6193 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6194 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6195 (v2i64 VR128:$src2))),
6196 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6197 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6198 (v8i32 VR256:$src2))),
6199 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6200 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6201 (v4i64 VR256:$src2))),
6202 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6205 // Prefer a movss or movsd over a blendps when optimizing for size. These were
6206 // changed to use blends because blends have better throughput on Sandy Bridge
6207 // and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The blend patterns below are therefore only enabled under OptForSpeed.
6208 let Predicates = [HasAVX, OptForSpeed] in {
// Zero-extending move of the low element: blend the source into a zeroed
// register (V_SET0).
6209 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6210 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6211 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6212 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
// X86Movss/X86Movsd as blends. When the load is the first operand it is folded
// as the blend's memory source and the complementary immediate is used (0xe
// instead of 1 for vblendps, 2 instead of 1 for vblendpd).
6214 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6215 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6216 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6217 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6218 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6219 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6221 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6222 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6223 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6224 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6225 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6226 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6228 // Move low f32 and clear high bits.
// The 256-bit forms run the 128-bit blend on the low xmm subregister and wrap
// the result with SUBREG_TO_REG.
6229 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6230 (SUBREG_TO_REG (i32 0),
6231 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6232 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6233 (i8 1))), sub_xmm)>;
6234 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6235 (SUBREG_TO_REG (i32 0),
6236 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6237 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6238 (i8 3))), sub_xmm)>;
6241 // Prefer a movss or movsd over a blendps when optimizing for size. These were
6242 // changed to use blends because blends have better throughput on Sandy Bridge
6243 // and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The blend patterns below are therefore only enabled under OptForSpeed.
6244 let Predicates = [UseSSE41, OptForSpeed] in {
6245 // With SSE41 we can use blends for these patterns.
6246 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6247 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6248 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6249 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
// X86Movss/X86Movsd as blends. When the load is the first operand it is folded
// as the blend's memory source and the complementary immediate is used (0xe
// instead of 1 for blendps, 2 instead of 1 for blendpd).
6251 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6252 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6253 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6254 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6255 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6256 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6258 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6259 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6260 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6261 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6262 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6263 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6267 /// SS41I_ternary - SSE 4.1 ternary operator
/// The third operand is the implicit XMM0 register (Uses = [XMM0]) and the
/// destination is tied to $src1. Defines rr0 (register source) and rm0 (memory
/// source read through mem_frag). The selection patterns pass XMM0 as the first
/// OpNode operand, followed by $src2 and then $src1.
6268 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6269 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6270 PatFrag mem_frag, X86MemOperand x86memop,
6271 SDNode OpNode, X86FoldableSchedWrite sched> {
6272 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6273 (ins VR128:$src1, VR128:$src2),
6274 !strconcat(OpcodeStr,
6275 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6277 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6280 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6281 (ins VR128:$src1, x86memop:$src2),
6282 !strconcat(OpcodeStr,
6283 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6285 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6286 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Legacy-encoded variable blends built from SS41I_ternary, all selecting
// X86Blendv: blendvpd (0x15), blendvps (0x14) and pblendvb (0x10).
6290 let ExeDomain = SSEPackedDouble in
6291 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6292 X86Blendv, SchedWriteFVarBlend.XMM>;
6293 let ExeDomain = SSEPackedSingle in
6294 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6295 X86Blendv, SchedWriteFVarBlend.XMM>;
6296 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6297 X86Blendv, SchedWriteVarBlend.XMM>;
6299 // Aliases with the implicit xmm0 argument
// These accept the two-operand spelling, in which xmm0 is not written, for
// both the register (rr0) and memory (rm0) forms.
6300 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6301 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6302 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6303 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6304 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6305 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6306 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6307 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6308 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6309 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6310 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6311 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
// Select X86Blendv on v4i32 and v2i64 with blendvps and blendvpd. The mask
// must be in XMM0, and $src1/$src2 swap places between the node and the
// instruction.
6313 let Predicates = [UseSSE41] in {
6314 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6315 (v4i32 VR128:$src2))),
6316 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6317 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6318 (v2i64 VR128:$src2))),
6319 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6322 let AddedComplexity = 400 in { // Prefer non-temporal versions
// The three movntdqa definitions (opcode 0x2A, memory source) carry empty
// selection patterns; they are selected through the alignednontemporalload
// patterns that follow.
6324 let Predicates = [HasAVX, NoVLX] in
6325 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6326 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6327 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6328 let Predicates = [HasAVX2, NoVLX] in
6329 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6330 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6331 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6332 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6333 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6334 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
// Every 256-bit vector type uses the same VMOVNTDQAYrm instruction.
6336 let Predicates = [HasAVX2, NoVLX] in {
6337 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6338 (VMOVNTDQAYrm addr:$src)>;
6339 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6340 (VMOVNTDQAYrm addr:$src)>;
6341 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6342 (VMOVNTDQAYrm addr:$src)>;
6343 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6344 (VMOVNTDQAYrm addr:$src)>;
6345 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6346 (VMOVNTDQAYrm addr:$src)>;
6347 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6348 (VMOVNTDQAYrm addr:$src)>;
// Every 128-bit vector type uses VMOVNTDQArm when AVX is available.
6351 let Predicates = [HasAVX, NoVLX] in {
6352 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6353 (VMOVNTDQArm addr:$src)>;
6354 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6355 (VMOVNTDQArm addr:$src)>;
6356 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6357 (VMOVNTDQArm addr:$src)>;
6358 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6359 (VMOVNTDQArm addr:$src)>;
6360 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6361 (VMOVNTDQArm addr:$src)>;
6362 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6363 (VMOVNTDQArm addr:$src)>;
// Legacy-encoded equivalent of the 128-bit patterns, gated on UseSSE41.
6366 let Predicates = [UseSSE41] in {
6367 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6368 (MOVNTDQArm addr:$src)>;
6369 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6370 (MOVNTDQArm addr:$src)>;
6371 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6372 (MOVNTDQArm addr:$src)>;
6373 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6374 (MOVNTDQArm addr:$src)>;
6375 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6376 (MOVNTDQArm addr:$src)>;
6377 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6378 (MOVNTDQArm addr:$src)>;
6381 } // AddedComplexity
6383 //===----------------------------------------------------------------------===//
6384 // SSE4.2 - Compare Instructions
6385 //===----------------------------------------------------------------------===//
6387 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
/// Defines rr (register source) and rm (memory source, read through
/// memop_frag) forms that select (OpVT (OpNode $src1, $src2)). Each form lists
/// a two-operand and a three-operand asm string.
/// NOTE(review): the lines that choose between the two strings are not visible
/// in this view; presumably a trailing bit parameter selects them -- confirm.
6388 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6389 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6390 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6392 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6393 (ins RC:$src1, RC:$src2),
6395 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6396 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6397 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6399 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6400 (ins RC:$src1, x86memop:$src2),
6402 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6403 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6405 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6406 Sched<[sched.Folded, sched.ReadAfterFold]>;
// pcmpgtq (opcode 0x37, X86pcmpgt): VEX 128-bit under HasAVX, VEX 256-bit
// under HasAVX2, and the legacy form with the destination tied to $src1. The
// VEX forms pass an explicit trailing 0 and use `load`; the legacy form omits
// that argument and uses `memop`.
6409 let Predicates = [HasAVX] in
6410 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6411 load, i128mem, SchedWriteVecALU.XMM, 0>,
6414 let Predicates = [HasAVX2] in
6415 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6416 load, i256mem, SchedWriteVecALU.YMM, 0>,
6417 VEX_4V, VEX_L, VEX_WIG;
6419 let Constraints = "$src1 = $dst" in
6420 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6421 memop, i128mem, SchedWriteVecALU.XMM>;
6423 //===----------------------------------------------------------------------===//
6424 // SSE4.2 - String/text Processing Instructions
6425 //===----------------------------------------------------------------------===//
// pcmpistrm_SS42AI - register (rr) and memory (rm) forms of opcode 0x62 with an
// 8-bit immediate. Neither form has explicit outputs or a selection pattern;
// the implicit definitions (XMM0 and EFLAGS) are declared with Defs at the
// instantiation site below.
6427 multiclass pcmpistrm_SS42AI<string asm> {
6428 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6429 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6430 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6431 []>, Sched<[WritePCmpIStrM]>;
6433 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6434 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6435 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6436 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
// VEX-encoded form under HasAVX, plus the legacy-encoded form.
6439 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6440 let Predicates = [HasAVX] in
6441 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6442 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
// SS42AI_pcmpestrm - register (rr) and memory (rm) forms of opcode 0x60 with an
// 8-bit immediate. Neither form has explicit outputs or a selection pattern.
// At the instantiation site, XMM0 and EFLAGS are implicit definitions and EAX
// and EDX are implicit uses.
// NOTE(review): the explicit operands are named $src1/$src3/$src5; presumably
// the skipped numbers stand for the implicit EAX/EDX inputs -- confirm.
6445 multiclass SS42AI_pcmpestrm<string asm> {
6446 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6447 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6448 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6449 []>, Sched<[WritePCmpEStrM]>;
6451 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6452 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6453 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6454 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
// VEX-encoded form under HasAVX, plus the legacy-encoded form.
6457 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6458 let Predicates = [HasAVX] in
6459 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6460 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
// SS42AI_pcmpistri - register (rr) and memory (rm) forms of opcode 0x63 with an
// 8-bit immediate. Neither form has explicit outputs or a selection pattern;
// the implicit definitions (ECX and EFLAGS) are declared with Defs at the
// instantiation site below.
6463 multiclass SS42AI_pcmpistri<string asm> {
6464 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6465 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6466 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6467 []>, Sched<[WritePCmpIStrI]>;
6469 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6470 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6471 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6472 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
// VEX-encoded form under HasAVX, plus the legacy-encoded form.
6475 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6476 let Predicates = [HasAVX] in
6477 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6478 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
// SS42AI_pcmpestri - register (rr) and memory (rm) forms of opcode 0x61 with an
// 8-bit immediate. Neither form has explicit outputs or a selection pattern.
// At the instantiation site, ECX and EFLAGS are implicit definitions and EAX
// and EDX are implicit uses.
// NOTE(review): the explicit operands are named $src1/$src3/$src5; presumably
// the skipped numbers stand for the implicit EAX/EDX inputs -- confirm.
6481 multiclass SS42AI_pcmpestri<string asm> {
6482 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6483 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6484 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6485 []>, Sched<[WritePCmpEStrI]>;
6487 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6488 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6489 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6490 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
// VEX-encoded form under HasAVX, plus the legacy-encoded form.
6493 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6494 let Predicates = [HasAVX] in
6495 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6496 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6499 //===----------------------------------------------------------------------===//
6500 // SSE4.2 - CRC Instructions
6501 //===----------------------------------------------------------------------===//
6503 // No CRC instructions have AVX equivalents
6505 // crc intrinsic instruction
6506 // These instructions come in register (r) and memory (m) source forms; the
// variants below differ only in the sizes of the destination and source.
// SS42I_crc32r - register-source form: $dst = Int($src1, $src2), where $src1
// and $dst share register class RCOut and $src2 is in RCIn.
6508 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6509 RegisterClass RCIn, SDPatternOperator Int> :
6510 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6511 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6512 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6513 Sched<[WriteCRC32]>;
// SS42I_crc32m - memory-source form: the second operand is loaded from
// x86memop.
6515 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6516 X86MemOperand x86memop, SDPatternOperator Int> :
6517 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6518 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6519 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6520 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
// All forms tie the destination to $src1. Opcode 0xF0 is the 8-bit source
// form; opcode 0xF1 covers 16-, 32- and 64-bit sources, distinguished by
// OpSize16, OpSize32 and REX_W.
6522 let Constraints = "$src1 = $dst" in {
6523 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6524 int_x86_sse42_crc32_32_8>;
6525 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6526 int_x86_sse42_crc32_32_8>;
6527 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6528 int_x86_sse42_crc32_32_16>, OpSize16;
6529 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6530 int_x86_sse42_crc32_32_16>, OpSize16;
6531 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6532 int_x86_sse42_crc32_32_32>, OpSize32;
6533 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6534 int_x86_sse42_crc32_32_32>, OpSize32;
6535 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6536 int_x86_sse42_crc32_64_64>, REX_W;
6537 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6538 int_x86_sse42_crc32_64_64>, REX_W;
// 64-bit destination with an 8-bit source, declared with hasSideEffects = 0.
6539 let hasSideEffects = 0 in {
6541 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6543 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6548 //===----------------------------------------------------------------------===//
6549 // SHA-NI Instructions
6550 //===----------------------------------------------------------------------===//
6552 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHAI_binop - register (rr) and memory (rm) forms of a two-source SHA
// instruction selected from intrinsic IntId. Each form lists two asm strings
// and two patterns: one that names xmm0 / passes XMM0 as a third intrinsic
// operand, and one that does not.
// NOTE(review): the lines that choose between the two variants are not visible
// in this view; presumably they are keyed on UsesXMM0 -- confirm.
6553 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6554 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6555 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6556 (ins VR128:$src1, VR128:$src2),
6558 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6559 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6561 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6562 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6565 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6566 (ins VR128:$src1, i128mem:$src2),
6568 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6569 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6571 (set VR128:$dst, (IntId VR128:$src1,
6572 (memop addr:$src2), XMM0)),
6573 (set VR128:$dst, (IntId VR128:$src1,
6574 (memop addr:$src2))))]>, T8,
6575 Sched<[sched.Folded, sched.ReadAfterFold]>;
// All SHA instructions require HasSHA and tie the destination to $src1.
// sha1rnds4 is defined directly because it also takes an 8-bit immediate.
6578 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6579 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6580 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6581 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6583 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6584 (i8 imm:$src3)))]>, TA,
6585 Sched<[SchedWriteVecIMul.XMM]>;
6586 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6587 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6588 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6590 (int_x86_sha1rnds4 VR128:$src1,
6592 (i8 imm:$src3)))]>, TA,
6593 Sched<[SchedWriteVecIMul.XMM.Folded,
6594 SchedWriteVecIMul.XMM.ReadAfterFold]>;
6596 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6597 SchedWriteVecIMul.XMM>;
6598 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6599 SchedWriteVecIMul.XMM>;
6600 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6601 SchedWriteVecIMul.XMM>;
// sha256rnds2 is the only instantiation that passes UsesXMM0 = 1.
6604 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6605 SchedWriteVecIMul.XMM, 1>;
6607 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6608 SchedWriteVecIMul.XMM>;
6609 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6610 SchedWriteVecIMul.XMM>;
6613 // Aliases that omit the %xmm0 operand (it is not written in the alias string)
6614 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6615 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6616 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6617 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6619 //===----------------------------------------------------------------------===//
6620 // AES-NI Instructions
6621 //===----------------------------------------------------------------------===//
// AESI_binop_rm_int - register (rr) and memory (rm) forms of a two-source AES
// instruction selected from intrinsic IntId. The defs pass an empty asm
// string; the real one is supplied through `let AsmString`, which appends the
// two-operand syntax when Is2Addr is set and the three-operand syntax
// otherwise. RC and MemOp default to VR128 and i128mem.
6623 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6624 Intrinsic IntId, PatFrag ld_frag,
6625 bit Is2Addr = 0, RegisterClass RC = VR128,
6626 X86MemOperand MemOp = i128mem> {
6627 let AsmString = OpcodeStr##
6628 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6629 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6630 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6631 (ins RC:$src1, RC:$src2), "",
6632 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6633 Sched<[WriteAESDecEnc]>;
6634 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6635 (ins RC:$src1, MemOp:$src2), "",
6636 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6637 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6641 // Perform One Round of an AES Encryption/Decryption Flow
// 128-bit VEX forms (three-operand syntax, `load` fragment).
6642 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6643 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
6644 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6645 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
6646 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6647 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
6648 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6649 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
6650 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
// 256-bit VEX forms (VR256/i256mem, VEX_L), using the *_256 intrinsics.
6653 let Predicates = [NoVLX, HasVAES] in {
6654 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
6655 int_x86_aesni_aesenc_256, load, 0, VR256,
6656 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6657 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
6658 int_x86_aesni_aesenclast_256, load, 0, VR256,
6659 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6660 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
6661 int_x86_aesni_aesdec_256, load, 0, VR256,
6662 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6663 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
6664 int_x86_aesni_aesdeclast_256, load, 0, VR256,
6665 i256mem>, VEX_4V, VEX_L, VEX_WIG;
// Legacy-encoded forms: destination tied to $src1, two-operand syntax
// (Is2Addr = 1), `memop` fragment.
6668 let Constraints = "$src1 = $dst" in {
6669 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
6670 int_x86_aesni_aesenc, memop, 1>;
6671 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
6672 int_x86_aesni_aesenclast, memop, 1>;
6673 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
6674 int_x86_aesni_aesdec, memop, 1>;
6675 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
6676 int_x86_aesni_aesdeclast, memop, 1>;
6679 // Perform the AES InvMixColumn Transformation
// Single-source instruction (opcode 0xDB) selected from int_x86_aesni_aesimc.
// The VEX forms require HasAVX and HasAES and use `load` for the memory form;
// the legacy forms that follow use `memop`.
6680 let Predicates = [HasAVX, HasAES] in {
6681 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6683 "vaesimc\t{$src1, $dst|$dst, $src1}",
6685 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6687 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6688 (ins i128mem:$src1),
6689 "vaesimc\t{$src1, $dst|$dst, $src1}",
6690 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6691 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6693 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6695 "aesimc\t{$src1, $dst|$dst, $src1}",
6697 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6698 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6699 (ins i128mem:$src1),
6700 "aesimc\t{$src1, $dst|$dst, $src1}",
6701 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6702 Sched<[WriteAESIMC.Folded]>;
6704 // AES Round Key Generation Assist
// One vector source plus an 8-bit immediate (opcode 0xDF), selected from
// int_x86_aesni_aeskeygenassist. The VEX forms require HasAVX and HasAES and
// use `load` for the memory form; the legacy forms that follow use `memop`.
6705 let Predicates = [HasAVX, HasAES] in {
6706 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6707 (ins VR128:$src1, u8imm:$src2),
6708 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6710 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
6711 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6712 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6713 (ins i128mem:$src1, u8imm:$src2),
6714 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6716 (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
6717 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6719 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6720 (ins VR128:$src1, u8imm:$src2),
6721 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6723 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
6724 Sched<[WriteAESKeyGen]>;
6725 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6726 (ins i128mem:$src1, u8imm:$src2),
6727 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6729 (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
6730 Sched<[WriteAESKeyGen.Folded]>;
6732 //===----------------------------------------------------------------------===//
6733 // PCLMUL Instructions
6734 //===----------------------------------------------------------------------===//
6736 // Immediate transform to help with commuting.
// Swaps the high and low nibbles of the 8-bit immediate, so the selector bits
// for the two sources trade places when the sources themselves are swapped.
6737 def PCLMULCommuteImm : SDNodeXForm<imm, [{
6738 uint8_t Imm = N->getZExtValue();
6739 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6742 // SSE carry-less Multiplication instructions
// Legacy-encoded pclmulqdq (opcode 0x44), selected from int_x86_pclmulqdq. The
// destination is tied to $src1 and the register form is marked commutable.
6743 let Predicates = [NoAVX, HasPCLMUL] in {
6744 let Constraints = "$src1 = $dst" in {
6745 let isCommutable = 1 in
6746 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6747 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6748 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6750 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
6751 Sched<[WriteCLMul]>;
6753 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6754 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6755 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6757 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6759 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6760 } // Constraints = "$src1 = $dst"
// A load in the first intrinsic operand is folded by swapping the sources and
// applying PCLMULCommuteImm to the immediate.
6762 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6764 (PCLMULQDQrm VR128:$src1, addr:$src2,
6765 (PCLMULCommuteImm imm:$src3))>;
6766 } // Predicates = [NoAVX, HasPCLMUL]
// Assembler aliases pclmul{hq,lq}{hq,lq}dq for the register and memory forms.
// The immediate is built from the mnemonic: bit 4 is set when LO is "hq" and
// bit 0 is set when HI is "hq".
6769 foreach HI = ["hq","lq"] in
6770 foreach LO = ["hq","lq"] in {
6771 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6772 (PCLMULQDQrr VR128:$dst, VR128:$src,
6773 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6774 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6775 (PCLMULQDQrm VR128:$dst, i128mem:$src,
6776 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6779 // AVX carry-less Multiplication instructions
// vpclmulqdq - VEX three-operand forms (opcode 0x44) over register class RC,
// selected from intrinsic IntId. The register form is marked commutable, and
// the memory form reads $src2 through LdFrag.
6780 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6781 PatFrag LdFrag, Intrinsic IntId> {
6782 let isCommutable = 1 in
6783 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6784 (ins RC:$src1, RC:$src2, u8imm:$src3),
6785 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6787 (IntId RC:$src1, RC:$src2, imm:$src3))]>,
6788 Sched<[WriteCLMul]>;
6790 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6791 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6792 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6794 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
6795 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6797 // We can commute a load in the first operand by swapping the sources and
6798 // rotating the immediate.
6799 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
6800 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6801 (PCLMULCommuteImm imm:$src3))>;
// 128-bit form for AVX + PCLMUL; 256-bit form (VEX_L) for VPCLMULQDQ.
6804 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6805 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6806 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6808 let Predicates = [NoVLX, HasVPCLMULQDQ] in
6809 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6810 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
// vpclmulqdq_aliases_impl - one vpclmul<Hi><Lo>dq assembler alias for the rr
// and rm forms of the instruction named by InstStr. The immediate is built
// from the mnemonic: bit 4 is set when Lo is "hq" and bit 0 is set when Hi is
// "hq".
6812 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6813 X86MemOperand MemOp, string Hi, string Lo> {
6814 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6815 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6816 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6817 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6818 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6819 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
// vpclmulqdq_aliases - expands all four hq/lq combinations for one instruction.
6822 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6823 X86MemOperand MemOp> {
6824 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6825 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6826 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6827 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
// Aliases for the 128-bit and 256-bit instructions.
6831 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6832 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6834 //===----------------------------------------------------------------------===//
6835 // SSE4A Instructions
6836 //===----------------------------------------------------------------------===//
// Everything in this section requires HasSSE4A.
6838 let Predicates = [HasSSE4A] in {
// extrq/insertq: the destination is tied to $src. The immediate forms (opcode
// 0x78) take $len and $idx as 8-bit immediates; the register forms (opcode
// 0x79) take a second vector register operand instead. extrq uses the PD
// prefix and insertq uses XD.
6840 let ExeDomain = SSEPackedInt in {
6841 let Constraints = "$src = $dst" in {
6842 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6843 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6844 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6845 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
6847 PD, Sched<[SchedWriteVecALU.XMM]>;
6848 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6849 (ins VR128:$src, VR128:$mask),
6850 "extrq\t{$mask, $src|$src, $mask}",
6851 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6853 PD, Sched<[SchedWriteVecALU.XMM]>;
6855 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6856 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6857 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6858 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6859 imm:$len, imm:$idx))]>,
6860 XD, Sched<[SchedWriteVecALU.XMM]>;
6861 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6862 (ins VR128:$src, VR128:$mask),
6863 "insertq\t{$mask, $src|$src, $mask}",
6864 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6866 XD, Sched<[SchedWriteVecALU.XMM]>;
6868 } // ExeDomain = SSEPackedInt
6870 // Non-temporal (unaligned) scalar stores.
// movntss/movntsd are defined with empty patterns and mayStore = 1. They are
// selected by the nontemporalstore patterns below, which first copy the
// FR32/FR64 value into VR128.
6871 let AddedComplexity = 400 in { // Prefer non-temporal versions
6872 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6873 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6874 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6876 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6877 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6880 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6881 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6883 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6884 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6886 } // AddedComplexity
6889 //===----------------------------------------------------------------------===//
6891 //===----------------------------------------------------------------------===//
6893 //===----------------------------------------------------------------------===//
6894 // VBROADCAST - Load from memory and broadcast to all elements of the
6895 // destination operand
// avx_broadcast_rm - memory-source form: selects X86VBroadcast of a value
// loaded through ld_frag, producing type VT in register class RC.
6897 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6898 X86MemOperand x86memop, ValueType VT,
6899 PatFrag ld_frag, SchedWrite Sched> :
6900 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6901 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6902 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
6903 Sched<[Sched]>, VEX;
6905 // AVX2 adds register forms
// avx2_broadcast_rr - register-source form: the source is always a VR128
// register of type OpVT, and the result has type ResVT in register class RC.
6906 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6907 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6908 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6909 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6910 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6911 Sched<[Sched]>, VEX;
// Memory-source forms: predicated on HasAVX (and NoVLX). The 256-bit (Y)
// records add VEX_L and use the same XMM folded scheduling class as the
// 128-bit record.
6913 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6914 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6915 f32mem, v4f32, loadf32,
6916 SchedWriteFShuffle.XMM.Folded>;
6917 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6918 f32mem, v8f32, loadf32,
6919 SchedWriteFShuffle.XMM.Folded>, VEX_L;
6921 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6922 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6924 SchedWriteFShuffle.XMM.Folded>, VEX_L;
// Register-source forms: predicated on HasAVX2 (and NoVLX). The source
// operand is a VR128 register even for the 256-bit results.
6926 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6927 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6928 v4f32, v4f32, SchedWriteFShuffle.XMM>;
6929 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6930 v8f32, v4f32, WriteFShuffle256>, VEX_L;
6932 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6933 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6934 v4f64, v2f64, WriteFShuffle256>, VEX_L;
6936 //===----------------------------------------------------------------------===//
6937 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
6938 // halves of a 256-bit vector.
// Both records are defined with an empty pattern list, so mayLoad and
// hasSideEffects are given explicitly. VBROADCASTI128 is predicated on
// HasAVX2, VBROADCASTF128 on HasAVX. VBROADCASTF128 is selected by the
// X86SubVBroadcast patterns that follow.
6940 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
6941 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
6943 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
6944 Sched<[WriteShuffleLd]>, VEX, VEX_L;
6946 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
6947 ExeDomain = SSEPackedSingle in
6948 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
6950 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
6951 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
// Select VBROADCASTF128 for an X86SubVBroadcast of a loaded 128-bit FP
// vector (v2f64 -> v4f64, v4f32 -> v8f32).
6953 let Predicates = [HasAVX, NoVLX] in {
6954 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
6955 (VBROADCASTF128 addr:$src)>;
6956 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
6957 (VBROADCASTF128 addr:$src)>;
// The same instruction is also selected for loaded 128-bit integer vectors.
6960 // NOTE: We're using FP instructions here, but execution domain fixing can
6961 // convert to integer when profitable.
6962 let Predicates = [HasAVX, NoVLX] in {
6963 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
6964 (VBROADCASTF128 addr:$src)>;
6965 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
6966 (VBROADCASTF128 addr:$src)>;
6967 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
6968 (VBROADCASTF128 addr:$src)>;
6969 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
6970 (VBROADCASTF128 addr:$src)>;
6973 //===----------------------------------------------------------------------===//
6974 // VINSERTF128 - Insert packed floating-point values
// Register (rr) and memory (rm) forms. Both are defined with an empty
// pattern list; selection patterns are supplied separately through the
// vinsert_lowering multiclass. $src1 is the 256-bit input, $src2 the
// 128-bit value (VR128 or f128mem) and $src3 an 8-bit immediate.
6976 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
6977 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
6978 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
6979 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6980 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
6982 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
6983 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
6984 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6985 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
6988 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
6989 // with YMM register containing zero.
6990 // FIXME: Avoid producing vxorps to clear the fake inputs.
// Both compare inputs are AVX_SET0; 0xf is the immediate passed to
// VCMPPSYrri (presumably the "true" comparison predicate, per the comment
// above -- confirm against the compare-predicate encoding).
6991 let Predicates = [HasAVX1Only] in {
6992 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
// vinsert_lowering - Patterns that select a 128-bit insert instruction for
// vinsert128_insert nodes.
//   InstrStr   - instruction name prefix; "rr" or "rm" is appended and the
//                result is cast to an Instruction.
//   From       - value type of the inserted VR128 operand.
//   To         - value type of the VR256 operand being inserted into.
//   memop_frag - load fragment matched for the memory-source (rm) form.
// The instruction immediate is computed from the matched node ($ins) by
// INSERT_get_vinsert128_imm.
6995 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
6996 PatFrag memop_frag> {
6997 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
6999 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7000 (INSERT_get_vinsert128_imm VR256:$ins))>;
7001 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7002 (From (memop_frag addr:$src2)),
7004 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7005 (INSERT_get_vinsert128_imm VR256:$ins))>;
// FP element types select VINSERTF128 under [HasAVX, NoVLX].
7008 let Predicates = [HasAVX, NoVLX] in {
7009 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7010 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
// Integer element types select VINSERTF128 only under HasAVX1Only; the
// VINSERTI128 instantiations later in this file cover [HasAVX2, NoVLX].
7013 let Predicates = [HasAVX1Only] in {
7014 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7015 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7016 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7017 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7020 //===----------------------------------------------------------------------===//
7021 // VEXTRACTF128 - Extract packed floating-point values
// Register-destination (rr) and memory-destination (mr) forms. Both are
// defined with an empty pattern list; selection patterns are supplied
// separately through the vextract_lowering multiclass. $src1 is the 256-bit
// source and $src2 an 8-bit immediate.
7023 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7024 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7025 (ins VR256:$src1, u8imm:$src2),
7026 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7027 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7029 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7030 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7031 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7032 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
// vextract_lowering - Patterns that select a 128-bit extract instruction for
// vextract128_extract nodes.
//   InstrStr - instruction name prefix; "rr" or "mr" is appended and the
//              result is cast to an Instruction.
//   From     - value type of the VR256 source.
//   To       - value type of the extracted result.
// The first pattern produces the result in a register; the second folds a
// store of the extracted value into the memory-destination (mr) form. The
// instruction immediate is computed from the matched node ($ext) by
// EXTRACT_get_vextract128_imm.
7035 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7036 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7037 (To (!cast<Instruction>(InstrStr#rr)
7039 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7040 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7041 (iPTR imm))), addr:$dst),
7042 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7043 (EXTRACT_get_vextract128_imm VR128:$ext))>;
// FP element types select VEXTRACTF128 under [HasAVX, NoVLX].
7047 let Predicates = [HasAVX, NoVLX] in {
7048 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7049 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
// Integer element types select VEXTRACTF128 only under HasAVX1Only; the
// VEXTRACTI128 instantiations later in this file cover [HasAVX2, NoVLX].
7052 let Predicates = [HasAVX1Only] in {
7053 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7054 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7055 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7056 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7059 //===----------------------------------------------------------------------===//
7060 // VMASKMOV - Conditional SIMD Packed Loads and Stores
// avx_movmask_rm - Defines four records for one mask-move mnemonic:
//   rm  / Yrm - 128-bit / 256-bit load forms, encoded with opc_rm.
//   mr  / Ymr - 128-bit / 256-bit store forms, encoded with opc_mr.
//   IntLd, IntLd256 - intrinsics matched by the load forms; they are given
//                     the address followed by $src1.
//   IntSt, IntSt256 - intrinsics matched by the store forms; they are given
//                     the address followed by $src1 and $src2.
//   schedX, schedY  - scheduling info for the 128-bit and 256-bit forms; the
//                     .RM member is used for loads and .MR for stores.
7062 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7063 Intrinsic IntLd, Intrinsic IntLd256,
7064 Intrinsic IntSt, Intrinsic IntSt256,
7065 X86SchedWriteMaskMove schedX,
7066 X86SchedWriteMaskMove schedY> {
7067 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7068 (ins VR128:$src1, f128mem:$src2),
7069 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7070 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7071 VEX_4V, Sched<[schedX.RM]>;
7072 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7073 (ins VR256:$src1, f256mem:$src2),
7074 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7075 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7076 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7077 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7078 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7079 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7080 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7081 VEX_4V, Sched<[schedX.MR]>;
7082 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7083 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7084 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7085 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7086 VEX_4V, VEX_L, Sched<[schedY.MR]>;
// Single-precision variant: load opcode 0x2C, store opcode 0x2E.
7089 let ExeDomain = SSEPackedSingle in
7090 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7091 int_x86_avx_maskload_ps,
7092 int_x86_avx_maskload_ps_256,
7093 int_x86_avx_maskstore_ps,
7094 int_x86_avx_maskstore_ps_256,
7095 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision variant: load opcode 0x2D, store opcode 0x2F.
7096 let ExeDomain = SSEPackedDouble in
7097 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7098 int_x86_avx_maskload_pd,
7099 int_x86_avx_maskload_pd_256,
7100 int_x86_avx_maskstore_pd,
7101 int_x86_avx_maskstore_pd_256,
7102 WriteFMaskMove64, WriteFMaskMove64Y>;
7104 //===----------------------------------------------------------------------===//
7105 // VPERMIL - Permute Single and Double Floating-Point Values
// avx_permil - Defines the four forms of a VPERMIL instruction.
//   rr / rm - forms matching X86VPermilpv, encoded with opc_rm; the second
//             operand is an i_vt vector taken from a register (rr) or loaded
//             through x86memop_i (rm).
//   ri / mi - forms matching X86VPermilpi, encoded with opc_rmi; the second
//             operand is an 8-bit immediate and the first is a register (ri)
//             or a value loaded through x86memop_f (mi).
//   f_vt    - result value type of all four patterns.
//   sched / varsched - scheduling classes; varsched.Folded is used by the rm
//             form and sched.Folded by the mi form.
7108 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7109 RegisterClass RC, X86MemOperand x86memop_f,
7110 X86MemOperand x86memop_i,
7111 ValueType f_vt, ValueType i_vt,
7112 X86FoldableSchedWrite sched,
7113 X86FoldableSchedWrite varsched> {
7114 let Predicates = [HasAVX, NoVLX] in {
7115 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7116 (ins RC:$src1, RC:$src2),
7117 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7118 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7120 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7121 (ins RC:$src1, x86memop_i:$src2),
7122 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7123 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7124 (i_vt (load addr:$src2)))))]>, VEX_4V,
7125 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7127 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7128 (ins RC:$src1, u8imm:$src2),
7129 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7130 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
7132 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7133 (ins x86memop_f:$src1, u8imm:$src2),
7134 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7136 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
7137 Sched<[sched.Folded]>;
7138 }// Predicates = [HasAVX, NoVLX]
// Single-precision: opcode 0x0C for the rr/rm forms, 0x04 for ri/mi.
// The Y instantiation uses VR256 / 256-bit memory operands and adds VEX_L.
7141 let ExeDomain = SSEPackedSingle in {
7142 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7143 v4f32, v4i32, SchedWriteFShuffle.XMM,
7144 SchedWriteFVarShuffle.XMM>;
7145 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7146 v8f32, v8i32, SchedWriteFShuffle.YMM,
7147 SchedWriteFVarShuffle.YMM>, VEX_L;
// Double-precision: opcode 0x0D for the rr/rm forms, 0x05 for ri/mi.
7149 let ExeDomain = SSEPackedDouble in {
7150 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7151 v2f64, v2i64, SchedWriteFShuffle.XMM,
7152 SchedWriteFVarShuffle.XMM>;
7153 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7154 v4f64, v4i64, SchedWriteFShuffle.YMM,
7155 SchedWriteFVarShuffle.YMM>, VEX_L;
7158 //===----------------------------------------------------------------------===//
7159 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
// Register (rr) and memory (rm) forms, both matching X86VPerm2x128 with an
// 8-bit immediate. Only the rr record is marked isCommutable; the pattern in
// the rm record has the load in the second source operand.
7162 let ExeDomain = SSEPackedSingle in {
7163 let isCommutable = 1 in
7164 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7165 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7166 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7167 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7168 (i8 imm:$src3))))]>, VEX_4V, VEX_L,
7169 Sched<[WriteFShuffle256]>;
7170 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7171 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7172 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7173 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7174 (i8 imm:$src3)))]>, VEX_4V, VEX_L,
7175 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7178 // Immediate transform to help with commuting.
// Returns the immediate XORed with 0x22, i.e. with bit 1 and bit 5 flipped,
// as an i8 immediate. It is applied by the X86VPerm2x128 patterns that swap
// the two vector sources so that a load in the first operand can be folded.
7179 def Perm2XCommuteImm : SDNodeXForm<imm, [{
7180 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
// FP (v4f64): fold a load that appears as the first source by swapping the
// sources and rewriting the immediate with Perm2XCommuteImm.
7183 let Predicates = [HasAVX] in {
7184 // Pattern with load in other operand.
7185 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7186 VR256:$src1, (i8 imm:$imm))),
7187 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
// Integer (v4i64) on AVX1-only targets: select the FP instruction for the
// register form, the load-in-second-operand form and the commuted form.
7190 let Predicates = [HasAVX1Only] in {
7191 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7192 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7193 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7194 (loadv4i64 addr:$src2), (i8 imm:$imm))),
7195 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
7196 // Pattern with load in other operand.
7197 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7198 VR256:$src1, (i8 imm:$imm))),
7199 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7202 //===----------------------------------------------------------------------===//
7203 // VZERO - Zero YMM registers
7204 // Note: These instructions do not affect YMM16-YMM31.
// Both records share opcode 0x77 and list YMM0-YMM15 as implicit Defs;
// VZEROALL carries VEX_L and VZEROUPPER does not. Each is matched from its
// corresponding intrinsic and requires HasAVX.
7207 let SchedRW = [WriteSystem] in {
7208 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7209 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7210 // Zero All YMM registers
7211 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7212 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7213 Requires<[HasAVX]>, VEX_WIG;
7215 // Zero Upper bits of YMM registers
7216 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7217 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7218 Requires<[HasAVX]>, VEX_WIG;
7222 //===----------------------------------------------------------------------===//
7223 // Half precision conversion instructions
// f16c_ph2ps - Register (rr) and memory (rm) forms of vcvtph2ps.
//   RC       - register class of the result.
//   x86memop - memory operand type of the rm form's source.
//   sched    - scheduling class; the rm form uses sched.Folded.
// The source of the rr form is always VR128; the rm pattern matches a
// loadv8i16 from $src.
7226 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7227 X86FoldableSchedWrite sched> {
7228 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7229 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7230 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7231 T8PD, VEX, Sched<[sched]>;
7232 let hasSideEffects = 0, mayLoad = 1 in
7233 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7234 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7235 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
7236 T8PD, VEX, Sched<[sched.Folded]>;
// f16c_ps2ph - Register-destination (rr) and memory-destination (mr) forms
// of vcvtps2ph.
//   RC       - register class of the source $src1.
//   x86memop - memory operand type of the mr form's destination.
//   RR, MR   - scheduling writes for the rr and mr forms respectively.
// The rr result is always VR128. The mr record has an empty pattern list, so
// mayStore is set explicitly; store-folding patterns are written separately.
7239 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7240 SchedWrite RR, SchedWrite MR> {
7241 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7242 (ins RC:$src1, i32u8imm:$src2),
7243 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7244 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
7245 TAPD, VEX, Sched<[RR]>;
7246 let hasSideEffects = 0, mayStore = 1 in
7247 def mr : Ii8<0x1D, MRMDestMem, (outs),
7248 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7249 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7250 TAPD, VEX, Sched<[MR]>;
// Instantiate the 128-bit and 256-bit (Y, VEX_L) conversion records. The
// 128-bit forms use f64mem and the 256-bit forms use f128mem as the memory
// operand.
7253 let Predicates = [HasF16C, NoVLX] in {
7254 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
7255 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
7256 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7258 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7259 WriteCvtPS2PHYSt>, VEX_L;
7261 // Pattern match vcvtph2ps of a scalar i64 load.
7262 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7263 (VCVTPH2PSrm addr:$src)>;
7264 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
7265 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7266 (VCVTPH2PSrm addr:$src)>;
// Fold a store of the conversion result into the memory-destination form:
// element 0 of the VR128-source result viewed as v2f64 or v2i64, or the
// whole v8i16 result of the VR256-source conversion.
7268 def : Pat<(store (f64 (extractelt
7269 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7270 (iPTR 0))), addr:$dst),
7271 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7272 def : Pat<(store (i64 (extractelt
7273 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7274 (iPTR 0))), addr:$dst),
7275 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7276 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
7277 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
7280 // Patterns for matching conversions from float to half-float and vice versa.
7281 let Predicates = [HasF16C, NoVLX] in {
7282 // Use MXCSR.RC for rounding instead of explicitly specifying the default
7283 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
7284 // configurations we support (the default). However, falling back to MXCSR is
7285 // more consistent with other instructions, which are always controlled by it.
7286 // It's encoded as 0b100.
// fp_to_f16: copy the FR32 value into VR128, convert with VCVTPS2PHrr
// (immediate 4), pass the result through VMOVPDI2DIrr and take sub_16bit.
7287 def : Pat<(fp_to_f16 FR32:$src),
7288 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
7289 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
// f16_to_fp: widen the GR16 value with MOVSX32rr16, copy it into VR128,
// convert with VCVTPH2PSrr and copy the result to FR32.
7291 def : Pat<(f16_to_fp GR16:$src),
7292 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7293 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
// Round trip through f16: chain the two conversions directly in vector
// registers, without the intermediate GR16 value.
7295 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
7296 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7297 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
7300 //===----------------------------------------------------------------------===//
7301 // AVX2 Instructions
7302 //===----------------------------------------------------------------------===//
7304 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
//   OpNode       - node matched with two vector sources and an immediate.
//   OpVT         - value type of the result.
//   sched        - scheduling class (Folded / ReadAfterFold for rmi).
//   x86memop     - memory operand type of the rmi form's second source.
//   commuteXForm - immediate transform applied when the sources are swapped.
// Defines rri (marked isCommutable) and rmi records, plus a pattern that
// folds a load appearing as the first source by swapping the sources and
// rewriting the immediate with commuteXForm.
7305 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7306 ValueType OpVT, X86FoldableSchedWrite sched,
7308 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7309 let isCommutable = 1 in
7310 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7311 (ins RC:$src1, RC:$src2, u8imm:$src3),
7312 !strconcat(OpcodeStr,
7313 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7314 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
7315 Sched<[sched]>, VEX_4V;
7316 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7317 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7318 !strconcat(OpcodeStr,
7319 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7321 (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
7322 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7324 // Pattern to commute if load is in first source.
7325 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
7326 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7327 (commuteXForm imm:$src3))>;
// 128-bit (v4i32) and 256-bit (v8i32, VEX_L) vpblendd records.
7330 let Predicates = [HasAVX2] in {
7331 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7332 SchedWriteBlend.XMM, VR128, i128mem,
7334 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7335 SchedWriteBlend.YMM, VR256, i256mem,
7336 BlendCommuteImm8>, VEX_L;
// Blends of 64-bit-element vectors are also selected to vpblendd. The
// immediate is rewritten by a BlendScale* transform (defined elsewhere);
// the *Commute* variants are used when the load is the first source and
// the operands are swapped.
7338 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
7339 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
7340 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
7341 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
7342 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
7343 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
7345 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
7346 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
7347 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
7348 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
7349 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
7350 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
7353 // For insertion into the zero index (low half) of a 256-bit vector, it is
7354 // more efficient to generate a blend with immediate instead of an insert*128.
7355 // NOTE: We're using FP instructions here, but execution domain fixing should
7356 // take care of using integer instructions when profitable.
7357 let Predicates = [HasAVX] in {
// Register forms: the 128-bit value is placed in an otherwise undefined
// 256-bit register (INSERT_SUBREG at sub_xmm) and blended with $src1 using
// immediate 0xf.
7358 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7359 (VBLENDPSYrri VR256:$src1,
7360 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7361 VR128:$src2, sub_xmm), 0xf)>;
7362 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7363 (VBLENDPSYrri VR256:$src1,
7364 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7365 VR128:$src2, sub_xmm), 0xf)>;
7366 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7367 (VBLENDPSYrri VR256:$src1,
7368 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7369 VR128:$src2, sub_xmm), 0xf)>;
7370 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7371 (VBLENDPSYrri VR256:$src1,
7372 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7373 VR128:$src2, sub_xmm), 0xf)>;
// Memory forms: the 256-bit vector comes from a load, so the widened 128-bit
// value becomes the first blend operand, the load is folded as the second,
// and the immediate is the complementary mask 0xf0.
7375 def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7376 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7377 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7378 def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7379 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7380 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7381 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7382 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7383 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7384 def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7385 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7386 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7389 //===----------------------------------------------------------------------===//
7390 // VPBROADCAST - Load from memory and broadcast to all elements of the
7391 // destination operand
// avx2_broadcast - Defines the four forms of an integer broadcast:
//   rr  / rm  - VR128 result from a VR128 register / from memory.
//   Yrr / Yrm - VR256 result (VEX_L) from a VR128 register / from memory.
//   x86memop  - memory operand type of the rm/Yrm source.
//   ld_frag   - load fragment matched by the rm/Yrm patterns.
//   OpVT128, OpVT256 - value types of the 128-bit and 256-bit results; the
//               register source is always typed OpVT128.
//   prd       - extra predicate combined with HasAVX2.
7393 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7394 X86MemOperand x86memop, PatFrag ld_frag,
7395 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7396 let Predicates = [HasAVX2, prd] in {
7397 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7398 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7400 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7401 Sched<[SchedWriteShuffle.XMM]>, VEX;
7402 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7403 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7405 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
7406 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7407 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7408 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7410 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7411 Sched<[WriteShuffle256]>, VEX, VEX_L;
7412 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7413 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7415 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
7416 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7418 // Provide aliases for broadcast from the same register class that
7419 // automatically do the extract.
7420 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7421 (!cast<Instruction>(NAME#"Yrr")
7422 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
// One instantiation per element width. The byte and word forms use the
// NoVLX_Or_NoBWI predicate; the dword and qword forms use NoVLX.
7426 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
7427 v16i8, v32i8, NoVLX_Or_NoBWI>;
7428 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
7429 v8i16, v16i16, NoVLX_Or_NoBWI>;
7430 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
7431 v4i32, v8i32, NoVLX>;
7432 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
7433 v2i64, v4i64, NoVLX>;
7435 let Predicates = [HasAVX2, NoVLX] in {
7436 // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
7437 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7438 (VPBROADCASTQrm addr:$src)>;
7439 def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7440 (VPBROADCASTQYrm addr:$src)>;
7442 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7443 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7444 // This means we'll encounter truncated i32 loads; match that here.
// Each truncated-load shape (plain i32 load, extloadi16, zextloadi16) is
// matched for both the 128-bit and the 256-bit result.
7445 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7446 (VPBROADCASTWrm addr:$src)>;
7447 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7448 (VPBROADCASTWYrm addr:$src)>;
7449 def : Pat<(v8i16 (X86VBroadcast
7450 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7451 (VPBROADCASTWrm addr:$src)>;
7452 def : Pat<(v8i16 (X86VBroadcast
7453 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7454 (VPBROADCASTWrm addr:$src)>;
7455 def : Pat<(v16i16 (X86VBroadcast
7456 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7457 (VPBROADCASTWYrm addr:$src)>;
7458 def : Pat<(v16i16 (X86VBroadcast
7459 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7460 (VPBROADCASTWYrm addr:$src)>;
7463 let Predicates = [HasAVX2, NoVLX] in {
7464 // Provide fallback in case the load node that is used in the patterns above
7465 // is used by additional users, which prevents the pattern selection.
7466 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7467 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7468 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7469 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7470 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7471 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
// Broadcast from a GR8/GR16 register: the value is placed into an otherwise
// undefined 32-bit register (INSERT_SUBREG), passed through VMOVDI2PDIrr and
// then broadcast with the register-source instruction.
7474 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7475 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7476 (VPBROADCASTBrr (VMOVDI2PDIrr
7477 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7478 GR8:$src, sub_8bit))))>;
7479 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7480 (VPBROADCASTBYrr (VMOVDI2PDIrr
7481 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7482 GR8:$src, sub_8bit))))>;
7484 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7485 (VPBROADCASTWrr (VMOVDI2PDIrr
7486 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7487 GR16:$src, sub_16bit))))>;
7488 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7489 (VPBROADCASTWYrr (VMOVDI2PDIrr
7490 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7491 GR16:$src, sub_16bit))))>;
// Broadcast from a GR32/GR64 register: pass the value through VMOVDI2PDIrr /
// VMOV64toPQIrr and broadcast with the register-source instruction.
7493 let Predicates = [HasAVX2, NoVLX] in {
7494 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7495 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7496 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7497 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7498 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7499 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7500 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7501 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7504 // AVX1 broadcast patterns
// Integer broadcasts of a loaded scalar select the FP memory-source
// broadcast instructions on AVX1-only targets.
7505 let Predicates = [HasAVX1Only] in {
7506 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
7507 (VBROADCASTSSYrm addr:$src)>;
7508 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
7509 (VBROADCASTSDYrm addr:$src)>;
7510 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
7511 (VBROADCASTSSrm addr:$src)>;
7514 // Provide fallback in case the load node that is used in the patterns above
7515 // is used by additional users, which prevents the pattern selection.
7516 let Predicates = [HasAVX, NoVLX] in {
7517 // 128bit broadcasts:
// All v2f64 broadcasts select VMOVDDUP: the rr form for a scalar or vector
// register source, the rm form for the load-based sources.
7518 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7519 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7520 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
7521 (VMOVDDUPrm addr:$src)>;
7523 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7524 (VMOVDDUPrr VR128:$src)>;
7525 def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
7526 (VMOVDDUPrm addr:$src)>;
7527 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
7528 (VMOVDDUPrm addr:$src)>;
// AVX1-only register broadcasts. The register-source broadcast records in
// this file require HasAVX2, so the value is splatted within 128 bits using
// a shuffle (VPERMILPSri / VMOVDDUPrr / VPSHUFDri). For 256-bit results the
// splatted value is placed at sub_xmm of an undefined register and inserted
// again with VINSERTF128rr using immediate 1.
7531 let Predicates = [HasAVX1Only] in {
7532 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7533 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7534 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7535 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7536 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7537 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7538 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7539 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7540 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7541 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7543 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7544 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7545 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7546 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7547 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7548 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
// NOTE(review): the 0x44 shuffle immediate presumably repeats the low 64
// bits (dword selectors 0,1,0,1) -- confirm against the PSHUFD encoding.
7549 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7550 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7551 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7552 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7554 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7555 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7556 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
7557 (VMOVDDUPrm addr:$src)>;
7560 //===----------------------------------------------------------------------===//
7561 // VPERM - Permute instructions
// avx2_perm - 256-bit permute matching X86VPermv with two vector operands.
//   OpVT  - value type of the result.
//   Sched - scheduling class (Folded / ReadAfterFold for the memory form).
//   memOp - memory operand type of the Yrm form's second source.
// Defines Yrr (both sources in VR256) and Yrm (second source loaded).
7564 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7565 ValueType OpVT, X86FoldableSchedWrite Sched,
7566 X86MemOperand memOp> {
7567 let Predicates = [HasAVX2, NoVLX] in {
7568 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7569 (ins VR256:$src1, VR256:$src2),
7570 !strconcat(OpcodeStr,
7571 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7573 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7574 Sched<[Sched]>, VEX_4V, VEX_L;
7575 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7576 (ins VR256:$src1, memOp:$src2),
7577 !strconcat(OpcodeStr,
7578 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7580 (OpVT (X86VPermv VR256:$src1,
7581 (load addr:$src2))))]>,
7582 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
// Integer (v8i32) and single-precision (v8f32) instantiations of avx2_perm.
7586 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7587 let ExeDomain = SSEPackedSingle in
7588 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
// avx2_perm_imm - 256-bit permute matching X86VPermi with an 8-bit immediate.
//   mem_frag - load fragment matched by the memory form.
//   OpVT     - value type of the result.
//   Sched    - scheduling class (Folded / ReadAfterFold for the memory form).
//   memOp    - memory operand type of the Ymi form's source.
// Defines Yri (source in VR256) and Ymi (source loaded from memory).
7590 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7591 ValueType OpVT, X86FoldableSchedWrite Sched,
7592 X86MemOperand memOp> {
7593 let Predicates = [HasAVX2, NoVLX] in {
7594 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7595 (ins VR256:$src1, u8imm:$src2),
7596 !strconcat(OpcodeStr,
7597 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7599 (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
7600 Sched<[Sched]>, VEX, VEX_L;
7601 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7602 (ins memOp:$src1, u8imm:$src2),
7603 !strconcat(OpcodeStr,
7604 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7606 (OpVT (X86VPermi (mem_frag addr:$src1),
7607 (i8 imm:$src2))))]>,
7608 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
// Integer (v4i64) and double-precision (v4f64) instantiations of
// avx2_perm_imm; both add VEX_W.
7612 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7613 WriteShuffle256, i256mem>, VEX_W;
7614 let ExeDomain = SSEPackedDouble in
7615 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7616 WriteFShuffle256, f256mem>, VEX_W;
7618 //===----------------------------------------------------------------------===//
7619 // VPERM2I128 - Permute Integer Values in 128-bit chunks
// Register (rr) and memory (rm) forms, both matching X86VPerm2x128 on v4i64
// with an 8-bit immediate. Only the rr record is marked isCommutable.
7621 let isCommutable = 1 in
7622 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7623 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7624 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7625 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7626 (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
7628 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7629 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7630 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7631 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7633 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// Fold a load that appears as the first source by swapping the sources and
// rewriting the immediate with Perm2XCommuteImm.
7635 let Predicates = [HasAVX2] in
7636 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7637 VR256:$src1, (i8 imm:$imm))),
7638 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7641 //===----------------------------------------------------------------------===//
7642 // VINSERTI128 - Insert packed integer values
// vinserti128 register and memory forms. Both defs have an empty selection
// pattern ([]), and hasSideEffects is set to 0 explicitly for the group.
// Presumably selection comes from the vinsert_lowering instantiations for
// "VINSERTI128" -- confirm against that multiclass.
7644 let hasSideEffects = 0 in {
// Register form: inserts VR128 $src2 into VR256 $src1 under control of the
// 8-bit immediate $src3.
7645 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7646 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7647 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7648 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
// Memory form: $src2 is an i128mem operand; uses the folded-load scheduling
// classes of WriteShuffle256.
7650 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7651 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7652 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7653 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// Instantiate vinsert_lowering for VINSERTI128 once per integer element type,
// pairing each 128-bit type with its 256-bit counterpart and the matching
// 128-bit load fragment. Guarded by the HasAVX2 and NoVLX predicates.
7656 let Predicates = [HasAVX2, NoVLX] in {
7657 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
7658 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
7659 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
7660 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
7663 //===----------------------------------------------------------------------===//
7664 // VEXTRACTI128 - Extract packed integer values
// Register-destination form of vextracti128: VR256 source plus 8-bit
// immediate, VR128 result. The selection pattern is empty ([]).
7666 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7667 (ins VR256:$src1, u8imm:$src2),
7668 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7669 Sched<[WriteShuffle256]>, VEX, VEX_L;
// Memory-destination form: no outs, the i128mem destination is an input
// operand. The pattern is empty, and mayStore = 1 / hasSideEffects = 0 are
// stated explicitly on this def.
7670 let hasSideEffects = 0, mayStore = 1 in
7671 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7672 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7673 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7674 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
// Instantiate vextract_lowering for VEXTRACTI128 once per integer element
// type, pairing each 256-bit source type with its 128-bit result type.
// Guarded by the HasAVX2 and NoVLX predicates.
7676 let Predicates = [HasAVX2, NoVLX] in {
7677 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
7678 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
7679 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7680 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7683 //===----------------------------------------------------------------------===//
7684 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
// avx2_pmovmask - masked integer load/store instructions, selected directly
// from intrinsics.
//   OpcodeStr          - assembly mnemonic.
//   IntLd128/IntLd256  - intrinsics matched by the 128/256-bit load forms.
//   IntSt128/IntSt256  - intrinsics matched by the 128/256-bit store forms.
// Produces four defs: rm/Yrm (loads, opcode 0x8c) and mr/Ymr (stores, opcode
// 0x8e). In every form $src1 is the operand that the maskmov_lowering
// patterns in this file bind to the mask.
7686 multiclass avx2_pmovmask<string OpcodeStr,
7687 Intrinsic IntLd128, Intrinsic IntLd256,
7688 Intrinsic IntSt128, Intrinsic IntSt256> {
// 128-bit masked load: IntLd128(address, $src1).
7689 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7690 (ins VR128:$src1, i128mem:$src2),
7691 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7692 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7693 VEX_4V, Sched<[WriteVecMaskedLoad]>;
// 256-bit masked load: IntLd256(address, $src1).
7694 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7695 (ins VR256:$src1, i256mem:$src2),
7696 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7697 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7698 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
// 128-bit masked store: IntSt128(address, $src1, $src2); no register result.
7699 def mr : AVX28I<0x8e, MRMDestMem, (outs),
7700 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7701 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7702 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7703 VEX_4V, Sched<[WriteVecMaskedStore]>;
// 256-bit masked store: IntSt256(address, $src1, $src2); no register result.
7704 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7705 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7706 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7707 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7708 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
// Instantiate avx2_pmovmask for the dword ("vpmaskmovd") and qword
// ("vpmaskmovq") variants, each wired to the corresponding
// int_x86_avx2_maskload_* / int_x86_avx2_maskstore_* intrinsics. Only the
// qword variant carries VEX_W.
7711 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7712 int_x86_avx2_maskload_d,
7713 int_x86_avx2_maskload_d_256,
7714 int_x86_avx2_maskstore_d,
7715 int_x86_avx2_maskstore_d_256>;
7716 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7717 int_x86_avx2_maskload_q,
7718 int_x86_avx2_maskload_q_256,
7719 int_x86_avx2_maskstore_q,
7720 int_x86_avx2_maskstore_q_256>, VEX_W;
// maskmov_lowering - map generic masked_load / masked_store nodes onto a
// mask-move instruction family.
//   InstrStr - instruction name prefix; "mr" (store) or "rm" (load) is
//              appended and the result is cast to an Instruction.
//   RC       - register class used for the data and the mask.
//   VT       - value type of the data being loaded or stored.
//   MaskVT   - value type of the mask operand.
//   BlendStr, ZeroVT - not referenced by the patterns shown here.
7722 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7723 ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
// masked_store(src, ptr, mask) -> <InstrStr>mr ptr, mask, src
7725 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7726 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
// masked_load with an undef pass-through -> <InstrStr>rm mask, ptr
7728 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7729 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
// masked_load with an all-zeros pass-through selects the same instruction.
7730 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7731 (VT immAllZerosV))),
7732 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
// Floating-point vectors (v4f32, v2f64, v8f32, v4f64) under HasAVX: lower
// through VMASKMOVPS/VMASKMOVPD and their 256-bit "Y" forms, with an integer
// mask type of the same element width.
7734 let Predicates = [HasAVX] in {
7735 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
7736 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
7737 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
7738 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
7740 let Predicates = [HasAVX1Only] in {
7741 // Masked load/store of i32/i64 vectors is not supported here, so the ps/pd
// versions of the instructions are used for the integer vector types as well.
7742 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
7743 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
7744 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
7745 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
// Integer vectors under HasAVX2: lower through VPMASKMOVD/VPMASKMOVQ and
// their 256-bit "Y" forms instead of the ps/pd instructions.
7747 let Predicates = [HasAVX2] in {
7748 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
7749 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
7750 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
7751 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
7754 //===----------------------------------------------------------------------===//
7755 // SubVector Broadcasts
7756 // Provide fallback in case the load node that is used in the patterns above
7757 // is used by additional users, which prevents the pattern selection.
// Register-source X86SubVBroadcast for the floating-point types. $src is
// first placed in the sub_xmm subregister of an otherwise undefined
// (IMPLICIT_DEF) 256-bit value, then VINSERTF128rr inserts $src again with
// immediate 1 (presumably the upper 128-bit half -- confirm against the
// vinsertf128 immediate encoding).
7759 let Predicates = [HasAVX, NoVLX] in {
7760 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
7761 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7762 (v2f64 VR128:$src), 1)>;
7763 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
7764 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7765 (v4f32 VR128:$src), 1)>;
7768 // NOTE: We're using FP instructions here, but execution domain fixing can
7769 // convert to integer when profitable.
// Same INSERT_SUBREG + VINSERTF128rr expansion as the floating-point
// patterns, repeated for each integer element type (v2i64, v4i32, v8i16,
// v16i8 broadcast to the corresponding 256-bit type).
7770 let Predicates = [HasAVX, NoVLX] in {
7771 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
7772 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7773 (v2i64 VR128:$src), 1)>;
7774 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
7775 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7776 (v4i32 VR128:$src), 1)>;
7777 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
7778 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7779 (v8i16 VR128:$src), 1)>;
7780 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
7781 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7782 (v16i8 VR128:$src), 1)>;
7785 //===----------------------------------------------------------------------===//
7786 // Variable Bit Shifts
// avx2_var_shift - per-element variable shift instructions.
//   opc         - opcode byte shared by all four forms.
//   OpcodeStr   - assembly mnemonic.
//   OpNode      - SelectionDAG node matched by the patterns.
//   vt128/vt256 - value types used for the 128-bit and 256-bit forms.
// Produces rr/rm (VR128) and Yrr/Yrm (VR256, VEX_L). $src1 is the first
// OpNode operand and $src2 (register or loaded value) the second.
7788 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7789 ValueType vt128, ValueType vt256> {
// 128-bit register-register form.
7790 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7791 (ins VR128:$src1, VR128:$src2),
7792 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7794 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7795 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
// 128-bit form with the second operand loaded from memory; uses the
// folded-load scheduling classes.
7796 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7797 (ins VR128:$src1, i128mem:$src2),
7798 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7800 (vt128 (OpNode VR128:$src1,
7801 (vt128 (load addr:$src2)))))]>,
7802 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7803 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
// 256-bit register-register form.
7804 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7805 (ins VR256:$src1, VR256:$src2),
7806 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7808 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7809 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
// 256-bit form with the second operand loaded from memory.
7810 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7811 (ins VR256:$src1, i256mem:$src2),
7812 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7814 (vt256 (OpNode VR256:$src1,
7815 (vt256 (load addr:$src2)))))]>,
7816 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7817 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
// Instantiate avx2_var_shift under HasAVX2 and NoVLX. The dword and qword
// variants of a shift share an opcode (0x47 for X86vshlv, 0x45 for X86vsrlv)
// and differ by VEX_W on the qword form. For X86vsrav (0x46) only the dword
// variant is instantiated in this group.
7820 let Predicates = [HasAVX2, NoVLX] in {
7821 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
7822 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
7823 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
7824 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
7825 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
7828 //===----------------------------------------------------------------------===//
7829 // VGATHER - GATHER Operations
7831 // FIXME: Improve scheduling of gather instructions.
// avx2_gather - gather instructions with two results: the gathered data
// ($dst) and a written-back mask ($mask_wb).
//   opc, OpcodeStr      - opcode byte and assembly mnemonic.
//   VTx / VTy           - data value types of the rm / Yrm forms.
//   GatherNode128/256   - pattern fragments matched by the rm / Yrm forms.
//   RC256               - register class used by the Yrm form for data and
//                         mask (the rm form always uses VR128).
//   memop128 / memop256 - memory operands of the rm / Yrm forms.
//   MTx / MTy           - value types of the written-back mask; they default
//                         to VTx / VTy.
// Both forms are scheduled as WriteLoad. The assembly string prints only
// $mask, $src2 and $dst.
7832 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
7833 ValueType VTy, PatFrag GatherNode128,
7834 PatFrag GatherNode256, RegisterClass RC256,
7835 X86MemOperand memop128, X86MemOperand memop256,
7836 ValueType MTx = VTx, ValueType MTy = VTy> {
// VR128 form: GatherNode128($src1, $mask, vector address).
7837 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
7838 (ins VR128:$src1, memop128:$src2, VR128:$mask),
7839 !strconcat(OpcodeStr,
7840 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7841 [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
7842 (GatherNode128 VR128:$src1, VR128:$mask,
7843 vectoraddr:$src2))]>,
7844 VEX, Sched<[WriteLoad]>;
// RC256 form (VEX_L): GatherNode256($src1, $mask, vector address).
7845 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
7846 (ins RC256:$src1, memop256:$src2, RC256:$mask),
7847 !strconcat(OpcodeStr,
7848 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7849 [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
7850 (GatherNode256 RC256:$src1, RC256:$mask,
7851 vectoraddr:$src2))]>,
7852 VEX, VEX_L, Sched<[WriteLoad]>;
// Gather instantiations, all under HasAVX2. The shared `let` marks the defs
// as mayLoad with no other side effects, makes both outputs ($dst, $mask_wb)
// early-clobber, and ties $src1 to $dst and $mask to $mask_wb.
7855 let Predicates = [HasAVX2] in {
7856 let mayLoad = 1, hasSideEffects = 0, Constraints
7857 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
// Integer gathers. Opcode 0x90 is used with the mgatherv4i32/mgatherv8i32
// fragments and 0x91 with mgatherv2i64/mgatherv4i64; the qword-data forms
// carry VEX_W. VPGATHERQD uses VR128 and v4i32 for its "Y" form as well.
7859 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
7860 mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
7861 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
7862 mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
7863 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
7864 mgatherv8i32, VR256, vx128mem, vy256mem>;
7865 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
7866 mgatherv4i64, VR128, vx64mem, vy128mem>;
// Double-precision gathers (opcodes 0x92/0x93, VEX_W): data is v2f64/v4f64
// while the written-back mask types are given explicitly as v2i64/v4i64.
7868 let ExeDomain = SSEPackedDouble in {
7869 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
7870 mgatherv4i32, VR256, vx128mem, vx256mem,
7871 v2i64, v4i64>, VEX_W;
7872 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
7873 mgatherv4i64, VR256, vx128mem, vy256mem,
7874 v2i64, v4i64>, VEX_W;
// Single-precision gathers (opcodes 0x92/0x93). VGATHERQPS uses VR128 and
// v4f32 for its "Y" form as well.
7877 let ExeDomain = SSEPackedSingle in {
7878 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
7879 mgatherv8i32, VR256, vx128mem, vy256mem,
7881 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
7882 mgatherv4i64, VR128, vx64mem, vy128mem,
7888 //===----------------------------------------------------------------------===//
7889 // GFNI instructions
7890 //===----------------------------------------------------------------------===//
// GF2P8MULB_rm - register and memory forms of the X86GF2P8mulb operation
// (opcode 0xCF, T8PD, SSEPackedInt execution domain).
//   OpcodeStr - assembly mnemonic.
//   OpVT      - value type of the result.
//   RC        - register class of destination and sources.
//   MemOpFrag - load fragment used by the memory form.
//   X86MemOp  - memory operand of the memory form.
//   Is2Addr   - when set, the assembly string uses the two-operand syntax
//               (no separate $src1); defaults to 0.
// Each def passes an empty asm string; the text comes from the AsmString
// override in the enclosing `let`.
7892 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
7893 RegisterClass RC, PatFrag MemOpFrag,
7894 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7895 let ExeDomain = SSEPackedInt,
7896 AsmString = !if(Is2Addr,
7897 OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
7898 OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
// Register-register form; the only one marked commutable.
7899 let isCommutable = 1 in
7900 def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
7901 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
7902 Sched<[SchedWriteVecALU.XMM]>, T8PD;
// Register-memory form: the second operand is loaded through MemOpFrag.
7904 def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
7905 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
7906 (MemOpFrag addr:$src2))))]>,
7907 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
// GF2P8AFFINE_rmi - register and memory forms of a GFNI affine operation
// that takes two vector operands and an 8-bit immediate.
//   Op        - opcode byte.
//   OpStr     - assembly mnemonic.
//   OpVT      - value type of the result.
//   OpNode    - SelectionDAG node matched by the patterns.
//   RC        - register class of destination and register sources.
//   MemOpFrag - load fragment used by the memory form.
//   X86MemOp  - memory operand of the memory form.
//   Is2Addr   - when set, the assembly string omits $src1; defaults to 0.
// Each def passes an empty asm string; the text comes from the AsmString
// override in the enclosing `let`. Both defs use the SSEPackedInt domain.
7911 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
7912 SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
7913 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7914 let AsmString = !if(Is2Addr,
7915 OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7916 OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
// Register, register, immediate form.
7917 def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
7918 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
7919 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
7920 SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
// Register, memory, immediate form: the second operand is loaded through
// MemOpFrag.
7921 def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
7922 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
7923 [(set RC:$dst, (OpVT (OpNode RC:$src1,
7924 (MemOpFrag addr:$src2),
7925 imm:$src3)))], SSEPackedInt>,
7926 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
// GF2P8AFFINE_common - expand one affine operation into its three variants:
//   NAME       - v16i8/VR128 form under HasGFNI + UseSSE2, with $src1 tied to
//                $dst and Is2Addr = 1.
//   V##NAME    - v16i8/VR128 form with a "v"-prefixed mnemonic (VEX_4V, VEX_W).
//   V##NAME##Y - v32i8/VR256 form with a "v"-prefixed mnemonic (VEX_4V, VEX_L,
//                VEX_W).
// The "v" forms require HasGFNI, HasAVX and NoVLX_Or_NoBWI. All three use the
// plain `load` fragment.
//   Op, OpStr, OpNode - forwarded to GF2P8AFFINE_rmi.
7930 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
7931 let Constraints = "$src1 = $dst",
7932 Predicates = [HasGFNI, UseSSE2] in
7933 defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
7934 VR128, load, i128mem, 1>;
7935 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7936 defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
7937 load, i128mem>, VEX_4V, VEX_W;
7938 defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
7939 load, i256mem>, VEX_4V, VEX_L, VEX_W;
// gf2p8mulb instantiations. The "gf2p8mulb" form (HasGFNI + UseSSE2) ties
// $src1 to $dst and uses the memop fragment; the "vgf2p8mulb" forms (HasGFNI +
// HasAVX + NoVLX_Or_NoBWI) use the plain load fragment, for VR128/v16i8 and
// VR256/v32i8.
7944 let Constraints = "$src1 = $dst",
7945 Predicates = [HasGFNI, UseSSE2] in
7946 defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
7948 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7949 defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
7951 defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
7952 i256mem>, VEX_4V, VEX_L;
7954 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
// Both operations are expanded through GF2P8AFFINE_common (opcode 0xCF for
// the inverse variant, 0xCE for the plain one), carry TAPD, and are
// explicitly marked non-commutable.
7955 let isCommutable = 0 in {
7956 defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
7957 X86GF2P8affineinvqb>, TAPD;
7958 defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
7959 X86GF2P8affineqb>, TAPD;