1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions
10 // and the instruction properties that are needed for code generation, machine
11 // code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
24 let isCommutable = 1 in {
25 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
27 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
28 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
29 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
32 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
34 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
35 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
36 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
37 Sched<[sched.Folded, sched.ReadAfterFold]>;
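// Illustration only (hypothetical, not part of the build): instantiating this
// class as, say,
//   defm FOO : sse12_fp_scalar<0x58, "foo", fadd, FR32, f32mem,
//                              SSEPackedSingle, WriteFAdd>;
// would produce a register-register form FOOrr and a load-folding form FOOrm,
// the latter scheduled on the folded variant of the given write resource. The
// scalar FP arithmetic instructions later in this file are built from this
// class in the same way.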
40 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
41 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
42 SDPatternOperator OpNode, RegisterClass RC,
43 ValueType VT, string asm, Operand memopr,
44 ComplexPattern mem_cpat, Domain d,
45 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
46 let isCodeGenOnly = 1, hasSideEffects = 0 in {
47 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
49 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
50 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
51 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
54 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
56 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
57 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
58 [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
59 Sched<[sched.Folded, sched.ReadAfterFold]>;
63 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
64 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
65 RegisterClass RC, ValueType vt,
66 X86MemOperand x86memop, PatFrag mem_frag,
67 Domain d, X86FoldableSchedWrite sched,
69 let isCommutable = 1 in
70 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
72 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
73 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
74 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
77 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
79 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
80 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
81 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
83 Sched<[sched.Folded, sched.ReadAfterFold]>;
86 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed logical instructions class
87 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
88 string OpcodeStr, X86MemOperand x86memop,
89 X86FoldableSchedWrite sched,
90 list<dag> pat_rr, list<dag> pat_rm,
92 let isCommutable = 1, hasSideEffects = 0 in
93 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
95 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
96 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
99 let hasSideEffects = 0, mayLoad = 1 in
100 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
102 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
103 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
105 Sched<[sched.Folded, sched.ReadAfterFold]>;
109 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
110 // This is expanded by ExpandPostRAPseudos.
111 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
112 isPseudo = 1, SchedRW = [WriteZero] in {
113 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
114 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
115 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
116 [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
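// For illustration (a sketch of the expansion, not generated from this file):
// after register allocation ExpandPostRAPseudos rewrites these pseudos into a
// self-XOR of the allocated register, e.g.
//   xorps  %xmm0, %xmm0          # FsFLD0SS / FsFLD0SD on SSE targets
//   vxorps %xmm0, %xmm0, %xmm0   # when AVX is available
// which the hardware recognizes as a dependency-breaking zero idiom.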
119 //===----------------------------------------------------------------------===//
120 // AVX & SSE - Zero/One Vectors
121 //===----------------------------------------------------------------------===//
123 // Alias instruction that maps zero vector to pxor / xorp* for sse.
124 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
125 // swizzled by ExecutionDomainFix to pxor.
126 // We set canFoldAsLoad because this can be converted to a constant-pool
127 // load of an all-zeros value if folding it would be beneficial.
128 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
129 isPseudo = 1, SchedRW = [WriteZero] in {
130 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
131 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
134 let Predicates = [NoAVX512] in
135 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
138 // The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support PI
139 // (packed integer) instructions, and doesn't need them: on Sandy Bridge the
140 // register is set to zero at the rename stage without using any execution
141 // unit, so SET0PSY and SET0PDY can be used for vector int instructions without penalty.
142 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
143 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
144 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
145 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
148 // We set canFoldAsLoad because this can be converted to a constant-pool
149 // load of an all-ones value if folding it would be beneficial.
150 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
151 isPseudo = 1, SchedRW = [WriteZero] in {
152 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
153 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
154 let Predicates = [HasAVX1Only, OptForMinSize] in {
155 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
156 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
158 let Predicates = [HasAVX2] in
159 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
160 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
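// Illustration (assumed expansion): V_SETALLONES is later rewritten to a
// compare-equal of a register with itself, e.g. "pcmpeqd %xmm0, %xmm0",
// which sets every bit of the register; AVX2_SETALLONES uses the ymm form
// "vpcmpeqd %ymm0, %ymm0, %ymm0". Like the zero pseudos above, no input
// value is needed, which is why these are rematerializable and as cheap as
// a move.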
163 //===----------------------------------------------------------------------===//
164 // SSE 1 & 2 - Move FP Scalar Instructions
166 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
167 // register copies because it's a partial register update; register-to-register
168 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
169 // that the insert be implementable in terms of a copy, and, as just mentioned,
170 // we don't use movss/movsd for copies.
171 //===----------------------------------------------------------------------===//
173 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
174 X86MemOperand x86memop, string base_opc,
175 string asm_opr, Domain d, string Name> {
176 let isCommutable = 1 in
177 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
178 (ins VR128:$src1, VR128:$src2),
179 !strconcat(base_opc, asm_opr),
180 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
181 Sched<[SchedWriteFShuffle.XMM]>;
183 // For the disassembler
184 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
185 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
186 (ins VR128:$src1, VR128:$src2),
187 !strconcat(base_opc, asm_opr), []>,
188 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
191 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
192 X86MemOperand x86memop, string OpcodeStr,
193 Domain d, string Name, Predicate pred> {
195 let Predicates = [UseAVX, OptForSize] in
196 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
197 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
199 VEX_4V, VEX_LIG, VEX_WIG;
201 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
202 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
203 [(store RC:$src, addr:$dst)], d>,
204 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
206 let Constraints = "$src1 = $dst" in {
207 let Predicates = [pred, NoSSE41_Or_OptForSize] in
208 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
209 "\t{$src2, $dst|$dst, $src2}", d, Name>;
212 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
213 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
214 [(store RC:$src, addr:$dst)], d>,
215 Sched<[WriteFStore]>;
217 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
218 (!cast<Instruction>("V"#NAME#"rr_REV")
219 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
220 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
221 (!cast<Instruction>(NAME#"rr_REV")
222 VR128:$dst, VR128:$src2), 0>;
225 // Loading from memory automatically zeroes the upper bits.
226 multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
227 PatFrag mem_pat, string OpcodeStr, Domain d> {
228 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
229 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
230 [(set RC:$dst, (mem_pat addr:$src))], d>,
231 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
232 def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
233 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
234 [(set RC:$dst, (mem_pat addr:$src))], d>,
238 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
239 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
240 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
241 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
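// Roughly, the two defm lines above expand into the SSE forms
// MOVSSrr/MOVSSmr and MOVSDrr/MOVSDmr plus the VEX-encoded VMOVSS*/VMOVSD*
// variants, e.g. in AT&T syntax:
//   movss  %xmm1, %xmm0           # merge the low 32 bits of %xmm1 into %xmm0
//   vmovss %xmm2, %xmm1, %xmm0    # AVX three-operand form
// This is an orientation sketch only; the exact opcodes and predicates come
// from the multiclasses above.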
243 let canFoldAsLoad = 1, isReMaterializable = 1 in {
244 defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
245 SSEPackedSingle>, XS;
246 defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
247 SSEPackedDouble>, XD;
251 let Predicates = [UseAVX] in {
252 // MOVSSrm zeros the high parts of the register; represent this
253 // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
254 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
255 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
256 def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
257 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
258 def : Pat<(v4f32 (X86vzload addr:$src)),
259 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
261 // MOVSDrm zeros the high parts of the register; represent this
262 // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
263 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
264 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
265 def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
266 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
267 def : Pat<(v2f64 (X86vzload addr:$src)),
268 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
270 // Represent the same patterns above but in the form they appear for
271 // 256-bit types.
272 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
273 (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
274 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
275 def : Pat<(v8f32 (X86vzload addr:$src)),
276 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
277 def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
278 (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
279 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
280 def : Pat<(v4f64 (X86vzload addr:$src)),
281 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
283 // Extract and store.
284 def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
286 (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
289 let Predicates = [UseAVX, OptForSize] in {
290 // Move scalar to XMM zero-extended: zero a VR128, then do a
291 // MOVSS to the lower bits.
292 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
293 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
294 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
295 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
297 // Move low f32 and clear high bits.
298 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
299 (SUBREG_TO_REG (i32 0),
300 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
301 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
302 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
303 (SUBREG_TO_REG (i32 0),
304 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
305 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
307 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
308 (SUBREG_TO_REG (i32 0),
309 (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
310 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
312 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
313 (SUBREG_TO_REG (i32 0),
314 (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
315 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
319 let Predicates = [UseSSE1] in {
320 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
321 // Move scalar to XMM zero-extended: zero a VR128, then do a
322 // MOVSS to the lower bits.
323 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
324 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
325 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
326 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
329 // MOVSSrm already zeros the high parts of the register.
330 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
331 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
332 def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
333 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
334 def : Pat<(v4f32 (X86vzload addr:$src)),
335 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
337 // Extract and store.
338 def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
340 (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
343 let Predicates = [UseSSE2] in {
344 // MOVSDrm already zeros the high parts of the register.
345 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
346 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
347 def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
348 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
349 def : Pat<(v2f64 (X86vzload addr:$src)),
350 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
353 // Aliases to help the assembler pick two-byte VEX encodings by swapping the
354 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
355 def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
356 (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
357 def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
358 (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
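// Background for the aliases above (illustrative): the two-byte VEX prefix
// (C5) can encode VEX.R but not VEX.B, so an extended register (xmm8-xmm15)
// in the ModRM r/m field forces the three-byte prefix. When the operand that
// would land in r/m is extended and the destination is not, selecting the
// _REV (MRMDestReg) encoding swaps the fields, the extended register is then
// expressed with VEX.R, and the shorter prefix can be used.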
360 //===----------------------------------------------------------------------===//
361 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
362 //===----------------------------------------------------------------------===//
364 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
365 X86MemOperand x86memop, PatFrag ld_frag,
366 string asm, Domain d,
367 X86SchedWriteMoveLS sched> {
368 let hasSideEffects = 0, isMoveReg = 1 in
369 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
370 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
372 let canFoldAsLoad = 1, isReMaterializable = 1 in
373 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
374 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
375 [(set RC:$dst, (ld_frag addr:$src))], d>,
379 let Predicates = [HasAVX, NoVLX] in {
380 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
381 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
383 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
384 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
386 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
387 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
389 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
390 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
393 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
394 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
395 PS, VEX, VEX_L, VEX_WIG;
396 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
397 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
398 PD, VEX, VEX_L, VEX_WIG;
399 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
400 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
401 PS, VEX, VEX_L, VEX_WIG;
402 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
403 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
404 PD, VEX, VEX_L, VEX_WIG;
407 let Predicates = [UseSSE1] in {
408 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
409 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
411 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
412 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
415 let Predicates = [UseSSE2] in {
416 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
417 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
419 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
420 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
424 let Predicates = [HasAVX, NoVLX] in {
425 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
426 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
427 "movaps\t{$src, $dst|$dst, $src}",
428 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
430 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
431 "movapd\t{$src, $dst|$dst, $src}",
432 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
434 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
435 "movups\t{$src, $dst|$dst, $src}",
436 [(store (v4f32 VR128:$src), addr:$dst)]>,
438 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
439 "movupd\t{$src, $dst|$dst, $src}",
440 [(store (v2f64 VR128:$src), addr:$dst)]>,
444 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
445 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
446 "movaps\t{$src, $dst|$dst, $src}",
447 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
449 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
450 "movapd\t{$src, $dst|$dst, $src}",
451 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
453 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
454 "movups\t{$src, $dst|$dst, $src}",
455 [(store (v8f32 VR256:$src), addr:$dst)]>,
457 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
458 "movupd\t{$src, $dst|$dst, $src}",
459 [(store (v4f64 VR256:$src), addr:$dst)]>,
465 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
467 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
468 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
470 "movaps\t{$src, $dst|$dst, $src}", []>,
471 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
472 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
474 "movapd\t{$src, $dst|$dst, $src}", []>,
475 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
476 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
478 "movups\t{$src, $dst|$dst, $src}", []>,
479 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
480 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
482 "movupd\t{$src, $dst|$dst, $src}", []>,
483 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
486 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
487 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
489 "movaps\t{$src, $dst|$dst, $src}", []>,
490 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
491 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
493 "movapd\t{$src, $dst|$dst, $src}", []>,
494 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
495 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
497 "movups\t{$src, $dst|$dst, $src}", []>,
498 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
499 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
501 "movupd\t{$src, $dst|$dst, $src}", []>,
502 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
506 // Aliases to help the assembler pick two-byte VEX encodings by swapping the
507 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
508 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
509 (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
510 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
511 (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
512 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
513 (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
514 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
515 (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
516 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
517 (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
518 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
519 (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
520 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
521 (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
522 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
523 (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
525 // Reversed version with ".s" suffix for GAS compatibility.
526 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
527 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
528 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
529 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
530 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
531 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
532 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
533 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
534 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
535 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
536 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
537 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
538 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
539 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
540 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
541 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
543 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
544 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
545 "movaps\t{$src, $dst|$dst, $src}",
546 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
547 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
548 "movapd\t{$src, $dst|$dst, $src}",
549 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
550 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
551 "movups\t{$src, $dst|$dst, $src}",
552 [(store (v4f32 VR128:$src), addr:$dst)]>;
553 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
554 "movupd\t{$src, $dst|$dst, $src}",
555 [(store (v2f64 VR128:$src), addr:$dst)]>;
559 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
560 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
561 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
562 "movaps\t{$src, $dst|$dst, $src}", []>,
563 FoldGenData<"MOVAPSrr">;
564 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
565 "movapd\t{$src, $dst|$dst, $src}", []>,
566 FoldGenData<"MOVAPDrr">;
567 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
568 "movups\t{$src, $dst|$dst, $src}", []>,
569 FoldGenData<"MOVUPSrr">;
570 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
571 "movupd\t{$src, $dst|$dst, $src}", []>,
572 FoldGenData<"MOVUPDrr">;
575 // Reversed version with ".s" suffix for GAS compatibility.
576 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
577 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
578 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
579 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
580 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
581 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
582 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
583 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
585 let Predicates = [HasAVX, NoVLX] in {
586 // 256-bit loads/stores need to use floating-point load/store in case we don't
587 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
588 // available and changing the domain is beneficial.
589 def : Pat<(alignedloadv4i64 addr:$src),
590 (VMOVAPSYrm addr:$src)>;
591 def : Pat<(alignedloadv8i32 addr:$src),
592 (VMOVAPSYrm addr:$src)>;
593 def : Pat<(alignedloadv16i16 addr:$src),
594 (VMOVAPSYrm addr:$src)>;
595 def : Pat<(alignedloadv32i8 addr:$src),
596 (VMOVAPSYrm addr:$src)>;
597 def : Pat<(loadv4i64 addr:$src),
598 (VMOVUPSYrm addr:$src)>;
599 def : Pat<(loadv8i32 addr:$src),
600 (VMOVUPSYrm addr:$src)>;
601 def : Pat<(loadv16i16 addr:$src),
602 (VMOVUPSYrm addr:$src)>;
603 def : Pat<(loadv32i8 addr:$src),
604 (VMOVUPSYrm addr:$src)>;
606 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
607 (VMOVAPSYmr addr:$dst, VR256:$src)>;
608 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
609 (VMOVAPSYmr addr:$dst, VR256:$src)>;
610 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
611 (VMOVAPSYmr addr:$dst, VR256:$src)>;
612 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
613 (VMOVAPSYmr addr:$dst, VR256:$src)>;
614 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
615 (VMOVUPSYmr addr:$dst, VR256:$src)>;
616 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
617 (VMOVUPSYmr addr:$dst, VR256:$src)>;
618 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
619 (VMOVUPSYmr addr:$dst, VR256:$src)>;
620 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
621 (VMOVUPSYmr addr:$dst, VR256:$src)>;
624 // Use movaps / movups for SSE integer load / store (one byte shorter).
625 // The instructions selected below are then converted to MOVDQA/MOVDQU
626 // during the SSE domain pass.
627 let Predicates = [UseSSE1] in {
628 def : Pat<(alignedloadv2i64 addr:$src),
629 (MOVAPSrm addr:$src)>;
630 def : Pat<(alignedloadv4i32 addr:$src),
631 (MOVAPSrm addr:$src)>;
632 def : Pat<(alignedloadv8i16 addr:$src),
633 (MOVAPSrm addr:$src)>;
634 def : Pat<(alignedloadv16i8 addr:$src),
635 (MOVAPSrm addr:$src)>;
636 def : Pat<(loadv2i64 addr:$src),
637 (MOVUPSrm addr:$src)>;
638 def : Pat<(loadv4i32 addr:$src),
639 (MOVUPSrm addr:$src)>;
640 def : Pat<(loadv8i16 addr:$src),
641 (MOVUPSrm addr:$src)>;
642 def : Pat<(loadv16i8 addr:$src),
643 (MOVUPSrm addr:$src)>;
645 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
646 (MOVAPSmr addr:$dst, VR128:$src)>;
647 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
648 (MOVAPSmr addr:$dst, VR128:$src)>;
649 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
650 (MOVAPSmr addr:$dst, VR128:$src)>;
651 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
652 (MOVAPSmr addr:$dst, VR128:$src)>;
653 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
654 (MOVUPSmr addr:$dst, VR128:$src)>;
655 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
656 (MOVUPSmr addr:$dst, VR128:$src)>;
657 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
658 (MOVUPSmr addr:$dst, VR128:$src)>;
659 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
660 (MOVUPSmr addr:$dst, VR128:$src)>;
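// Size note (encoding detail, for illustration): MOVAPS/MOVUPS are encoded as
// 0F 28 / 0F 10, while MOVDQA/MOVDQU need a 66 or F3 prefix (66 0F 6F /
// F3 0F 6F), so selecting the PS forms here saves one byte per instruction;
// the execution-domain pass converts them back to the integer domain when
// that is profitable.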
663 //===----------------------------------------------------------------------===//
664 // SSE 1 & 2 - Move Low packed FP Instructions
665 //===----------------------------------------------------------------------===//
667 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
668 string base_opc, string asm_opr> {
669 // No pattern as they need to be special-cased between high and low.
670 let hasSideEffects = 0, mayLoad = 1 in
671 def PSrm : PI<opc, MRMSrcMem,
672 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
673 !strconcat(base_opc, "s", asm_opr),
674 [], SSEPackedSingle>, PS,
675 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
677 def PDrm : PI<opc, MRMSrcMem,
678 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
679 !strconcat(base_opc, "d", asm_opr),
680 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
681 (scalar_to_vector (loadf64 addr:$src2)))))],
682 SSEPackedDouble>, PD,
683 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
686 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
688 let Predicates = [UseAVX] in
689 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
690 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
693 let Constraints = "$src1 = $dst" in
694 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
695 "\t{$src2, $dst|$dst, $src2}">;
698 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
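// Roughly (for orientation), this defm produces MOVLPSrm/MOVLPDrm and the
// VEX forms VMOVLPSrm/VMOVLPDrm: a 64-bit load into the low half of the
// destination, with the high half taken from $src1, e.g.
//   movlps (%rax), %xmm0    # xmm0[63:0] = mem64, xmm0[127:64] unchanged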
700 let SchedRW = [WriteFStore] in {
701 let Predicates = [UseAVX] in {
702 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703 "movlps\t{$src, $dst|$dst, $src}",
704 [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
705 (iPTR 0))), addr:$dst)]>,
707 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
708 "movlpd\t{$src, $dst|$dst, $src}",
709 [(store (f64 (extractelt (v2f64 VR128:$src),
710 (iPTR 0))), addr:$dst)]>,
713 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
714 "movlps\t{$src, $dst|$dst, $src}",
715 [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
716 (iPTR 0))), addr:$dst)]>;
717 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
718 "movlpd\t{$src, $dst|$dst, $src}",
719 [(store (f64 (extractelt (v2f64 VR128:$src),
720 (iPTR 0))), addr:$dst)]>;
723 let Predicates = [UseSSE1] in {
724 // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
725 def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
726 (iPTR 0))), addr:$src1),
727 (MOVLPSmr addr:$src1, VR128:$src2)>;
729 // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
730 // end up with a movsd or blend instead of shufp.
731 // No need for an aligned load; we're only loading 64 bits.
732 def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
733 (MOVLPSrm VR128:$src1, addr:$src2)>;
736 //===----------------------------------------------------------------------===//
737 // SSE 1 & 2 - Move Hi packed FP Instructions
738 //===----------------------------------------------------------------------===//
740 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
742 let SchedRW = [WriteFStore] in {
743 // v2f64 extract element 1 is always custom lowered to unpack high to low
744 // and extract element 0, so the non-store version isn't too horrible.
745 let Predicates = [UseAVX] in {
746 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
747 "movhps\t{$src, $dst|$dst, $src}",
748 [(store (f64 (extractelt
749 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
750 (bc_v2f64 (v4f32 VR128:$src))),
751 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
752 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
753 "movhpd\t{$src, $dst|$dst, $src}",
754 [(store (f64 (extractelt
755 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
756 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
758 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
759 "movhps\t{$src, $dst|$dst, $src}",
760 [(store (f64 (extractelt
761 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
762 (bc_v2f64 (v4f32 VR128:$src))),
763 (iPTR 0))), addr:$dst)]>;
764 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
765 "movhpd\t{$src, $dst|$dst, $src}",
766 [(store (f64 (extractelt
767 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
768 (iPTR 0))), addr:$dst)]>;
771 let Predicates = [UseAVX] in {
772 // Also handle an i64 load because that may get selected as a faster way to
773 // load the data.
774 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
775 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
776 (VMOVHPDrm VR128:$src1, addr:$src2)>;
778 def : Pat<(store (f64 (extractelt
779 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
780 (iPTR 0))), addr:$dst),
781 (VMOVHPDmr addr:$dst, VR128:$src)>;
784 let Predicates = [UseSSE1] in {
785 // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
786 // end up with a movsd or blend instead of shufp.
787 // No need for an aligned load; we're only loading 64 bits.
788 def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
789 (MOVHPSrm VR128:$src1, addr:$src2)>;
792 let Predicates = [UseSSE2] in {
795 // Also handle an i64 load because that may get selected as a faster way to
796 // load the data.
797 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
798 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
799 (MOVHPDrm VR128:$src1, addr:$src2)>;
801 def : Pat<(store (f64 (extractelt
802 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
803 (iPTR 0))), addr:$dst),
804 (MOVHPDmr addr:$dst, VR128:$src)>;
807 //===----------------------------------------------------------------------===//
808 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
809 //===----------------------------------------------------------------------===//
811 let Predicates = [UseAVX] in {
812 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
813 (ins VR128:$src1, VR128:$src2),
814 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
816 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
817 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
818 let isCommutable = 1 in
819 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
820 (ins VR128:$src1, VR128:$src2),
821 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
823 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
824 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
827 let Constraints = "$src1 = $dst" in {
828 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
829 (ins VR128:$src1, VR128:$src2),
830 "movlhps\t{$src2, $dst|$dst, $src2}",
832 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
833 Sched<[SchedWriteFShuffle.XMM]>;
834 let isCommutable = 1 in
835 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
836 (ins VR128:$src1, VR128:$src2),
837 "movhlps\t{$src2, $dst|$dst, $src2}",
839 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
840 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
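// Semantics reminder (illustrative):
//   movlhps %xmm1, %xmm0    # xmm0[127:64] = xmm1[63:0],  xmm0[63:0] unchanged
//   movhlps %xmm1, %xmm0    # xmm0[63:0]  = xmm1[127:64], xmm0[127:64] unchanged
// The VEX forms take the "unchanged" half from $src1 instead of the tied
// destination.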
843 //===----------------------------------------------------------------------===//
844 // SSE 1 & 2 - Conversion Instructions
845 //===----------------------------------------------------------------------===//
847 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
848 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
849 string asm, X86FoldableSchedWrite sched,
850 SchedRead Int2Fpu = ReadDefault> {
851 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
852 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
853 Sched<[sched, Int2Fpu]>;
854 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
855 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
856 Sched<[sched.Folded]>;
859 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
860 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
861 string asm, Domain d, X86FoldableSchedWrite sched> {
862 let hasSideEffects = 0 in {
863 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
864 [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
867 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
868 [(set RC:$dst, (DstTy (sint_to_fp
869 (SrcTy (ld_frag addr:$src)))))], d>,
870 Sched<[sched.Folded]>;
874 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
875 X86MemOperand x86memop, string asm,
876 X86FoldableSchedWrite sched> {
877 let hasSideEffects = 0, Predicates = [UseAVX] in {
878 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
879 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
880 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
882 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
883 (ins DstRC:$src1, x86memop:$src),
884 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
885 Sched<[sched.Folded, sched.ReadAfterFold]>;
886 } // hasSideEffects = 0
889 let Predicates = [UseAVX] in {
890 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
891 "cvttss2si\t{$src, $dst|$dst, $src}",
894 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
895 "cvttss2si\t{$src, $dst|$dst, $src}",
897 XS, VEX, VEX_W, VEX_LIG;
898 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
899 "cvttsd2si\t{$src, $dst|$dst, $src}",
902 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
903 "cvttsd2si\t{$src, $dst|$dst, $src}",
905 XD, VEX, VEX_W, VEX_LIG;
907 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
908 (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
909 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
910 (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
911 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
912 (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
913 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
914 (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
915 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
916 (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
917 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
918 (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
919 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
920 (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
921 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
922 (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
924 // The assembler can recognize rr 64-bit instructions by seeing a 64-bit
925 // register, but the same isn't true when only using memory operands, so
926 // provide the separate "l" and "q" assembly forms to address this explicitly
927 // where it is appropriate to do so.
928 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
929 WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
930 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
931 WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
932 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
933 WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
934 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
935 WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
937 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
938 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
939 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
940 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
942 let Predicates = [UseAVX] in {
943 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
944 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
945 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
946 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
947 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
948 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
949 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
950 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
952 def : Pat<(f32 (sint_to_fp GR32:$src)),
953 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
954 def : Pat<(f32 (sint_to_fp GR64:$src)),
955 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
956 def : Pat<(f64 (sint_to_fp GR32:$src)),
957 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
958 def : Pat<(f64 (sint_to_fp GR64:$src)),
959 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
962 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
963 "cvttss2si\t{$src, $dst|$dst, $src}",
965 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
966 "cvttss2si\t{$src, $dst|$dst, $src}",
967 WriteCvtSS2I>, XS, REX_W;
968 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
969 "cvttsd2si\t{$src, $dst|$dst, $src}",
971 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
972 "cvttsd2si\t{$src, $dst|$dst, $src}",
973 WriteCvtSD2I>, XD, REX_W;
974 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
975 "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
976 WriteCvtI2SS, ReadInt2Fpu>, XS;
977 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
978 "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
979 WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
980 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
981 "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
982 WriteCvtI2SD, ReadInt2Fpu>, XD;
983 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
984 "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
985 WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
987 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
988 (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
989 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
990 (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
991 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
992 (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
993 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
994 (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
995 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
996 (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
997 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
998 (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
999 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1000 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
1001 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1002 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
1004 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1005 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
1006 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1007 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
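// Example of the ambiguity the aliases above resolve (illustrative AT&T
// syntax): "cvtsi2ssl (%rax), %xmm0" converts a 32-bit integer from memory
// and "cvtsi2ssq (%rax), %xmm0" a 64-bit one. With a register source the
// width is implied by the register name, but a bare "cvtsi2ss (%rax), %xmm0"
// does not say how much to load, so the aliases default it to the 32-bit
// ("l") form.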
1009 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
1010 // and/or XMM operand(s).
1012 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1013 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1014 Operand memop, ComplexPattern mem_cpat, string asm,
1015 X86FoldableSchedWrite sched> {
1016 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1017 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1018 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1020 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1021 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1022 [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
1023 Sched<[sched.Folded]>;
1026 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1027 RegisterClass DstRC, X86MemOperand x86memop,
1028 string asm, X86FoldableSchedWrite sched,
1030 let hasSideEffects = 0 in {
1031 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1033 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1034 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1035 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1037 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1038 (ins DstRC:$src1, x86memop:$src2),
1040 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1041 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1042 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1046 let Predicates = [UseAVX] in {
1047 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1048 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1049 WriteCvtSD2I>, XD, VEX, VEX_LIG;
1050 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1051 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1052 WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
1054 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1055 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
1056 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1057 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
1060 let isCodeGenOnly = 1 in {
1061 let Predicates = [UseAVX] in {
1062 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1063 i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
1064 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1065 i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
1066 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1067 i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
1068 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1069 i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
1071 let Constraints = "$src1 = $dst" in {
1072 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1073 i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
1074 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1075 i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
1076 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1077 i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
1078 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1079 i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
1081 } // isCodeGenOnly = 1
1085 // Aliases for intrinsics
1086 let isCodeGenOnly = 1 in {
1087 let Predicates = [UseAVX] in {
1088 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1089 ssmem, sse_load_f32, "cvttss2si",
1090 WriteCvtSS2I>, XS, VEX;
1091 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1092 X86cvtts2Int, ssmem, sse_load_f32,
1093 "cvttss2si", WriteCvtSS2I>,
1095 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1096 sdmem, sse_load_f64, "cvttsd2si",
1097 WriteCvtSS2I>, XD, VEX;
1098 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1099 X86cvtts2Int, sdmem, sse_load_f64,
1100 "cvttsd2si", WriteCvtSS2I>,
1103 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1104 ssmem, sse_load_f32, "cvttss2si",
1106 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1107 X86cvtts2Int, ssmem, sse_load_f32,
1108 "cvttss2si", WriteCvtSS2I>, XS, REX_W;
1109 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1110 sdmem, sse_load_f64, "cvttsd2si",
1112 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1113 X86cvtts2Int, sdmem, sse_load_f64,
1114 "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
1115 } // isCodeGenOnly = 1
1117 let Predicates = [UseAVX] in {
1118 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1119 ssmem, sse_load_f32, "cvtss2si",
1120 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1121 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1122 ssmem, sse_load_f32, "cvtss2si",
1123 WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
1125 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1126 ssmem, sse_load_f32, "cvtss2si",
1128 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1129 ssmem, sse_load_f32, "cvtss2si",
1130 WriteCvtSS2I>, XS, REX_W;
1132 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1133 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1134 SSEPackedSingle, WriteCvtI2PS>,
1135 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1136 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1137 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1138 SSEPackedSingle, WriteCvtI2PSY>,
1139 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1141 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1142 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1143 SSEPackedSingle, WriteCvtI2PS>,
1144 PS, Requires<[UseSSE2]>;
1147 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1148 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1149 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1150 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1151 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1152 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1153 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1154 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1155 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1156 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1157 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1158 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1159 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1160 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1161 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1162 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1165 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1166 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1167 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1168 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1169 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1170 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1171 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1172 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1173 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1174 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1175 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1176 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1177 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1178 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1179 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1180 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1184 // Convert scalar double to scalar single
1185 let hasSideEffects = 0, Predicates = [UseAVX] in {
1186 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1187 (ins FR32:$src1, FR64:$src2),
1188 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1189 VEX_4V, VEX_LIG, VEX_WIG,
1190 Sched<[WriteCvtSD2SS]>;
1192 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1193 (ins FR32:$src1, f64mem:$src2),
1194 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1195 XD, VEX_4V, VEX_LIG, VEX_WIG,
1196 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1199 def : Pat<(f32 (fpround FR64:$src)),
1200 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1203 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1204 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1205 [(set FR32:$dst, (fpround FR64:$src))]>,
1206 Sched<[WriteCvtSD2SS]>;
1207 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1208 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1209 [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
1210 XD, Requires<[UseSSE2, OptForSize]>,
1211 Sched<[WriteCvtSD2SS.Folded]>;
1213 let isCodeGenOnly = 1 in {
1214 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1215 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1216 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1218 (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
1219 XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1220 Sched<[WriteCvtSD2SS]>;
1221 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1222 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1223 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1224 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1225 VR128:$src1, sse_load_f64:$src2))]>,
1226 XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1227 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1228 let Constraints = "$src1 = $dst" in {
1229 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1230 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1231 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1233 (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
1234 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1235 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1236 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1237 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1238 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1239 VR128:$src1, sse_load_f64:$src2))]>,
1240 XD, Requires<[UseSSE2]>,
1241 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1243 } // isCodeGenOnly = 1
1245 // Convert scalar single to scalar double
1246 // SSE2 instructions with XS prefix
1247 let hasSideEffects = 0 in {
1248 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1249 (ins FR64:$src1, FR32:$src2),
1250 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1251 XS, VEX_4V, VEX_LIG, VEX_WIG,
1252 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
1254 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1255 (ins FR64:$src1, f32mem:$src2),
1256 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1257 XS, VEX_4V, VEX_LIG, VEX_WIG,
1258 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1259 Requires<[UseAVX, OptForSize]>;
1262 def : Pat<(f64 (fpextend FR32:$src)),
1263 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1264 def : Pat<(fpextend (loadf32 addr:$src)),
1265 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1267 def : Pat<(extloadf32 addr:$src),
1268 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
1269 Requires<[UseAVX, OptForSize]>;
1270 def : Pat<(extloadf32 addr:$src),
1271 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
1272 Requires<[UseAVX, OptForSpeed]>;
1274 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1275 "cvtss2sd\t{$src, $dst|$dst, $src}",
1276 [(set FR64:$dst, (fpextend FR32:$src))]>,
1277 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
1278 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1279 "cvtss2sd\t{$src, $dst|$dst, $src}",
1280 [(set FR64:$dst, (extloadf32 addr:$src))]>,
1281 XS, Requires<[UseSSE2, OptForSize]>,
1282 Sched<[WriteCvtSS2SD.Folded]>;
1284 // extload f32 -> f64. This matches load+fpextend because we have a hack in
1285 // the isel (PreprocessForFPConvert) that can introduce loads after dag
1286 // combine.
1287 // Since these loads aren't folded into the fpextend, we have to match it
1288 // explicitly here.
1289 def : Pat<(fpextend (loadf32 addr:$src)),
1290 (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
1291 def : Pat<(extloadf32 addr:$src),
1292 (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1294 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1295 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1296 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1297 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1298 []>, XS, VEX_4V, VEX_WIG,
1299 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1301 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1302 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1303 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1304 []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1305 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1306 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1307 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1308 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1309 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1310 []>, XS, Requires<[UseSSE2]>,
1311 Sched<[WriteCvtSS2SD]>;
1313 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1314 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1315 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1316 []>, XS, Requires<[UseSSE2]>,
1317 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1319 } // isCodeGenOnly = 1
1321 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1322 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1323 // vmovs{s,d} instructions
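// Illustration (assumed front-end behaviour): clang lowers e.g.
// _mm_cvtsi32_ss(v, i) to "insert the converted scalar into element 0 of v",
// which reaches instruction selection as an X86Movss of the original vector
// with a scalar_to_vector of the converted value. The patterns below fold
// that whole sequence into the _Int form of the conversion so no separate
// (v)movss/(v)movsd is emitted.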
1324 let Predicates = [UseAVX] in {
1325 def : Pat<(v4f32 (X86Movss
1326 (v4f32 VR128:$dst),
1327 (v4f32 (scalar_to_vector
1328 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1329 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1331 def : Pat<(v2f64 (X86Movsd
1332 (v2f64 VR128:$dst),
1333 (v2f64 (scalar_to_vector
1334 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1335 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1337 def : Pat<(v4f32 (X86Movss
1338 (v4f32 VR128:$dst),
1339 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1340 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1342 def : Pat<(v4f32 (X86Movss
1343 (v4f32 VR128:$dst),
1344 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1345 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1347 def : Pat<(v4f32 (X86Movss
1348 (v4f32 VR128:$dst),
1349 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1350 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1352 def : Pat<(v4f32 (X86Movss
1353 (v4f32 VR128:$dst),
1354 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1355 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1357 def : Pat<(v2f64 (X86Movsd
1358 (v2f64 VR128:$dst),
1359 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1360 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1362 def : Pat<(v2f64 (X86Movsd
1363 (v2f64 VR128:$dst),
1364 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1365 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1367 def : Pat<(v2f64 (X86Movsd
1368 (v2f64 VR128:$dst),
1369 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1370 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1372 def : Pat<(v2f64 (X86Movsd
1373 (v2f64 VR128:$dst),
1374 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1375 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1376 } // Predicates = [UseAVX]
1378 let Predicates = [UseSSE2] in {
1379 def : Pat<(v4f32 (X86Movss
1380 (v4f32 VR128:$dst),
1381 (v4f32 (scalar_to_vector
1382 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1383 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1385 def : Pat<(v2f64 (X86Movsd
1386 (v2f64 VR128:$dst),
1387 (v2f64 (scalar_to_vector
1388 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1389 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1391 def : Pat<(v2f64 (X86Movsd
1392 (v2f64 VR128:$dst),
1393 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1394 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1396 def : Pat<(v2f64 (X86Movsd
1397 (v2f64 VR128:$dst),
1398 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1399 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1401 def : Pat<(v2f64 (X86Movsd
1402 (v2f64 VR128:$dst),
1403 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1404 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1406 def : Pat<(v2f64 (X86Movsd
1407 (v2f64 VR128:$dst),
1408 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1409 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1410 } // Predicates = [UseSSE2]
1412 let Predicates = [UseSSE1] in {
1413 def : Pat<(v4f32 (X86Movss
1414 (v4f32 VR128:$dst),
1415 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1416 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1418 def : Pat<(v4f32 (X86Movss
1419 (v4f32 VR128:$dst),
1420 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1421 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1423 def : Pat<(v4f32 (X86Movss
1424 (v4f32 VR128:$dst),
1425 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1426 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1428 def : Pat<(v4f32 (X86Movss
1429 (v4f32 VR128:$dst),
1430 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1431 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1432 } // Predicates = [UseSSE1]
1434 let Predicates = [HasAVX, NoVLX] in {
1435 // Convert packed single/double fp to doubleword
1436 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1437 "cvtps2dq\t{$src, $dst|$dst, $src}",
1438 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1439 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1440 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1441 "cvtps2dq\t{$src, $dst|$dst, $src}",
1442 [(set VR128:$dst,
1443 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1444 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1445 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1446 "cvtps2dq\t{$src, $dst|$dst, $src}",
1447 [(set VR256:$dst,
1448 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1449 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1450 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1451 "cvtps2dq\t{$src, $dst|$dst, $src}",
1452 [(set VR256:$dst,
1453 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1454 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1455 }
1456 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1457 "cvtps2dq\t{$src, $dst|$dst, $src}",
1458 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1459 Sched<[WriteCvtPS2I]>;
1460 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1461 "cvtps2dq\t{$src, $dst|$dst, $src}",
1462 [(set VR128:$dst,
1463 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1464 Sched<[WriteCvtPS2ILd]>;
1467 // Convert Packed Double FP to Packed DW Integers
1468 let Predicates = [HasAVX, NoVLX] in {
1469 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1470 // register, but the same isn't true when using memory operands instead.
1471 // Provide other assembly rr and rm forms to address this explicitly.
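// For example, "vcvtpd2dq (%rax), %xmm0" leaves the width of the memory
// operand (128 vs. 256 bits) ambiguous, so the {x}/{y} mnemonic suffixes and
// the InstAliases below let both forms be written explicitly.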
1472 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1473 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1474 [(set VR128:$dst,
1475 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1476 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1479 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1480 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1481 [(set VR128:$dst,
1482 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1483 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1486 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1487 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1488 [(set VR128:$dst,
1489 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1490 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1491 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1492 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1493 [(set VR128:$dst,
1494 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1495 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1496 }
1498 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1499 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
1500 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1501 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
1502 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1503 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
1504 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1505 (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
1507 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1508 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1509 [(set VR128:$dst,
1510 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1511 Sched<[WriteCvtPD2ILd]>;
1512 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1513 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1514 [(set VR128:$dst,
1515 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1516 Sched<[WriteCvtPD2I]>;
1518 // Convert with truncation packed single/double fp to doubleword
1519 // SSE2 packed instructions with XS prefix
1520 let Predicates = [HasAVX, NoVLX] in {
1521 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1522 "cvttps2dq\t{$src, $dst|$dst, $src}",
1523 [(set VR128:$dst,
1524 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1525 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1526 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1527 "cvttps2dq\t{$src, $dst|$dst, $src}",
1528 [(set VR128:$dst,
1529 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
1530 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1531 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1532 "cvttps2dq\t{$src, $dst|$dst, $src}",
1533 [(set VR256:$dst,
1534 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
1535 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1536 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1537 "cvttps2dq\t{$src, $dst|$dst, $src}",
1538 [(set VR256:$dst,
1539 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
1540 VEX, VEX_L,
1541 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1542 }
1544 let Predicates = [HasAVX, NoVLX] in {
1545 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
1546 (VCVTTPS2DQrr VR128:$src)>;
1547 def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
1548 (VCVTTPS2DQrm addr:$src)>;
1549 def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
1550 (VCVTTPS2DQYrr VR256:$src)>;
1551 def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
1552 (VCVTTPS2DQYrm addr:$src)>;
1553 }
1555 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1556 "cvttps2dq\t{$src, $dst|$dst, $src}",
1557 [(set VR128:$dst,
1558 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1559 Sched<[WriteCvtPS2I]>;
1560 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1561 "cvttps2dq\t{$src, $dst|$dst, $src}",
1562 [(set VR128:$dst,
1563 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
1564 Sched<[WriteCvtPS2ILd]>;
1566 let Predicates = [UseSSE2] in {
1567 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
1568 (CVTTPS2DQrr VR128:$src)>;
1569 def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
1570 (CVTTPS2DQrm addr:$src)>;
1571 }
1573 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1574 // register, but the same isn't true when using memory operands instead.
1575 // Provide other assembly rr and rm forms to address this explicitly.
1576 let Predicates = [HasAVX, NoVLX] in {
1578 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1579 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1580 [(set VR128:$dst,
1581 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1582 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1583 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1584 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1585 [(set VR128:$dst,
1586 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
1587 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1590 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1591 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1592 [(set VR128:$dst,
1593 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
1594 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1595 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1596 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1597 [(set VR128:$dst,
1598 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
1599 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1600 } // Predicates = [HasAVX, NoVLX]
1602 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1603 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
1604 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1605 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
1606 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1607 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
1608 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1609 (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
1611 let Predicates = [HasAVX, NoVLX] in {
1612 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
1613 (VCVTTPD2DQYrr VR256:$src)>;
1614 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
1615 (VCVTTPD2DQYrm addr:$src)>;
1616 }
1618 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1619 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1620 [(set VR128:$dst,
1621 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1622 Sched<[WriteCvtPD2I]>;
1623 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1624 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1625 [(set VR128:$dst,
1626 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
1627 Sched<[WriteCvtPD2ILd]>;
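// Note that every cvt(t)pd2dq form above writes a 128-bit destination: even a
// 256-bit (four double) source produces only four doublewords, so the YMM
// variants still define an xmm register result.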
1629 // Convert packed single to packed double
1630 let Predicates = [HasAVX, NoVLX] in {
1631 // SSE2 instructions without OpSize prefix
1632 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1633 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1634 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1635 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1636 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1637 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1638 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1639 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1640 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1641 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1642 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
1643 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1644 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1645 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1646 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1647 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1648 }
1650 let Predicates = [UseSSE2] in {
1651 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1652 "cvtps2pd\t{$src, $dst|$dst, $src}",
1653 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1654 PS, Sched<[WriteCvtPS2PD]>;
1655 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1656 "cvtps2pd\t{$src, $dst|$dst, $src}",
1657 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1658 PS, Sched<[WriteCvtPS2PD.Folded]>;
1659 }
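// cvtps2pd only widens the two low single precision elements of its source,
// which is why the 128-bit memory forms above load through f64mem (64 bits)
// rather than a full 128-bit operand.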
1661 // Convert Packed DW Integers to Packed Double FP
1662 let Predicates = [HasAVX, NoVLX] in {
1663 let hasSideEffects = 0, mayLoad = 1 in
1664 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1665 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1666 [(set VR128:$dst,
1667 (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
1668 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1669 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1670 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1671 [(set VR128:$dst,
1672 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1673 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1674 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1675 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1676 [(set VR256:$dst,
1677 (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
1678 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1679 VEX_WIG;
1680 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1681 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1682 [(set VR256:$dst,
1683 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
1684 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1685 }
1687 let hasSideEffects = 0, mayLoad = 1 in
1688 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1689 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1690 [(set VR128:$dst,
1691 (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
1692 Sched<[WriteCvtI2PDLd]>;
1693 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1694 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1695 [(set VR128:$dst,
1696 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1697 Sched<[WriteCvtI2PD]>;
1699 // AVX register conversion intrinsics
1700 let Predicates = [HasAVX, NoVLX] in {
1701 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
1702 (VCVTDQ2PDrm addr:$src)>;
1703 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
1704 (VCVTDQ2PDrm addr:$src)>;
1705 } // Predicates = [HasAVX, NoVLX]
1707 // SSE2 register conversion intrinsics
1708 let Predicates = [UseSSE2] in {
1709 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
1710 (CVTDQ2PDrm addr:$src)>;
1711 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
1712 (CVTDQ2PDrm addr:$src)>;
1713 } // Predicates = [UseSSE2]
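// These patterns allow a 64-bit (two element) integer load to feed cvtdq2pd
// directly: the (scalar_to_vector (loadi64 ...)) and X86vzload forms select
// the rm variants instead of emitting a separate vector load plus convert.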
1715 // Convert packed double to packed single
1716 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1717 // register, but the same isn't true when using memory operands instead.
1718 // Provide other assembly rr and rm forms to address this explicitly.
1719 let Predicates = [HasAVX, NoVLX] in {
1721 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1722 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1723 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1724 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1725 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1726 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1727 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
1728 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1730 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1731 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1732 [(set VR128:$dst, (X86vfpround VR256:$src))]>,
1733 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1734 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1735 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1736 [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
1737 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1738 } // Predicates = [HasAVX, NoVLX]
1740 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1741 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
1742 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1743 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
1744 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1745 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
1746 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1747 (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
1749 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1750 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1751 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1752 Sched<[WriteCvtPD2PS]>;
1753 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1754 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1755 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
1756 Sched<[WriteCvtPD2PS.Folded]>;
1758 let Predicates = [HasAVX, NoVLX] in {
1759 def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
1760 (VCVTPD2PSYrr VR256:$src)>;
1761 def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
1762 (VCVTPD2PSYrm addr:$src)>;
1763 }
1765 //===----------------------------------------------------------------------===//
1766 // SSE 1 & 2 - Compare Instructions
1767 //===----------------------------------------------------------------------===//
1769 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
1770 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1771 Operand CC, SDNode OpNode, ValueType VT,
1772 PatFrag ld_frag, string asm, string asm_alt,
1773 X86FoldableSchedWrite sched> {
1774 let isCommutable = 1 in
1775 def rr : SIi8<0xC2, MRMSrcReg,
1776 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
1777 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
1778 Sched<[sched]>;
1779 def rm : SIi8<0xC2, MRMSrcMem,
1780 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
1781 [(set RC:$dst, (OpNode (VT RC:$src1),
1782 (ld_frag addr:$src2), imm:$cc))]>,
1783 Sched<[sched.Folded, sched.ReadAfterFold]>;
1785 // Accept explicit immediate argument form instead of comparison code.
1786 let isAsmParserOnly = 1, hasSideEffects = 0 in {
1787 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
1788 (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
1789 Sched<[sched]>, NotMemoryFoldable;
1791 def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
1792 (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
1793 Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
1794 }
1795 }
1797 let ExeDomain = SSEPackedSingle in
1798 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
1799 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1800 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1801 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1802 let ExeDomain = SSEPackedDouble in
1803 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
1804 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1805 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1806 SchedWriteFCmpSizes.PD.Scl>,
1807 XD, VEX_4V, VEX_LIG, VEX_WIG;
1809 let Constraints = "$src1 = $dst" in {
1810 let ExeDomain = SSEPackedSingle in
1811 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
1812 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
1813 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1814 SchedWriteFCmpSizes.PS.Scl>, XS;
1815 let ExeDomain = SSEPackedDouble in
1816 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
1817 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
1818 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1819 SchedWriteFCmpSizes.PD.Scl>, XD;
1820 }
1822 multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
1823 Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1824 ComplexPattern mem_cpat> {
1825 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1826 (ins VR128:$src1, VR128:$src, CC:$cc), asm,
1827 [(set VR128:$dst, (Int VR128:$src1,
1828 VR128:$src, imm:$cc))]>,
1829 Sched<[sched]>;
1831 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1832 (ins VR128:$src1, memop:$src, CC:$cc), asm,
1833 [(set VR128:$dst, (Int VR128:$src1,
1834 mem_cpat:$src, imm:$cc))]>,
1835 Sched<[sched.Folded, sched.ReadAfterFold]>;
1836 }
1838 let isCodeGenOnly = 1 in {
1839 // Aliases to match intrinsics which expect XMM operand(s).
1840 let ExeDomain = SSEPackedSingle in
1841 defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
1842 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1843 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
1844 let ExeDomain = SSEPackedDouble in
1845 defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
1846 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1847 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1848 XD, VEX_4V;
1849 let Constraints = "$src1 = $dst" in {
1850 let ExeDomain = SSEPackedSingle in
1851 defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
1852 "cmp${cc}ss\t{$src, $dst|$dst, $src}",
1853 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1854 let ExeDomain = SSEPackedDouble in
1855 defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
1856 "cmp${cc}sd\t{$src, $dst|$dst, $src}",
1857 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1858 }
1859 } // isCodeGenOnly = 1
1862 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
1863 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1864 ValueType vt, X86MemOperand x86memop,
1865 PatFrag ld_frag, string OpcodeStr,
1866 X86FoldableSchedWrite sched> {
1867 let hasSideEffects = 0 in {
1868 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1869 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1870 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1871 Sched<[sched]>;
1873 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1874 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1875 [(set EFLAGS, (OpNode (vt RC:$src1),
1876 (ld_frag addr:$src2)))]>,
1877 Sched<[sched.Folded, sched.ReadAfterFold]>;
1878 }
1879 }
1881 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
1882 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1883 ValueType vt, Operand memop,
1884 ComplexPattern mem_cpat, string OpcodeStr,
1885 X86FoldableSchedWrite sched> {
1886 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1887 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1888 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1889 Sched<[sched]>;
1891 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1892 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1893 [(set EFLAGS, (OpNode (vt RC:$src1),
1894 mem_cpat:$src2))]>,
1895 Sched<[sched.Folded, sched.ReadAfterFold]>;
1896 }
1898 let Defs = [EFLAGS] in {
1899 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1900 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1901 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1902 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1903 let Pattern = []<dag> in {
1904 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1905 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1906 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1907 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1908 }
1910 let isCodeGenOnly = 1 in {
1911 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1912 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
1913 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1914 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
1916 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1917 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
1918 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1919 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
1920 }
1921 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1922 "ucomiss", WriteFCom>, PS;
1923 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1924 "ucomisd", WriteFCom>, PD;
1926 let Pattern = []<dag> in {
1927 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1928 "comiss", WriteFCom>, PS;
1929 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1930 "comisd", WriteFCom>, PD;
1931 }
1933 let isCodeGenOnly = 1 in {
1934 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1935 sse_load_f32, "ucomiss", WriteFCom>, PS;
1936 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1937 sse_load_f64, "ucomisd", WriteFCom>, PD;
1939 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1940 sse_load_f32, "comiss", WriteFCom>, PS;
1941 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1942 sse_load_f64, "comisd", WriteFCom>, PD;
1943 }
1944 } // Defs = [EFLAGS]
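// Both comis* and ucomis* compare the low scalar elements and write ZF, PF
// and CF directly; the unordered (ucomis*) forms differ only in raising the
// invalid exception solely for signaling NaNs, while comis* also raises it
// for quiet NaNs.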
1946 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
1947 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1948 Operand CC, ValueType VT, string asm,
1949 string asm_alt, X86FoldableSchedWrite sched,
1950 Domain d, PatFrag ld_frag> {
1951 let isCommutable = 1 in
1952 def rri : PIi8<0xC2, MRMSrcReg,
1953 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
1954 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
1955 Sched<[sched]>;
1956 def rmi : PIi8<0xC2, MRMSrcMem,
1957 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
1958 [(set RC:$dst,
1959 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
1960 Sched<[sched.Folded, sched.ReadAfterFold]>;
1962 // Accept explicit immediate argument form instead of comparison code.
1963 let isAsmParserOnly = 1, hasSideEffects = 0 in {
1964 def rri_alt : PIi8<0xC2, MRMSrcReg,
1965 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
1966 asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
1968 def rmi_alt : PIi8<0xC2, MRMSrcMem,
1969 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
1970 asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>,
1971 NotMemoryFoldable;
1972 }
1973 }
1975 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
1976 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1977 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1978 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1979 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
1980 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1981 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1982 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1983 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
1984 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1985 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1986 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1987 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
1988 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1989 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1990 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
1991 let Constraints = "$src1 = $dst" in {
1992 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
1993 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
1994 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1995 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1996 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
1997 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
1998 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1999 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
2000 }
2002 def CommutableCMPCC : PatLeaf<(imm), [{
2003 uint64_t Imm = N->getZExtValue() & 0x7;
2004 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
2005 }]>;
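// Immediates 0, 3, 4 and 7 encode EQ, UNORD, NEQ and ORD, the only predicates
// that are symmetric in their operands, so a compare whose first operand is a
// load can be handled by swapping the operands (see the patterns below).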
2007 // Patterns to select compares with loads in first operand.
2008 let Predicates = [HasAVX] in {
2009 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
2010 CommutableCMPCC:$cc)),
2011 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
2013 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
2014 CommutableCMPCC:$cc)),
2015 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
2017 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
2018 CommutableCMPCC:$cc)),
2019 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
2021 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
2022 CommutableCMPCC:$cc)),
2023 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
2025 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2026 CommutableCMPCC:$cc)),
2027 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
2029 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2030 CommutableCMPCC:$cc)),
2031 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
2032 }
2034 let Predicates = [UseSSE2] in {
2035 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
2036 CommutableCMPCC:$cc)),
2037 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
2039 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2040 CommutableCMPCC:$cc)),
2041 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
2042 }
2044 let Predicates = [UseSSE1] in {
2045 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
2046 CommutableCMPCC:$cc)),
2047 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
2049 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2050 CommutableCMPCC:$cc)),
2051 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
2052 }
2054 //===----------------------------------------------------------------------===//
2055 // SSE 1 & 2 - Shuffle Instructions
2056 //===----------------------------------------------------------------------===//
2058 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2059 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2060 ValueType vt, string asm, PatFrag mem_frag,
2061 X86FoldableSchedWrite sched, Domain d> {
2062 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2063 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2064 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2065 (i8 imm:$src3))))], d>,
2066 Sched<[sched.Folded, sched.ReadAfterFold]>;
2067 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2068 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2069 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2070 (i8 imm:$src3))))], d>,
2071 Sched<[sched]>;
2072 }
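// The 8-bit immediate selects the shuffle: for shufps each two-bit field picks
// one element, with the two low result elements taken from $src1 and the two
// high ones from $src2; shufpd uses one bit per result element the same way.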
2074 let Predicates = [HasAVX, NoVLX] in {
2075 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2076 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2077 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2078 PS, VEX_4V, VEX_WIG;
2079 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2080 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2081 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2082 PS, VEX_4V, VEX_L, VEX_WIG;
2083 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2084 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2085 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2086 PD, VEX_4V, VEX_WIG;
2087 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2088 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2089 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2090 PD, VEX_4V, VEX_L, VEX_WIG;
2091 }
2092 let Constraints = "$src1 = $dst" in {
2093 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2094 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2095 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2096 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2097 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2098 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2099 }
2101 //===----------------------------------------------------------------------===//
2102 // SSE 1 & 2 - Unpack FP Instructions
2103 //===----------------------------------------------------------------------===//
2105 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2106 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2107 PatFrag mem_frag, RegisterClass RC,
2108 X86MemOperand x86memop, string asm,
2109 X86FoldableSchedWrite sched, Domain d,
2110 bit IsCommutable = 0> {
2111 let isCommutable = IsCommutable in
2112 def rr : PI<opc, MRMSrcReg,
2113 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2114 asm, [(set RC:$dst,
2115 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2116 Sched<[sched]>;
2117 def rm : PI<opc, MRMSrcMem,
2118 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2119 asm, [(set RC:$dst,
2120 (vt (OpNode RC:$src1,
2121 (mem_frag addr:$src2))))], d>,
2122 Sched<[sched.Folded, sched.ReadAfterFold]>;
2123 }
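// unpckl interleaves the low halves of the two sources (unpcklps produces
// { a0, b0, a1, b1 }) and unpckh the high halves ({ a2, b2, a3, b3 }).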
2125 let Predicates = [HasAVX, NoVLX] in {
2126 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2127 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2128 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2129 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2130 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2131 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2132 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2133 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2134 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2135 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2136 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2137 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2139 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2140 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2141 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2142 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2143 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2144 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2145 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2146 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2147 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2148 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2149 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2150 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2151 }// Predicates = [HasAVX, NoVLX]
2153 let Constraints = "$src1 = $dst" in {
2154 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2155 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2156 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2157 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2158 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2159 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2160 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2161 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2162 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2163 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2164 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2165 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2166 } // Constraints = "$src1 = $dst"
2168 let Predicates = [HasAVX1Only] in {
2169 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2170 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2171 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2172 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2173 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2174 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2175 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2176 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2178 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2179 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2180 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2181 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2182 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2183 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2184 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2185 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2186 }
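// AVX1 provides no 256-bit integer unpacks, so with AVX1 alone the v8i32 and
// v4i64 unpack nodes above are lowered to the equivalent floating-point
// unpck{l,h}ps/pd instructions.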
2188 //===----------------------------------------------------------------------===//
2189 // SSE 1 & 2 - Extract Floating-Point Sign mask
2190 //===----------------------------------------------------------------------===//
2192 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2193 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2194 string asm, Domain d> {
2195 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2196 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2197 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2198 Sched<[WriteFMOVMSK]>;
2199 }
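// movmskps/movmskpd gather the sign bit of each packed element into the low
// bits of a general purpose register and zero the remaining destination bits.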
2201 let Predicates = [HasAVX] in {
2202 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2203 SSEPackedSingle>, PS, VEX, VEX_WIG;
2204 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2205 SSEPackedDouble>, PD, VEX, VEX_WIG;
2206 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2207 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2208 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2209 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2211 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2212 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2213 (VMOVMSKPSrr VR128:$src)>;
2214 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2215 (VMOVMSKPDrr VR128:$src)>;
2216 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2217 (VMOVMSKPSYrr VR256:$src)>;
2218 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2219 (VMOVMSKPDYrr VR256:$src)>;
2220 }
2222 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2223 SSEPackedSingle>, PS;
2224 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2225 SSEPackedDouble>, PD;
2227 let Predicates = [UseSSE2] in {
2228 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2229 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2230 (MOVMSKPSrr VR128:$src)>;
2231 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2232 (MOVMSKPDrr VR128:$src)>;
2233 }
2235 //===---------------------------------------------------------------------===//
2236 // SSE2 - Packed Integer Logical Instructions
2237 //===---------------------------------------------------------------------===//
2239 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2241 /// PDI_binop_rm - Simple SSE2 binary operator.
2242 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2243 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2244 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2245 bit IsCommutable, bit Is2Addr> {
2246 let isCommutable = IsCommutable in
2247 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2248 (ins RC:$src1, RC:$src2),
2250 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2251 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2252 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2253 Sched<[sched]>;
2254 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2255 (ins RC:$src1, x86memop:$src2),
2257 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2258 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2259 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2260 Sched<[sched.Folded, sched.ReadAfterFold]>;
2261 }
2262 } // ExeDomain = SSEPackedInt
2264 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2265 ValueType OpVT128, ValueType OpVT256,
2266 X86SchedWriteWidths sched, bit IsCommutable,
2267 Predicate prd> {
2268 let Predicates = [HasAVX, prd] in
2269 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2270 VR128, load, i128mem, sched.XMM,
2271 IsCommutable, 0>, VEX_4V, VEX_WIG;
2273 let Constraints = "$src1 = $dst" in
2274 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2275 memop, i128mem, sched.XMM, IsCommutable, 1>;
2277 let Predicates = [HasAVX2, prd] in
2278 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2279 OpVT256, VR256, load, i256mem, sched.YMM,
2280 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2281 }
2283 // These are ordered here for pattern ordering requirements with the fp versions
2285 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2286 SchedWriteVecLogic, 1, NoVLX>;
2287 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2288 SchedWriteVecLogic, 1, NoVLX>;
2289 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2290 SchedWriteVecLogic, 1, NoVLX>;
2291 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2292 SchedWriteVecLogic, 0, NoVLX>;
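// PANDN is the only non-commutable operation here: it computes
// (NOT $src1) AND $src2, which is what the X86andnp node represents.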
2294 //===----------------------------------------------------------------------===//
2295 // SSE 1 & 2 - Logical Instructions
2296 //===----------------------------------------------------------------------===//
2298 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2300 /// There are no patterns here because isel prefers integer versions for SSE2
2301 /// and later. There are SSE1 v4f32 patterns later.
2302 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2303 SDNode OpNode, X86SchedWriteWidths sched> {
2304 let Predicates = [HasAVX, NoVLX] in {
2305 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2306 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2307 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2309 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2310 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2311 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2313 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2314 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2315 [], [], 0>, PS, VEX_4V, VEX_WIG;
2317 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2318 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2319 [], [], 0>, PD, VEX_4V, VEX_WIG;
2320 }
2322 let Constraints = "$src1 = $dst" in {
2323 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2324 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2325 [], []>, PS;
2327 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2328 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2329 [], []>, PD;
2330 }
2331 }
2333 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2334 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2335 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2336 let isCommutable = 0 in
2337 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
2339 let Predicates = [HasAVX2, NoVLX] in {
2340 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2341 (VPANDYrr VR256:$src1, VR256:$src2)>;
2342 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2343 (VPANDYrr VR256:$src1, VR256:$src2)>;
2344 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2345 (VPANDYrr VR256:$src1, VR256:$src2)>;
2347 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2348 (VPORYrr VR256:$src1, VR256:$src2)>;
2349 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2350 (VPORYrr VR256:$src1, VR256:$src2)>;
2351 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2352 (VPORYrr VR256:$src1, VR256:$src2)>;
2354 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2355 (VPXORYrr VR256:$src1, VR256:$src2)>;
2356 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2357 (VPXORYrr VR256:$src1, VR256:$src2)>;
2358 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2359 (VPXORYrr VR256:$src1, VR256:$src2)>;
2361 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2362 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2363 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2364 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2365 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2366 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2368 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2369 (VPANDYrm VR256:$src1, addr:$src2)>;
2370 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2371 (VPANDYrm VR256:$src1, addr:$src2)>;
2372 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2373 (VPANDYrm VR256:$src1, addr:$src2)>;
2375 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2376 (VPORYrm VR256:$src1, addr:$src2)>;
2377 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2378 (VPORYrm VR256:$src1, addr:$src2)>;
2379 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2380 (VPORYrm VR256:$src1, addr:$src2)>;
2382 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2383 (VPXORYrm VR256:$src1, addr:$src2)>;
2384 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2385 (VPXORYrm VR256:$src1, addr:$src2)>;
2386 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2387 (VPXORYrm VR256:$src1, addr:$src2)>;
2389 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2390 (VPANDNYrm VR256:$src1, addr:$src2)>;
2391 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2392 (VPANDNYrm VR256:$src1, addr:$src2)>;
2393 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2394 (VPANDNYrm VR256:$src1, addr:$src2)>;
2395 }
2397 // If only AVX1 is supported, we need to handle integer operations with
2398 // floating point instructions since the integer versions aren't available.
2399 let Predicates = [HasAVX1Only] in {
2400 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2401 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2402 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2403 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2404 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2405 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2406 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2407 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2409 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2410 (VORPSYrr VR256:$src1, VR256:$src2)>;
2411 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2412 (VORPSYrr VR256:$src1, VR256:$src2)>;
2413 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2414 (VORPSYrr VR256:$src1, VR256:$src2)>;
2415 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2416 (VORPSYrr VR256:$src1, VR256:$src2)>;
2418 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2419 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2420 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2421 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2422 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2423 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2424 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2425 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2427 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2428 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2429 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2430 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2431 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2432 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2433 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2434 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2436 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2437 (VANDPSYrm VR256:$src1, addr:$src2)>;
2438 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2439 (VANDPSYrm VR256:$src1, addr:$src2)>;
2440 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2441 (VANDPSYrm VR256:$src1, addr:$src2)>;
2442 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2443 (VANDPSYrm VR256:$src1, addr:$src2)>;
2445 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2446 (VORPSYrm VR256:$src1, addr:$src2)>;
2447 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2448 (VORPSYrm VR256:$src1, addr:$src2)>;
2449 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2450 (VORPSYrm VR256:$src1, addr:$src2)>;
2451 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2452 (VORPSYrm VR256:$src1, addr:$src2)>;
2454 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2455 (VXORPSYrm VR256:$src1, addr:$src2)>;
2456 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2457 (VXORPSYrm VR256:$src1, addr:$src2)>;
2458 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2459 (VXORPSYrm VR256:$src1, addr:$src2)>;
2460 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2461 (VXORPSYrm VR256:$src1, addr:$src2)>;
2463 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2464 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2465 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2466 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2467 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2468 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2469 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2470 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2471 }
2473 let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
2474 // Use packed logical operations for scalar ops.
2475 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
2476 (COPY_TO_REGCLASS
2477 (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2478 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2479 FR64)>;
2480 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
2481 (COPY_TO_REGCLASS
2482 (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2483 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2484 FR64)>;
2485 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
2486 (COPY_TO_REGCLASS
2487 (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2488 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2489 FR64)>;
2490 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
2491 (COPY_TO_REGCLASS
2492 (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2493 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2494 FR64)>;
2496 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2497 (COPY_TO_REGCLASS
2498 (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2499 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2500 FR32)>;
2501 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2502 (COPY_TO_REGCLASS
2503 (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2504 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2505 FR32)>;
2506 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2507 (COPY_TO_REGCLASS
2508 (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2509 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2510 FR32)>;
2511 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
2512 (COPY_TO_REGCLASS
2513 (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2514 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2515 FR32)>;
2516 }
2518 let Predicates = [UseSSE1] in {
2519 // Use packed logical operations for scalar ops.
2520 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2521 (COPY_TO_REGCLASS
2522 (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2523 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2524 FR32)>;
2525 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2526 (COPY_TO_REGCLASS
2527 (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2528 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2529 FR32)>;
2530 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2531 (COPY_TO_REGCLASS
2532 (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2533 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2534 FR32)>;
2535 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
2536 (COPY_TO_REGCLASS
2537 (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2538 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2539 FR32)>;
2540 }
2542 let Predicates = [UseSSE2] in {
2543 // Use packed logical operations for scalar ops.
2544 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
2545 (COPY_TO_REGCLASS
2546 (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2547 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2548 FR64)>;
2549 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
2550 (COPY_TO_REGCLASS
2551 (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2552 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2553 FR64)>;
2554 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
2555 (COPY_TO_REGCLASS
2556 (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2557 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2558 FR64)>;
2559 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
2560 (COPY_TO_REGCLASS
2561 (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2562 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2563 FR64)>;
2564 }
2566 let Predicates = [HasAVX, NoVLX] in {
2567 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2568 (VPANDrr VR128:$src1, VR128:$src2)>;
2569 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2570 (VPANDrr VR128:$src1, VR128:$src2)>;
2571 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2572 (VPANDrr VR128:$src1, VR128:$src2)>;
2574 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2575 (VPORrr VR128:$src1, VR128:$src2)>;
2576 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2577 (VPORrr VR128:$src1, VR128:$src2)>;
2578 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2579 (VPORrr VR128:$src1, VR128:$src2)>;
2581 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2582 (VPXORrr VR128:$src1, VR128:$src2)>;
2583 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2584 (VPXORrr VR128:$src1, VR128:$src2)>;
2585 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2586 (VPXORrr VR128:$src1, VR128:$src2)>;
2588 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2589 (VPANDNrr VR128:$src1, VR128:$src2)>;
2590 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2591 (VPANDNrr VR128:$src1, VR128:$src2)>;
2592 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2593 (VPANDNrr VR128:$src1, VR128:$src2)>;
2595 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2596 (VPANDrm VR128:$src1, addr:$src2)>;
2597 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2598 (VPANDrm VR128:$src1, addr:$src2)>;
2599 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2600 (VPANDrm VR128:$src1, addr:$src2)>;
2602 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2603 (VPORrm VR128:$src1, addr:$src2)>;
2604 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2605 (VPORrm VR128:$src1, addr:$src2)>;
2606 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2607 (VPORrm VR128:$src1, addr:$src2)>;
2609 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2610 (VPXORrm VR128:$src1, addr:$src2)>;
2611 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2612 (VPXORrm VR128:$src1, addr:$src2)>;
2613 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2614 (VPXORrm VR128:$src1, addr:$src2)>;
2616 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2617 (VPANDNrm VR128:$src1, addr:$src2)>;
2618 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2619 (VPANDNrm VR128:$src1, addr:$src2)>;
2620 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2621 (VPANDNrm VR128:$src1, addr:$src2)>;
2622 }
2624 let Predicates = [UseSSE2] in {
2625 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2626 (PANDrr VR128:$src1, VR128:$src2)>;
2627 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2628 (PANDrr VR128:$src1, VR128:$src2)>;
2629 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2630 (PANDrr VR128:$src1, VR128:$src2)>;
2632 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2633 (PORrr VR128:$src1, VR128:$src2)>;
2634 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2635 (PORrr VR128:$src1, VR128:$src2)>;
2636 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2637 (PORrr VR128:$src1, VR128:$src2)>;
2639 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2640 (PXORrr VR128:$src1, VR128:$src2)>;
2641 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2642 (PXORrr VR128:$src1, VR128:$src2)>;
2643 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2644 (PXORrr VR128:$src1, VR128:$src2)>;
2646 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2647 (PANDNrr VR128:$src1, VR128:$src2)>;
2648 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2649 (PANDNrr VR128:$src1, VR128:$src2)>;
2650 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2651 (PANDNrr VR128:$src1, VR128:$src2)>;
2653 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2654 (PANDrm VR128:$src1, addr:$src2)>;
2655 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2656 (PANDrm VR128:$src1, addr:$src2)>;
2657 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2658 (PANDrm VR128:$src1, addr:$src2)>;
2660 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2661 (PORrm VR128:$src1, addr:$src2)>;
2662 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2663 (PORrm VR128:$src1, addr:$src2)>;
2664 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2665 (PORrm VR128:$src1, addr:$src2)>;
2667 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2668 (PXORrm VR128:$src1, addr:$src2)>;
2669 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2670 (PXORrm VR128:$src1, addr:$src2)>;
2671 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2672 (PXORrm VR128:$src1, addr:$src2)>;
2674 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2675 (PANDNrm VR128:$src1, addr:$src2)>;
2676 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2677 (PANDNrm VR128:$src1, addr:$src2)>;
2678 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2679 (PANDNrm VR128:$src1, addr:$src2)>;
2680 }
2682 // Patterns for packed operations when we don't have integer type available.
2683 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2684 (ANDPSrr VR128:$src1, VR128:$src2)>;
2685 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2686 (ORPSrr VR128:$src1, VR128:$src2)>;
2687 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2688 (XORPSrr VR128:$src1, VR128:$src2)>;
2689 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2690 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2692 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2693 (ANDPSrm VR128:$src1, addr:$src2)>;
2694 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2695 (ORPSrm VR128:$src1, addr:$src2)>;
2696 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2697 (XORPSrm VR128:$src1, addr:$src2)>;
2698 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2699 (ANDNPSrm VR128:$src1, addr:$src2)>;
2701 //===----------------------------------------------------------------------===//
2702 // SSE 1 & 2 - Arithmetic Instructions
2703 //===----------------------------------------------------------------------===//
2705 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2708 /// In addition, we also have a special variant of the scalar form here to
2709 /// represent the associated intrinsic operation. This form is unlike the
2710 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2711 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2713 /// These three forms can each be reg+reg or reg+mem.
2716 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2717 /// classes below.
2718 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2719 SDNode OpNode, X86SchedWriteSizes sched> {
2720 let Predicates = [HasAVX, NoVLX] in {
2721 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2722 VR128, v4f32, f128mem, loadv4f32,
2723 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2724 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2725 VR128, v2f64, f128mem, loadv2f64,
2726 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2728 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2729 OpNode, VR256, v8f32, f256mem, loadv8f32,
2730 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2731 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2732 OpNode, VR256, v4f64, f256mem, loadv4f64,
2733 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2736 let Constraints = "$src1 = $dst" in {
2737 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2738 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2739 sched.PS.XMM>, PS;
2740 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2741 v2f64, f128mem, memopv2f64, SSEPackedDouble,
2742 sched.PD.XMM>, PD;
2746 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2747 X86SchedWriteSizes sched> {
2748 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2749 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2750 XS, VEX_4V, VEX_LIG, VEX_WIG;
2751 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2752 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2753 XD, VEX_4V, VEX_LIG, VEX_WIG;
2755 let Constraints = "$src1 = $dst" in {
2756 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2757 OpNode, FR32, f32mem, SSEPackedSingle,
2758 sched.PS.Scl>, XS;
2759 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2760 OpNode, FR64, f64mem, SSEPackedDouble,
2761 sched.PD.Scl>, XD;
2765 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2766 SDPatternOperator OpNode,
2767 X86SchedWriteSizes sched> {
2768 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2769 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2770 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2771 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2772 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2773 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2775 let Constraints = "$src1 = $dst" in {
2776 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2777 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2778 SSEPackedSingle, sched.PS.Scl>, XS;
2779 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2780 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2781 SSEPackedDouble, sched.PD.Scl>, XD;
2785 // Binary Arithmetic instructions
2786 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
2787 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
2788 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2789 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
2790 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
2791 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2792 let isCommutable = 0 in {
2793 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2794 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2795 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2796 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2797 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2798 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2799 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2800 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2801 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2802 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2803 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2804 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2807 let isCodeGenOnly = 1 in {
2808 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2809 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2810 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2811 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2814 // Patterns used to select SSE scalar fp arithmetic instructions from
2815 // either:
2817 // (1) a scalar fp operation followed by a blend
2819 // The effect is that the backend no longer emits unnecessary vector
2820 // insert instructions immediately after SSE scalar fp instructions
2821 // like addss or mulss.
2823 // For example, given the following code:
2824 // __m128 foo(__m128 A, __m128 B) {
2825 // A[0] += B[0];
2826 // return A;
2827 // }
2829 // Previously we generated:
2830 // addss %xmm0, %xmm1
2831 // movss %xmm1, %xmm0
2833 // We now generate:
2834 // addss %xmm1, %xmm0
2836 // (2) a vector packed single/double fp operation followed by a vector insert
2838 // The effect is that the backend converts the packed fp instruction
2839 // followed by a vector insert into a single SSE scalar fp instruction.
2841 // For example, given the following code:
2842 // __m128 foo(__m128 A, __m128 B) {
2843 // __m128 C = A + B;
2844 // return (__m128) {c[0], a[1], a[2], a[3]};
2845 // }
2847 // Previously we generated:
2848 // addps %xmm0, %xmm1
2849 // movss %xmm1, %xmm0
2851 // We now generate:
2852 // addss %xmm1, %xmm0
2854 // TODO: Some canonicalization in lowering would simplify the number of
2855 // patterns we have to try to match.
2856 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2857 ValueType VT, ValueType EltTy,
2858 RegisterClass RC, Predicate BasePredicate> {
2859 let Predicates = [BasePredicate] in {
2860 // extracted scalar math op with insert via movss/movsd
2861 def : Pat<(VT (Move (VT VR128:$dst),
2862 (VT (scalar_to_vector
2863 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2864 RC:$src))))),
2865 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2866 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2869 // Repeat for AVX versions of the instructions.
2870 let Predicates = [UseAVX] in {
2871 // extracted scalar math op with insert via movss/movsd
2872 def : Pat<(VT (Move (VT VR128:$dst),
2873 (VT (scalar_to_vector
2874 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2875 RC:$src))))),
2876 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2877 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2881 defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2882 defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2883 defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2884 defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2886 defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2887 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2888 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2889 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2892 /// In addition, we also have a special variant of the scalar form here to
2893 /// represent the associated intrinsic operation. This form is unlike the
2894 /// plain scalar form, in that it takes an entire vector (instead of a
2895 /// scalar) and leaves the top elements undefined.
2897 /// And, we have a special variant form for a full-vector intrinsic form.
2899 /// sse_fp_unop_s - SSE1 unops in scalar form
2900 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2901 /// the HW instructions are 2 operand / destructive.
2902 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2903 ValueType ScalarVT, X86MemOperand x86memop,
2904 Operand intmemop, SDNode OpNode, Domain d,
2905 X86FoldableSchedWrite sched, Predicate target> {
2906 let hasSideEffects = 0 in {
2907 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2908 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2909 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2910 Requires<[target]>;
2912 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2913 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2914 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2915 Sched<[sched.Folded]>,
2916 Requires<[target, OptForSize]>;
2918 let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
2919 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2920 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2921 Sched<[sched]>;
2922 let mayLoad = 1 in
2923 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2924 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2925 Sched<[sched.Folded, sched.ReadAfterFold]>;
2931 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2932 ComplexPattern int_cpat, Intrinsic Intr,
2933 Predicate target, string Suffix> {
2934 let Predicates = [target] in {
2935 // These are unary operations, but they are modeled as having 2 source operands
2936 // because the high elements of the destination are unchanged in SSE.
2937 def : Pat<(Intr VR128:$src),
2938 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2940 // We don't want to fold scalar loads into these instructions unless
2941 // optimizing for size. This is because the folded instruction will have a
2942 // partial register update, while the unfolded sequence will not, e.g.
2943 // movss mem, %xmm0
2944 // rcpss %xmm0, %xmm0
2945 // which has a clobber before the rcp, vs.
2946 // rcpss mem, %xmm0
2947 let Predicates = [target, OptForSize] in {
2948 def : Pat<(Intr int_cpat:$src2),
2949 (!cast<Instruction>(NAME#m_Int)
2950 (vt (IMPLICIT_DEF)), addr:$src2)>;
2954 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
2955 Intrinsic Intr, Predicate target> {
2956 let Predicates = [target] in {
2957 def : Pat<(Intr VR128:$src),
2958 (!cast<Instruction>(NAME#r_Int) VR128:$src,
2959 VR128:$src)>;
2961 let Predicates = [target, OptForSize] in {
2962 def : Pat<(Intr int_cpat:$src2),
2963 (!cast<Instruction>(NAME#m_Int)
2964 (vt (IMPLICIT_DEF)), addr:$src2)>;
2968 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2969 ValueType ScalarVT, X86MemOperand x86memop,
2970 Operand intmemop, SDNode OpNode, Domain d,
2971 X86FoldableSchedWrite sched, Predicate target> {
2972 let hasSideEffects = 0 in {
2973 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2974 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2975 [], d>, Sched<[sched]>;
2977 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2978 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2979 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2980 let isCodeGenOnly = 1, ExeDomain = d in {
2981 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2982 (ins VR128:$src1, VR128:$src2),
2983 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2984 []>, Sched<[sched]>;
2986 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2987 (ins VR128:$src1, intmemop:$src2),
2988 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2989 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2993 // We don't want to fold scalar loads into these instructions unless
2994 // optimizing for size. This is because the folded instruction will have a
2995 // partial register update, while the unfolded sequence will not, e.g.
2996 // vmovss mem, %xmm0
2997 // vrcpss %xmm0, %xmm0, %xmm0
2998 // which has a clobber before the rcp, vs.
2999 // vrcpss mem, %xmm0, %xmm0
3000 // TODO: In theory, we could fold the load, and avoid the stall caused by
3001 // the partial register store, either in BreakFalseDeps or with smarter RA.
3002 let Predicates = [target] in {
3003 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
3004 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
3006 let Predicates = [target, OptForSize] in {
3007 def : Pat<(ScalarVT (OpNode (load addr:$src))),
3008 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
3009 addr:$src)>;
3013 /// sse1_fp_unop_p - SSE1 unops in packed form.
3014 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
3015 X86SchedWriteWidths sched, list<Predicate> prds> {
3016 let Predicates = prds in {
3017 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3018 !strconcat("v", OpcodeStr,
3019 "ps\t{$src, $dst|$dst, $src}"),
3020 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
3021 VEX, Sched<[sched.XMM]>, VEX_WIG;
3022 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3023 !strconcat("v", OpcodeStr,
3024 "ps\t{$src, $dst|$dst, $src}"),
3025 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
3026 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
3027 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3028 !strconcat("v", OpcodeStr,
3029 "ps\t{$src, $dst|$dst, $src}"),
3030 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
3031 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3032 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3033 !strconcat("v", OpcodeStr,
3034 "ps\t{$src, $dst|$dst, $src}"),
3035 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
3036 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
3039 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3040 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3041 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
3042 Sched<[sched.XMM]>;
3043 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3044 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3045 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
3046 Sched<[sched.XMM.Folded]>;
3049 /// sse2_fp_unop_p - SSE2 unops in vector forms.
3050 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
3051 SDNode OpNode, X86SchedWriteWidths sched> {
3052 let Predicates = [HasAVX, NoVLX] in {
3053 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3054 !strconcat("v", OpcodeStr,
3055 "pd\t{$src, $dst|$dst, $src}"),
3056 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
3057 VEX, Sched<[sched.XMM]>, VEX_WIG;
3058 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3059 !strconcat("v", OpcodeStr,
3060 "pd\t{$src, $dst|$dst, $src}"),
3061 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
3062 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
3063 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3064 !strconcat("v", OpcodeStr,
3065 "pd\t{$src, $dst|$dst, $src}"),
3066 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
3067 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3068 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3069 !strconcat("v", OpcodeStr,
3070 "pd\t{$src, $dst|$dst, $src}"),
3071 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
3072 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
3075 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3076 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3077 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
3078 Sched<[sched.XMM]>;
3079 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3080 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3081 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
3082 Sched<[sched.XMM.Folded]>;
3085 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
3086 X86SchedWriteWidths sched, Predicate AVXTarget> {
3087 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
3088 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
3089 UseSSE1, "SS">, XS;
3090 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
3091 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
3092 AVXTarget>,
3093 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
3096 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3097 X86SchedWriteWidths sched, Predicate AVXTarget> {
3098 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
3099 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
3100 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
3101 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
3102 XS, VEX_4V, VEX_LIG, VEX_WIG;
3105 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3106 X86SchedWriteWidths sched, Predicate AVXTarget> {
3107 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
3108 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
3109 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
3110 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
3111 XD, VEX_4V, VEX_LIG, VEX_WIG;
3115 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
3116 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3117 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
3118 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
3120 // Reciprocal approximations. Note that these typically require refinement
3121 // in order to obtain suitable precision.
3122 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3123 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3124 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3125 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3126 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3127 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3129 // There is no f64 version of the reciprocal approximation instructions.
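// For reference, a minimal C sketch (assuming the standard <xmmintrin.h>
// intrinsics; the helper name is illustrative) of the Newton-Raphson step
// commonly used to refine the roughly 12-bit rcpps estimate:
//
//   #include <xmmintrin.h>
//   __m128 recip_refined(__m128 a) {
//     __m128 x = _mm_rcp_ps(a);                 // rcpps: x ~= 1/a
//     // one Newton-Raphson iteration: x' = x * (2 - a*x)
//     return _mm_mul_ps(x, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x)));
//   }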
3131 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3132 ValueType VT, Predicate BasePredicate> {
3133 let Predicates = [BasePredicate] in {
3134 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3135 (OpNode (extractelt VT:$src, 0))))),
3136 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3139 // Repeat for AVX versions of the instructions.
3140 let Predicates = [UseAVX] in {
3141 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3142 (OpNode (extractelt VT:$src, 0))))),
3143 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3147 multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3148 ValueType VT, bits<8> ImmV,
3149 Predicate BasePredicate> {
3150 let Predicates = [BasePredicate] in {
3151 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3152 (OpNode (extractelt VT:$src, 0))))),
3153 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
3156 // Repeat for AVX versions of the instructions.
3157 let Predicates = [UseAVX] in {
3158 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3159 (OpNode (extractelt VT:$src, 0))))),
3160 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
3164 defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3165 defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3167 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3168 SDNode Move, ValueType VT,
3169 Predicate BasePredicate> {
3170 let Predicates = [BasePredicate] in {
3171 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3172 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3175 // Repeat for AVX versions of the instructions.
3176 let Predicates = [HasAVX] in {
3177 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3178 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3182 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3183 v4f32, UseSSE1>;
3184 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3185 v4f32, UseSSE1>;
3188 //===----------------------------------------------------------------------===//
3189 // SSE 1 & 2 - Non-temporal stores
3190 //===----------------------------------------------------------------------===//
3192 let AddedComplexity = 400 in { // Prefer non-temporal versions
3193 let Predicates = [HasAVX, NoVLX] in {
3194 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3195 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3196 (ins f128mem:$dst, VR128:$src),
3197 "movntps\t{$src, $dst|$dst, $src}",
3198 [(alignednontemporalstore (v4f32 VR128:$src),
3199 addr:$dst)]>, VEX, VEX_WIG;
3200 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3201 (ins f128mem:$dst, VR128:$src),
3202 "movntpd\t{$src, $dst|$dst, $src}",
3203 [(alignednontemporalstore (v2f64 VR128:$src),
3204 addr:$dst)]>, VEX, VEX_WIG;
3207 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3208 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3209 (ins f256mem:$dst, VR256:$src),
3210 "movntps\t{$src, $dst|$dst, $src}",
3211 [(alignednontemporalstore (v8f32 VR256:$src),
3212 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3213 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3214 (ins f256mem:$dst, VR256:$src),
3215 "movntpd\t{$src, $dst|$dst, $src}",
3216 [(alignednontemporalstore (v4f64 VR256:$src),
3217 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3220 let ExeDomain = SSEPackedInt in {
3221 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3222 (ins i128mem:$dst, VR128:$src),
3223 "movntdq\t{$src, $dst|$dst, $src}",
3224 [(alignednontemporalstore (v2i64 VR128:$src),
3225 addr:$dst)]>, VEX, VEX_WIG,
3226 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3227 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3228 (ins i256mem:$dst, VR256:$src),
3229 "movntdq\t{$src, $dst|$dst, $src}",
3230 [(alignednontemporalstore (v4i64 VR256:$src),
3231 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3232 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3236 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3237 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3238 "movntps\t{$src, $dst|$dst, $src}",
3239 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3240 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3241 "movntpd\t{$src, $dst|$dst, $src}",
3242 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3245 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3246 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3247 "movntdq\t{$src, $dst|$dst, $src}",
3248 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3250 let SchedRW = [WriteStoreNT] in {
3251 // There is no AVX form for instructions below this point
3252 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3253 "movnti{l}\t{$src, $dst|$dst, $src}",
3254 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3255 PS, Requires<[HasSSE2]>;
3256 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3257 "movnti{q}\t{$src, $dst|$dst, $src}",
3258 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3259 PS, Requires<[HasSSE2]>;
3260 } // SchedRW = [WriteStoreNT]
3262 let Predicates = [HasAVX, NoVLX] in {
3263 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3264 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3265 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3266 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3267 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3268 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3270 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3271 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3272 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3273 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3274 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3275 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3278 let Predicates = [UseSSE2] in {
3279 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3280 (MOVNTDQmr addr:$dst, VR128:$src)>;
3281 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3282 (MOVNTDQmr addr:$dst, VR128:$src)>;
3283 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3284 (MOVNTDQmr addr:$dst, VR128:$src)>;
3287 } // AddedComplexity
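// For reference, a minimal C sketch (assuming the standard <emmintrin.h>
// intrinsics; the helper name is illustrative) of how these stores are
// typically reached. The destination must be 16-byte aligned, matching the
// alignednontemporalstore patterns above, and a streaming loop normally
// ends with an sfence:
//
//   #include <emmintrin.h>
//   void fill_nt(__m128i *dst /* 16-byte aligned */, __m128i v, int n) {
//     for (int i = 0; i < n; ++i)
//       _mm_stream_si128(&dst[i], v);   // movntdq
//     _mm_sfence();
//   }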
3289 //===----------------------------------------------------------------------===//
3290 // SSE 1 & 2 - Prefetch and memory fence
3291 //===----------------------------------------------------------------------===//
3293 // Prefetch intrinsic.
3294 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3295 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3296 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3297 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3298 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3299 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3300 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3301 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3302 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
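// As a usage sketch (assuming <xmmintrin.h>; the helper name is
// illustrative): _MM_HINT_T0 corresponds to the (i32 3) locality operand
// above (prefetcht0) and _MM_HINT_NTA to (i32 0) (prefetchnta):
//
//   #include <xmmintrin.h>
//   void warm(const char *p) {
//     _mm_prefetch(p, _MM_HINT_T0);        // prefetcht0
//     _mm_prefetch(p + 64, _MM_HINT_NTA);  // prefetchnta
//   }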
3305 // FIXME: How should the flush instruction be modeled?
3306 let SchedRW = [WriteLoad] in {
3308 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3309 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3310 PS, Requires<[HasSSE2]>;
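// A minimal usage sketch (assuming <emmintrin.h>): _mm_clflush evicts the
// cache line containing the given address via this instruction:
//
//   #include <emmintrin.h>
//   void flush_line(const void *p) { _mm_clflush(p); }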
3313 let SchedRW = [WriteNop] in {
3314 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3315 // was introduced with SSE2, it's backward compatible.
3316 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3317 "pause", [(int_x86_sse2_pause)]>, OBXS;
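// A minimal C sketch (assuming <emmintrin.h>; the helper name is
// illustrative) of the spin-wait idiom pause is intended for:
//
//   #include <emmintrin.h>
//   void spin_until_set(volatile int *flag) {
//     while (!*flag)
//       _mm_pause();   // rep;nop - hints the CPU that this is a spin loop
//   }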
3320 let SchedRW = [WriteFence] in {
3321 // Load, store, and memory fence
3322 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3323 // to include any 64-bit target.
3324 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3325 PS, Requires<[HasSSE1]>;
3326 def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3327 PS, Requires<[HasSSE2]>;
3328 def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3329 PS, Requires<[HasMFence]>;
3332 def : Pat<(X86MFence), (MFENCE)>;
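// For reference, a minimal C sketch (assuming <emmintrin.h>) mapping the
// fence intrinsics onto the definitions above:
//
//   #include <emmintrin.h>
//   void fences(void) {
//     _mm_sfence();   // sfence: orders earlier stores
//     _mm_lfence();   // lfence: orders earlier loads
//     _mm_mfence();   // mfence: full load/store ordering
//   }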
3334 //===----------------------------------------------------------------------===//
3335 // SSE 1 & 2 - Load/Store XCSR register
3336 //===----------------------------------------------------------------------===//
3338 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3339 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3340 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3341 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3342 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3343 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3345 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3346 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3347 TB, Sched<[WriteLDMXCSR]>;
3348 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3349 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3350 TB, Sched<[WriteSTMXCSR]>;
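// For reference, a minimal C sketch (assuming <xmmintrin.h>; the FTZ/DAZ
// masks below are the architectural MXCSR bit values) of the usual
// stmxcsr/ldmxcsr round trip:
//
//   #include <xmmintrin.h>
//   void enable_ftz_daz(void) {
//     unsigned csr = _mm_getcsr();          // stmxcsr
//     _mm_setcsr(csr | 0x8000u | 0x0040u);  // FTZ (bit 15) | DAZ (bit 6); ldmxcsr
//   }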
3352 //===---------------------------------------------------------------------===//
3353 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3354 //===---------------------------------------------------------------------===//
3356 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3358 let hasSideEffects = 0 in {
3359 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3360 "movdqa\t{$src, $dst|$dst, $src}", []>,
3361 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3362 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3363 "movdqu\t{$src, $dst|$dst, $src}", []>,
3364 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3365 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3366 "movdqa\t{$src, $dst|$dst, $src}", []>,
3367 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3368 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3369 "movdqu\t{$src, $dst|$dst, $src}", []>,
3370 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3374 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3375 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3376 "movdqa\t{$src, $dst|$dst, $src}", []>,
3377 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3378 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3379 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3380 "movdqa\t{$src, $dst|$dst, $src}", []>,
3381 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3382 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3383 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3384 "movdqu\t{$src, $dst|$dst, $src}", []>,
3385 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3386 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3387 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3388 "movdqu\t{$src, $dst|$dst, $src}", []>,
3389 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3390 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3393 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3394 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3395 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3396 "movdqa\t{$src, $dst|$dst, $src}",
3397 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3398 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3399 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3400 "movdqa\t{$src, $dst|$dst, $src}", []>,
3401 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3402 VEX, VEX_L, VEX_WIG;
3403 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3404 "vmovdqu\t{$src, $dst|$dst, $src}",
3405 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3406 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3407 XS, VEX, VEX_WIG;
3408 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3409 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3410 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3411 XS, VEX, VEX_L, VEX_WIG;
3414 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3415 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3416 (ins i128mem:$dst, VR128:$src),
3417 "movdqa\t{$src, $dst|$dst, $src}",
3418 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3419 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3420 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3421 (ins i256mem:$dst, VR256:$src),
3422 "movdqa\t{$src, $dst|$dst, $src}", []>,
3423 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3424 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3425 "vmovdqu\t{$src, $dst|$dst, $src}",
3426 [(store (v2i64 VR128:$src), addr:$dst)]>,
3427 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3428 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3429 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3430 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3433 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3434 let hasSideEffects = 0 in {
3435 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3436 "movdqa\t{$src, $dst|$dst, $src}", []>;
3438 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3439 "movdqu\t{$src, $dst|$dst, $src}", []>,
3440 XS, Requires<[UseSSE2]>;
3444 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3445 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3446 "movdqa\t{$src, $dst|$dst, $src}", []>,
3447 FoldGenData<"MOVDQArr">;
3449 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3450 "movdqu\t{$src, $dst|$dst, $src}", []>,
3451 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3455 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3456 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3457 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3458 "movdqa\t{$src, $dst|$dst, $src}",
3459 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3460 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3461 "movdqu\t{$src, $dst|$dst, $src}",
3462 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3463 XS, Requires<[UseSSE2]>;
3466 let mayStore = 1, hasSideEffects = 0,
3467 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3468 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3469 "movdqa\t{$src, $dst|$dst, $src}",
3470 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3471 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3472 "movdqu\t{$src, $dst|$dst, $src}",
3473 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3474 XS, Requires<[UseSSE2]>;
3477 } // ExeDomain = SSEPackedInt
3479 // Aliases to help the assembler pick two byte VEX encodings by swapping the
3480 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
3481 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3482 (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
3483 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3484 (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
3485 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3486 (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
3487 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3488 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
3490 // Reversed version with ".s" suffix for GAS compatibility.
3491 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3492 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3493 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3494 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3495 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3496 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3497 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3498 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3500 // Reversed version with ".s" suffix for GAS compatibility.
3501 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3502 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3503 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3504 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3506 let Predicates = [HasAVX, NoVLX] in {
3507 // Additional patterns for other integer sizes.
3508 def : Pat<(alignedloadv4i32 addr:$src),
3509 (VMOVDQArm addr:$src)>;
3510 def : Pat<(alignedloadv8i16 addr:$src),
3511 (VMOVDQArm addr:$src)>;
3512 def : Pat<(alignedloadv16i8 addr:$src),
3513 (VMOVDQArm addr:$src)>;
3514 def : Pat<(loadv4i32 addr:$src),
3515 (VMOVDQUrm addr:$src)>;
3516 def : Pat<(loadv8i16 addr:$src),
3517 (VMOVDQUrm addr:$src)>;
3518 def : Pat<(loadv16i8 addr:$src),
3519 (VMOVDQUrm addr:$src)>;
3521 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3522 (VMOVDQAmr addr:$dst, VR128:$src)>;
3523 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3524 (VMOVDQAmr addr:$dst, VR128:$src)>;
3525 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3526 (VMOVDQAmr addr:$dst, VR128:$src)>;
3527 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3528 (VMOVDQUmr addr:$dst, VR128:$src)>;
3529 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3530 (VMOVDQUmr addr:$dst, VR128:$src)>;
3531 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3532 (VMOVDQUmr addr:$dst, VR128:$src)>;
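// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) of the aligned/unaligned split modeled above:
// _mm_load_si128 requires a 16-byte aligned pointer (movdqa), while
// _mm_loadu_si128 does not (movdqu):
//
//   #include <emmintrin.h>
//   __m128i load_both(const __m128i *aligned16, const void *anywhere) {
//     __m128i a = _mm_load_si128(aligned16);                   // movdqa
//     __m128i u = _mm_loadu_si128((const __m128i *)anywhere);  // movdqu
//     return _mm_add_epi32(a, u);
//   }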
3535 //===---------------------------------------------------------------------===//
3536 // SSE2 - Packed Integer Arithmetic Instructions
3537 //===---------------------------------------------------------------------===//
3539 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3541 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
3542 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3543 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3544 PatFrag memop_frag, X86MemOperand x86memop,
3545 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3546 let isCommutable = 1 in
3547 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3548 (ins RC:$src1, RC:$src2),
3550 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3551 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3552 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3553 Sched<[sched]>;
3554 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3555 (ins RC:$src1, x86memop:$src2),
3557 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3558 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3559 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3560 (memop_frag addr:$src2))))]>,
3561 Sched<[sched.Folded, sched.ReadAfterFold]>;
3563 } // ExeDomain = SSEPackedInt
3565 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3566 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3567 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3568 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3569 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3570 SchedWriteVecALU, 1, NoVLX>;
3571 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3572 SchedWriteVecALU, 1, NoVLX>;
3573 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3574 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3575 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3576 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3577 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3578 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3579 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3580 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3581 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3582 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3583 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3584 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3585 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3586 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3587 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3588 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3589 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3590 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3591 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3592 SchedWriteVecALU, 0, NoVLX>;
3593 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3594 SchedWriteVecALU, 0, NoVLX>;
3595 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3596 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3597 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3598 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3599 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3600 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3601 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3602 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3603 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3604 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3605 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3606 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3607 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3608 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3609 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3610 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3611 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3612 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3613 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3614 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3615 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3616 SchedWriteVecIMul, 1, NoVLX>;
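// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) contrasting the wrapping and saturating byte adds
// defined above:
//
//   #include <emmintrin.h>
//   __m128i add_clamped(__m128i a, __m128i b) {
//     return _mm_adds_epu8(a, b);  // paddusb: 250 + 10 -> 255 (saturates)
//   }                              // _mm_add_epi8 (paddb) would wrap to 4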
3618 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3619 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3620 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3621 VEX_4V, VEX_WIG;
3623 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3624 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3625 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3626 0>, VEX_4V, VEX_L, VEX_WIG;
3627 let Constraints = "$src1 = $dst" in
3628 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3629 memop, i128mem, SchedWriteVecIMul.XMM>;
3631 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3632 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3633 load, i128mem, SchedWritePSADBW.XMM, 0>,
3634 VEX_4V, VEX_WIG;
3635 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3636 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3637 load, i256mem, SchedWritePSADBW.YMM, 0>,
3638 VEX_4V, VEX_L, VEX_WIG;
3639 let Constraints = "$src1 = $dst" in
3640 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3641 memop, i128mem, SchedWritePSADBW.XMM>;
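// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) of why psadbw needs PDI_binop_rm2: the result type
// differs from the source type (v16i8 inputs, two 64-bit partial sums out);
// pmaddwd is analogous (v8i16 in, v4i32 out) via _mm_madd_epi16:
//
//   #include <emmintrin.h>
//   __m128i sad16(__m128i a, __m128i b) {
//     return _mm_sad_epu8(a, b);   // psadbw: v16i8 x v16i8 -> v2i64
//   }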
3643 //===---------------------------------------------------------------------===//
3644 // SSE2 - Packed Integer Logical Instructions
3645 //===---------------------------------------------------------------------===//
3647 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3648 string OpcodeStr, SDNode OpNode,
3649 SDNode OpNode2, RegisterClass RC,
3650 X86FoldableSchedWrite sched,
3651 X86FoldableSchedWrite schedImm,
3652 ValueType DstVT, ValueType SrcVT,
3653 PatFrag ld_frag, bit Is2Addr = 1> {
3654 // src2 is always 128-bit
3655 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3656 (ins RC:$src1, VR128:$src2),
3658 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3659 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3660 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3661 Sched<[sched]>;
3662 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3663 (ins RC:$src1, i128mem:$src2),
3665 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3666 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3667 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3668 (SrcVT (ld_frag addr:$src2)))))]>,
3669 Sched<[sched.Folded, sched.ReadAfterFold]>;
3670 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3671 (ins RC:$src1, u8imm:$src2),
3673 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3674 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3675 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
3676 Sched<[schedImm]>;
3679 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3680 string OpcodeStr, SDNode OpNode,
3681 SDNode OpNode2, ValueType DstVT128,
3682 ValueType DstVT256, ValueType SrcVT,
3683 X86SchedWriteWidths sched,
3684 X86SchedWriteWidths schedImm, Predicate prd> {
3685 let Predicates = [HasAVX, prd] in
3686 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3687 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3688 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3689 let Predicates = [HasAVX2, prd] in
3690 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3691 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3692 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3693 VEX_WIG;
3694 let Constraints = "$src1 = $dst" in
3695 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3696 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3697 memop>;
3700 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3701 SDNode OpNode, RegisterClass RC, ValueType VT,
3702 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3703 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3705 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3706 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3707 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
3708 Sched<[sched]>;
3711 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3712 SDNode OpNode, X86SchedWriteWidths sched> {
3713 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3714 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3715 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3716 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3717 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3718 VR256, v32i8, sched.YMM, 0>,
3719 VEX_4V, VEX_L, VEX_WIG;
3720 let Constraints = "$src1 = $dst" in
3721 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3722 sched.XMM>;
3725 let ExeDomain = SSEPackedInt in {
3726 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3727 v8i16, v16i16, v8i16, SchedWriteVecShift,
3728 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3729 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3730 v4i32, v8i32, v4i32, SchedWriteVecShift,
3731 SchedWriteVecShiftImm, NoVLX>;
3732 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3733 v2i64, v4i64, v2i64, SchedWriteVecShift,
3734 SchedWriteVecShiftImm, NoVLX>;
3736 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3737 v8i16, v16i16, v8i16, SchedWriteVecShift,
3738 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3739 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3740 v4i32, v8i32, v4i32, SchedWriteVecShift,
3741 SchedWriteVecShiftImm, NoVLX>;
3742 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3743 v2i64, v4i64, v2i64, SchedWriteVecShift,
3744 SchedWriteVecShiftImm, NoVLX>;
3746 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3747 v8i16, v16i16, v8i16, SchedWriteVecShift,
3748 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3749 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3750 v4i32, v8i32, v4i32, SchedWriteVecShift,
3751 SchedWriteVecShiftImm, NoVLX>;
3753 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3754 SchedWriteShuffle>;
3755 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3756 SchedWriteShuffle>;
3757 } // ExeDomain = SSEPackedInt
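// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) of the two count forms modeled by PDI_binop_rmi
// above: a count in the low quadword of an xmm register (rr/rm) and an
// immediate count (ri):
//
//   #include <emmintrin.h>
//   __m128i shift_words(__m128i v, __m128i cnt) {
//     __m128i byreg = _mm_sll_epi16(v, cnt);  // psllw xmm, xmm
//     __m128i byimm = _mm_slli_epi16(v, 3);   // psllw xmm, imm8
//     return _mm_or_si128(byreg, byimm);
//   }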
3759 //===---------------------------------------------------------------------===//
3760 // SSE2 - Packed Integer Comparison Instructions
3761 //===---------------------------------------------------------------------===//
3763 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3764 SchedWriteVecALU, 1, TruePredicate>;
3765 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3766 SchedWriteVecALU, 1, TruePredicate>;
3767 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3768 SchedWriteVecALU, 1, TruePredicate>;
3769 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3770 SchedWriteVecALU, 0, TruePredicate>;
3771 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3772 SchedWriteVecALU, 0, TruePredicate>;
3773 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3774 SchedWriteVecALU, 0, TruePredicate>;
3776 //===---------------------------------------------------------------------===//
3777 // SSE2 - Packed Integer Shuffle Instructions
3778 //===---------------------------------------------------------------------===//
3780 let ExeDomain = SSEPackedInt in {
3781 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3782 SDNode OpNode, X86SchedWriteWidths sched,
3784 let Predicates = [HasAVX, prd] in {
3785 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3786 (ins VR128:$src1, u8imm:$src2),
3787 !strconcat("v", OpcodeStr,
3788 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3790 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3791 VEX, Sched<[sched.XMM]>, VEX_WIG;
3792 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3793 (ins i128mem:$src1, u8imm:$src2),
3794 !strconcat("v", OpcodeStr,
3795 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3797 (vt128 (OpNode (load addr:$src1),
3798 (i8 imm:$src2))))]>, VEX,
3799 Sched<[sched.XMM.Folded]>, VEX_WIG;
3802 let Predicates = [HasAVX2, prd] in {
3803 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3804 (ins VR256:$src1, u8imm:$src2),
3805 !strconcat("v", OpcodeStr,
3806 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3808 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
3809 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3810 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3811 (ins i256mem:$src1, u8imm:$src2),
3812 !strconcat("v", OpcodeStr,
3813 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3815 (vt256 (OpNode (load addr:$src1),
3816 (i8 imm:$src2))))]>, VEX, VEX_L,
3817 Sched<[sched.YMM.Folded]>, VEX_WIG;
3820 let Predicates = [UseSSE2] in {
3821 def ri : Ii8<0x70, MRMSrcReg,
3822 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3823 !strconcat(OpcodeStr,
3824 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3826 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3827 Sched<[sched.XMM]>;
3828 def mi : Ii8<0x70, MRMSrcMem,
3829 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3830 !strconcat(OpcodeStr,
3831 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3833 (vt128 (OpNode (memop addr:$src1),
3834 (i8 imm:$src2))))]>,
3835 Sched<[sched.XMM.Folded]>;
3838 } // ExeDomain = SSEPackedInt
3840 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3841 SchedWriteShuffle, NoVLX>, PD;
3842 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3843 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3844 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3845 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
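// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) of the imm8 encoding these shuffles use:
// _MM_SHUFFLE packs four 2-bit source indices, one per destination element:
//
//   #include <emmintrin.h>
//   __m128i reverse_dwords(__m128i v) {
//     return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));  // pshufd $0x1b
//   }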
3847 //===---------------------------------------------------------------------===//
3848 // Packed Integer Pack Instructions (SSE & AVX)
3849 //===---------------------------------------------------------------------===//
3851 let ExeDomain = SSEPackedInt in {
3852 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3853 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3854 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3855 PatFrag ld_frag, bit Is2Addr = 1> {
3856 def rr : PDI<opc, MRMSrcReg,
3857 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3859 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3860 !strconcat(OpcodeStr,
3861 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3863 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3864 Sched<[sched]>;
3865 def rm : PDI<opc, MRMSrcMem,
3866 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3868 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3869 !strconcat(OpcodeStr,
3870 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3872 (OutVT (OpNode (ArgVT RC:$src1),
3873 (ld_frag addr:$src2))))]>,
3874 Sched<[sched.Folded, sched.ReadAfterFold]>;
3877 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3878 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3879 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3880 PatFrag ld_frag, bit Is2Addr = 1> {
3881 def rr : SS48I<opc, MRMSrcReg,
3882 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3884 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3885 !strconcat(OpcodeStr,
3886 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3888 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3889 Sched<[sched]>;
3890 def rm : SS48I<opc, MRMSrcMem,
3891 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3893 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3894 !strconcat(OpcodeStr,
3895 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3897 (OutVT (OpNode (ArgVT RC:$src1),
3898 (ld_frag addr:$src2))))]>,
3899 Sched<[sched.Folded, sched.ReadAfterFold]>;
3902 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3903 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3904 i128mem, SchedWriteShuffle.XMM, load, 0>,
3906 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3907 i128mem, SchedWriteShuffle.XMM, load, 0>,
3910 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3911 i128mem, SchedWriteShuffle.XMM, load, 0>,
3913 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3914 i128mem, SchedWriteShuffle.XMM, load, 0>,
3918 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3919 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3920 i256mem, SchedWriteShuffle.YMM, load, 0>,
3921 VEX_4V, VEX_L, VEX_WIG;
3922 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3923 i256mem, SchedWriteShuffle.YMM, load, 0>,
3924 VEX_4V, VEX_L, VEX_WIG;
3926 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3927 i256mem, SchedWriteShuffle.YMM, load, 0>,
3928 VEX_4V, VEX_L, VEX_WIG;
3929 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3930 i256mem, SchedWriteShuffle.YMM, load, 0>,
3934 let Constraints = "$src1 = $dst" in {
3935 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3936 i128mem, SchedWriteShuffle.XMM, memop>;
3937 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3938 i128mem, SchedWriteShuffle.XMM, memop>;
3940 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3941 i128mem, SchedWriteShuffle.XMM, memop>;
3943 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3944 i128mem, SchedWriteShuffle.XMM, memop>;
3946 } // ExeDomain = SSEPackedInt
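// For reference, a minimal C sketch (assuming <emmintrin.h>; the helper
// name is illustrative) of the narrow-with-saturation behavior the pack
// instructions above provide:
//
//   #include <emmintrin.h>
//   __m128i words_to_bytes(__m128i lo8, __m128i hi8) {
//     // packsswb: 16 signed words -> 16 signed-saturated bytes
//     return _mm_packs_epi16(lo8, hi8);
//   }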
3948 //===---------------------------------------------------------------------===//
3949 // SSE2 - Packed Integer Unpack Instructions
3950 //===---------------------------------------------------------------------===//
3952 let ExeDomain = SSEPackedInt in {
3953 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3954 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3955 X86FoldableSchedWrite sched, PatFrag ld_frag,
3957 def rr : PDI<opc, MRMSrcReg,
3958 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3960 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3961 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3962 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3963 Sched<[sched]>;
3964 def rm : PDI<opc, MRMSrcMem,
3965 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3967 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3968 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3969 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3970 Sched<[sched.Folded, sched.ReadAfterFold]>;
3973 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3974 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3975 i128mem, SchedWriteShuffle.XMM, load, 0>,
3977 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3978 i128mem, SchedWriteShuffle.XMM, load, 0>,
3980 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3981 i128mem, SchedWriteShuffle.XMM, load, 0>,
3983 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3984 i128mem, SchedWriteShuffle.XMM, load, 0>,
3988 let Predicates = [HasAVX, NoVLX] in {
3989 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3990 i128mem, SchedWriteShuffle.XMM, load, 0>,
3992 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3993 i128mem, SchedWriteShuffle.XMM, load, 0>,
3995 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3996 i128mem, SchedWriteShuffle.XMM, load, 0>,
3998 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3999 i128mem, SchedWriteShuffle.XMM, load, 0>,
4003 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4004 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
4005 i256mem, SchedWriteShuffle.YMM, load, 0>,
4006 VEX_4V, VEX_L, VEX_WIG;
4007 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
4008 i256mem, SchedWriteShuffle.YMM, load, 0>,
4009 VEX_4V, VEX_L, VEX_WIG;
4010 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
4011 i256mem, SchedWriteShuffle.YMM, load, 0>,
4012 VEX_4V, VEX_L, VEX_WIG;
4013 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
4014 i256mem, SchedWriteShuffle.YMM, load, 0>,
4015 VEX_4V, VEX_L, VEX_WIG;
4018 let Predicates = [HasAVX2, NoVLX] in {
4019 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
4020 i256mem, SchedWriteShuffle.YMM, load, 0>,
4021 VEX_4V, VEX_L, VEX_WIG;
4022 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
4023 i256mem, SchedWriteShuffle.YMM, load, 0>,
4024 VEX_4V, VEX_L, VEX_WIG;
4025 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
4026 i256mem, SchedWriteShuffle.YMM, load, 0>,
4027 VEX_4V, VEX_L, VEX_WIG;
4028 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
4029 i256mem, SchedWriteShuffle.YMM, load, 0>,
4030 VEX_4V, VEX_L, VEX_WIG;
4033 let Constraints = "$src1 = $dst" in {
4034 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
4035 i128mem, SchedWriteShuffle.XMM, memop>;
4036 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
4037 i128mem, SchedWriteShuffle.XMM, memop>;
4038 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
4039 i128mem, SchedWriteShuffle.XMM, memop>;
4040 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
4041 i128mem, SchedWriteShuffle.XMM, memop>;
4043 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
4044 i128mem, SchedWriteShuffle.XMM, memop>;
4045 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
4046 i128mem, SchedWriteShuffle.XMM, memop>;
4047 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
4048 i128mem, SchedWriteShuffle.XMM, memop>;
4049 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
4050 i128mem, SchedWriteShuffle.XMM, memop>;
4051 }
4052 } // ExeDomain = SSEPackedInt
4054 //===---------------------------------------------------------------------===//
4055 // SSE2 - Packed Integer Extract and Insert
4056 //===---------------------------------------------------------------------===//
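4057 // PINSRW inserts a 16-bit value from a GPR or memory into the word slot selected by the immediate; PEXTRW extracts the selected word into a GPR.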
4058 let ExeDomain = SSEPackedInt in {
4059 multiclass sse2_pinsrw<bit Is2Addr = 1> {
4060 def rr : Ii8<0xC4, MRMSrcReg,
4061 (outs VR128:$dst), (ins VR128:$src1,
4062 GR32orGR64:$src2, u8imm:$src3),
4063 !if(Is2Addr,
4064 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4065 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4066 [(set VR128:$dst,
4067 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
4068 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
4069 def rm : Ii8<0xC4, MRMSrcMem,
4070 (outs VR128:$dst), (ins VR128:$src1,
4071 i16mem:$src2, u8imm:$src3),
4072 !if(Is2Addr,
4073 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4074 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4075 [(set VR128:$dst,
4076 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
4077 imm:$src3))]>,
4078 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
4079 }
4082 let Predicates = [HasAVX, NoBWI] in
4083 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
4084 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4085 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4086 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4087 imm:$src2))]>,
4088 PD, VEX, Sched<[WriteVecExtract]>;
4089 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
4090 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4091 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4092 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4093 imm:$src2))]>,
4094 Sched<[WriteVecExtract]>;
4097 let Predicates = [HasAVX, NoBWI] in
4098 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
4100 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4101 defm PINSRW : sse2_pinsrw, PD;
4103 } // ExeDomain = SSEPackedInt
4105 //===---------------------------------------------------------------------===//
4106 // SSE2 - Packed Mask Creation
4107 //===---------------------------------------------------------------------===//
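4108 // PMOVMSKB gathers the most-significant bit of each byte of the source vector into the low bits of a general-purpose register.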
4109 let ExeDomain = SSEPackedInt in {
4111 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4112 (ins VR128:$src),
4113 "pmovmskb\t{$src, $dst|$dst, $src}",
4114 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4115 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
4117 let Predicates = [HasAVX2] in {
4118 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4119 (ins VR256:$src),
4120 "pmovmskb\t{$src, $dst|$dst, $src}",
4121 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
4122 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
4123 }
4125 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4126 "pmovmskb\t{$src, $dst|$dst, $src}",
4127 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4128 Sched<[WriteVecMOVMSK]>;
4130 } // ExeDomain = SSEPackedInt
4132 //===---------------------------------------------------------------------===//
4133 // SSE2 - Conditional Store
4134 //===---------------------------------------------------------------------===//
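4135 // MASKMOVDQU stores the bytes of $src selected by the sign bit of each byte of $mask to the address held implicitly in EDI/RDI, hence the Uses lists below.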
4136 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4137 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4138 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4139 (ins VR128:$src, VR128:$mask),
4140 "maskmovdqu\t{$mask, $src|$src, $mask}",
4141 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4142 VEX, VEX_WIG;
4143 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4144 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4145 (ins VR128:$src, VR128:$mask),
4146 "maskmovdqu\t{$mask, $src|$src, $mask}",
4147 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4148 VEX, VEX_WIG;
4150 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4151 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4152 "maskmovdqu\t{$mask, $src|$src, $mask}",
4153 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4154 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4155 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4156 "maskmovdqu\t{$mask, $src|$src, $mask}",
4157 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4159 } // ExeDomain = SSEPackedInt
4161 //===---------------------------------------------------------------------===//
4162 // SSE2 - Move Doubleword/Quadword
4163 //===---------------------------------------------------------------------===//
4165 //===---------------------------------------------------------------------===//
4166 // Move Int Doubleword to Packed Double Int
4168 let ExeDomain = SSEPackedInt in {
4169 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4170 "movd\t{$src, $dst|$dst, $src}",
4171 [(set VR128:$dst,
4172 (v4i32 (scalar_to_vector GR32:$src)))]>,
4173 VEX, Sched<[WriteVecMoveFromGpr]>;
4174 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4175 "movd\t{$src, $dst|$dst, $src}",
4176 [(set VR128:$dst,
4177 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4178 VEX, Sched<[WriteVecLoad]>;
4179 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4180 "movq\t{$src, $dst|$dst, $src}",
4181 [(set VR128:$dst,
4182 (v2i64 (scalar_to_vector GR64:$src)))]>,
4183 VEX, Sched<[WriteVecMoveFromGpr]>;
4184 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4185 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4186 "movq\t{$src, $dst|$dst, $src}", []>,
4187 VEX, Sched<[WriteVecLoad]>;
4188 let isCodeGenOnly = 1 in
4189 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4190 "movq\t{$src, $dst|$dst, $src}",
4191 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4192 VEX, Sched<[WriteVecMoveFromGpr]>;
4194 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4195 "movd\t{$src, $dst|$dst, $src}",
4196 [(set VR128:$dst,
4197 (v4i32 (scalar_to_vector GR32:$src)))]>,
4198 Sched<[WriteVecMoveFromGpr]>;
4199 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4200 "movd\t{$src, $dst|$dst, $src}",
4201 [(set VR128:$dst,
4202 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4203 Sched<[WriteVecLoad]>;
4204 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4205 "movq\t{$src, $dst|$dst, $src}",
4206 [(set VR128:$dst,
4207 (v2i64 (scalar_to_vector GR64:$src)))]>,
4208 Sched<[WriteVecMoveFromGpr]>;
4209 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4210 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4211 "movq\t{$src, $dst|$dst, $src}", []>,
4212 Sched<[WriteVecLoad]>;
4213 let isCodeGenOnly = 1 in
4214 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4215 "movq\t{$src, $dst|$dst, $src}",
4216 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4217 Sched<[WriteVecMoveFromGpr]>;
4218 } // ExeDomain = SSEPackedInt
4220 //===---------------------------------------------------------------------===//
4221 // Move Int Doubleword to Single Scalar
4223 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4224 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4225 "movd\t{$src, $dst|$dst, $src}",
4226 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4227 VEX, Sched<[WriteVecMoveFromGpr]>;
4229 def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4230 "movd\t{$src, $dst|$dst, $src}",
4231 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
4232 VEX, Sched<[WriteVecLoad]>;
4233 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4234 "movd\t{$src, $dst|$dst, $src}",
4235 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4236 Sched<[WriteVecMoveFromGpr]>;
4238 def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4239 "movd\t{$src, $dst|$dst, $src}",
4240 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
4241 Sched<[WriteVecLoad]>;
4242 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4244 //===---------------------------------------------------------------------===//
4245 // Move Packed Doubleword Int to Packed Double Int
4247 let ExeDomain = SSEPackedInt in {
4248 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4249 "movd\t{$src, $dst|$dst, $src}",
4250 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4251 (iPTR 0)))]>, VEX,
4252 Sched<[WriteVecMoveToGpr]>;
4253 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4254 (ins i32mem:$dst, VR128:$src),
4255 "movd\t{$src, $dst|$dst, $src}",
4256 [(store (i32 (extractelt (v4i32 VR128:$src),
4257 (iPTR 0))), addr:$dst)]>,
4258 VEX, Sched<[WriteVecStore]>;
4259 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4260 "movd\t{$src, $dst|$dst, $src}",
4261 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4262 (iPTR 0)))]>,
4263 Sched<[WriteVecMoveToGpr]>;
4264 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4265 "movd\t{$src, $dst|$dst, $src}",
4266 [(store (i32 (extractelt (v4i32 VR128:$src),
4267 (iPTR 0))), addr:$dst)]>,
4268 Sched<[WriteVecStore]>;
4269 } // ExeDomain = SSEPackedInt
4271 //===---------------------------------------------------------------------===//
4272 // Move Packed Doubleword Int first element to Doubleword Int
4274 let ExeDomain = SSEPackedInt in {
4275 let SchedRW = [WriteVecMoveToGpr] in {
4276 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4277 "movq\t{$src, $dst|$dst, $src}",
4278 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4279 (iPTR 0)))]>,
4280 VEX;
4282 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4283 "movq\t{$src, $dst|$dst, $src}",
4284 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4285 (iPTR 0)))]>;
4286 } // SchedRW
4288 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4289 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4290 (ins i64mem:$dst, VR128:$src),
4291 "movq\t{$src, $dst|$dst, $src}", []>,
4292 VEX, Sched<[WriteVecStore]>;
4293 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4294 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4295 "movq\t{$src, $dst|$dst, $src}", []>,
4296 Sched<[WriteVecStore]>;
4297 } // ExeDomain = SSEPackedInt
4299 //===---------------------------------------------------------------------===//
4300 // Bitcast FR64 <-> GR64
4302 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4303 let Predicates = [UseAVX] in
4304 def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4305 "movq\t{$src, $dst|$dst, $src}",
4306 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4307 VEX, Sched<[WriteVecLoad]>;
4308 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4309 "movq\t{$src, $dst|$dst, $src}",
4310 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4311 VEX, Sched<[WriteVecMoveToGpr]>;
4312 def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4313 "movq\t{$src, $dst|$dst, $src}",
4314 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
4315 VEX, Sched<[WriteVecStore]>;
4317 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4318 "movq\t{$src, $dst|$dst, $src}",
4319 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4320 Sched<[WriteVecLoad]>;
4321 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4322 "movq\t{$src, $dst|$dst, $src}",
4323 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4324 Sched<[WriteVecMoveToGpr]>;
4325 def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4326 "movq\t{$src, $dst|$dst, $src}",
4327 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
4328 Sched<[WriteVecStore]>;
4329 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4331 //===---------------------------------------------------------------------===//
4332 // Move Scalar Single to Double Int
4334 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4335 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4336 "movd\t{$src, $dst|$dst, $src}",
4337 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4338 VEX, Sched<[WriteVecMoveToGpr]>;
4339 def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4340 "movd\t{$src, $dst|$dst, $src}",
4341 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
4342 VEX, Sched<[WriteVecStore]>;
4343 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4344 "movd\t{$src, $dst|$dst, $src}",
4345 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4346 Sched<[WriteVecMoveToGpr]>;
4347 def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4348 "movd\t{$src, $dst|$dst, $src}",
4349 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
4350 Sched<[WriteVecStore]>;
4351 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4353 let Predicates = [UseAVX] in {
4354 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4355 (VMOVDI2PDIrr GR32:$src)>;
4357 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4358 (VMOV64toPQIrr GR64:$src)>;
4360 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
4361 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
4362 (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
4363 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4364 // These instructions also write zeros in the high part of a 256-bit register.
4365 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4366 (VMOVDI2PDIrm addr:$src)>;
4367 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4368 (VMOVDI2PDIrm addr:$src)>;
4369 def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
4370 (VMOVDI2PDIrm addr:$src)>;
4371 def : Pat<(v4i32 (X86vzload addr:$src)),
4372 (VMOVDI2PDIrm addr:$src)>;
4373 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
4374 (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
4375 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4376 def : Pat<(v8i32 (X86vzload addr:$src)),
4377 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4378 // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
4379 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
4380 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
4381 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
4382 }
4384 let Predicates = [UseSSE2] in {
4385 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4386 (MOVDI2PDIrr GR32:$src)>;
4388 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4389 (MOV64toPQIrr GR64:$src)>;
4390 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4391 (MOVDI2PDIrm addr:$src)>;
4392 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4393 (MOVDI2PDIrm addr:$src)>;
4394 def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
4395 (MOVDI2PDIrm addr:$src)>;
4396 def : Pat<(v4i32 (X86vzload addr:$src)),
4397 (MOVDI2PDIrm addr:$src)>;
4398 }
4400 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4401 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
4403 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4404 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4405 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4406 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4407 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4408 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4409 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4410 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4411 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4413 //===---------------------------------------------------------------------===//
4414 // SSE2 - Move Quadword
4415 //===---------------------------------------------------------------------===//
4417 //===---------------------------------------------------------------------===//
4418 // Move Quadword Int to Packed Quadword Int
4421 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4422 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4423 "vmovq\t{$src, $dst|$dst, $src}",
4424 [(set VR128:$dst,
4425 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4426 VEX, Requires<[UseAVX]>, VEX_WIG;
4427 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4428 "movq\t{$src, $dst|$dst, $src}",
4429 [(set VR128:$dst,
4430 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4431 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4432 } // ExeDomain, SchedRW
4434 //===---------------------------------------------------------------------===//
4435 // Move Packed Quadword Int to Quadword Int
4437 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4438 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4439 "movq\t{$src, $dst|$dst, $src}",
4440 [(store (i64 (extractelt (v2i64 VR128:$src),
4441 (iPTR 0))), addr:$dst)]>,
4442 VEX, VEX_WIG;
4443 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4444 "movq\t{$src, $dst|$dst, $src}",
4445 [(store (i64 (extractelt (v2i64 VR128:$src),
4446 (iPTR 0))), addr:$dst)]>;
4447 } // ExeDomain, SchedRW
4449 // For disassembler only
4450 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4451 SchedRW = [SchedWriteVecLogic.XMM] in {
4452 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4453 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4454 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4455 "movq\t{$src, $dst|$dst, $src}", []>;
4458 // Aliases to help the assembler pick two byte VEX encodings by swapping the
4459 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
4460 def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
4461 (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
4463 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4464 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4465 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4466 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4468 let Predicates = [UseAVX] in {
4469 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
4470 (VMOVQI2PQIrm addr:$src)>;
4471 def : Pat<(v2i64 (X86vzload addr:$src)),
4472 (VMOVQI2PQIrm addr:$src)>;
4473 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
4474 (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
4475 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4476 def : Pat<(v4i64 (X86vzload addr:$src)),
4477 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4480 let Predicates = [UseSSE2] in {
4481 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
4482 (MOVQI2PQIrm addr:$src)>;
4483 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
4486 //===---------------------------------------------------------------------===//
4487 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4488 // IA32 document. movq xmm1, xmm2 does clear the high bits.
4490 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4491 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4492 "vmovq\t{$src, $dst|$dst, $src}",
4493 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4494 XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4495 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4496 "movq\t{$src, $dst|$dst, $src}",
4497 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4498 XS, Requires<[UseSSE2]>;
4499 } // ExeDomain, SchedRW
4501 let Predicates = [UseAVX] in {
4502 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4503 (VMOVZPQILo2PQIrr VR128:$src)>;
4504 }
4505 let Predicates = [UseSSE2] in {
4506 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4507 (MOVZPQILo2PQIrr VR128:$src)>;
4508 }
4510 //===---------------------------------------------------------------------===//
4511 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4512 //===---------------------------------------------------------------------===//
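4513 // MOVSHDUP duplicates the odd-index single-precision elements of the source; MOVSLDUP duplicates the even-index ones.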
4514 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4515 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4516 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4517 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4518 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4519 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4520 Sched<[sched]>;
4521 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4522 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4523 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4524 Sched<[sched.Folded]>;
4525 }
4527 let Predicates = [HasAVX, NoVLX] in {
4528 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4529 v4f32, VR128, loadv4f32, f128mem,
4530 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4531 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4532 v4f32, VR128, loadv4f32, f128mem,
4533 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4534 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4535 v8f32, VR256, loadv8f32, f256mem,
4536 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4537 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4538 v8f32, VR256, loadv8f32, f256mem,
4539 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4540 }
4541 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4542 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4543 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4544 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4546 let Predicates = [HasAVX, NoVLX] in {
4547 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4548 (VMOVSHDUPrr VR128:$src)>;
4549 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4550 (VMOVSHDUPrm addr:$src)>;
4551 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4552 (VMOVSLDUPrr VR128:$src)>;
4553 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4554 (VMOVSLDUPrm addr:$src)>;
4555 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4556 (VMOVSHDUPYrr VR256:$src)>;
4557 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4558 (VMOVSHDUPYrm addr:$src)>;
4559 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4560 (VMOVSLDUPYrr VR256:$src)>;
4561 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4562 (VMOVSLDUPYrm addr:$src)>;
4563 }
4565 let Predicates = [UseSSE3] in {
4566 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4567 (MOVSHDUPrr VR128:$src)>;
4568 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4569 (MOVSHDUPrm addr:$src)>;
4570 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4571 (MOVSLDUPrr VR128:$src)>;
4572 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4573 (MOVSLDUPrm addr:$src)>;
4574 }
4576 //===---------------------------------------------------------------------===//
4577 // SSE3 - Replicate Double FP - MOVDDUP
4578 //===---------------------------------------------------------------------===//
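4579 // MOVDDUP broadcasts the low double-precision element into both lanes; the memory form reads only 64 bits.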
4580 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4581 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4582 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4583 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4584 Sched<[sched.XMM]>;
4585 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4586 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4587 [(set VR128:$dst,
4588 (v2f64 (X86Movddup
4589 (scalar_to_vector (loadf64 addr:$src)))))]>,
4590 Sched<[sched.XMM.Folded]>;
4591 }
4593 // FIXME: Merge with above classes when there are patterns for the ymm version
4594 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4595 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4596 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4597 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4598 Sched<[sched.YMM]>;
4599 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4600 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4601 [(set VR256:$dst,
4602 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4603 Sched<[sched.YMM.Folded]>;
4604 }
4606 let Predicates = [HasAVX, NoVLX] in {
4607 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4608 VEX, VEX_WIG;
4609 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4610 VEX, VEX_L, VEX_WIG;
4611 }
4613 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4616 let Predicates = [HasAVX, NoVLX] in {
4617 def : Pat<(X86Movddup (loadv2f64 addr:$src)),
4618 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4619 def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
4620 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4623 let Predicates = [UseSSE3] in {
4624 // No need for aligned memory as this only loads 64-bits.
4625 def : Pat<(X86Movddup (loadv2f64 addr:$src)),
4626 (MOVDDUPrm addr:$src)>;
4627 def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
4628 (MOVDDUPrm addr:$src)>;
4631 //===---------------------------------------------------------------------===//
4632 // SSE3 - Move Unaligned Integer
4633 //===---------------------------------------------------------------------===//
4635 let Predicates = [HasAVX] in {
4636 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4637 "vlddqu\t{$src, $dst|$dst, $src}",
4638 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4639 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4640 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4641 "vlddqu\t{$src, $dst|$dst, $src}",
4642 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4643 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4644 }
4646 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4647 "lddqu\t{$src, $dst|$dst, $src}",
4648 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4649 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4651 //===---------------------------------------------------------------------===//
4652 // SSE3 - Arithmetic
4653 //===---------------------------------------------------------------------===//
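4654 // ADDSUBPS/ADDSUBPD subtract in the even-indexed lanes and add in the odd-indexed lanes of the two sources.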
4655 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4656 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4657 PatFrag ld_frag, bit Is2Addr = 1> {
4658 def rr : I<0xD0, MRMSrcReg,
4659 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4660 !if(Is2Addr,
4661 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4662 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4663 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4664 Sched<[sched]>;
4665 def rm : I<0xD0, MRMSrcMem,
4666 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4667 !if(Is2Addr,
4668 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4669 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4670 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4671 Sched<[sched.Folded, sched.ReadAfterFold]>;
4672 }
4674 let Predicates = [HasAVX] in {
4675 let ExeDomain = SSEPackedSingle in {
4676 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4677 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4678 XD, VEX_4V, VEX_WIG;
4679 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4680 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4681 XD, VEX_4V, VEX_L, VEX_WIG;
4682 }
4683 let ExeDomain = SSEPackedDouble in {
4684 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4685 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4686 PD, VEX_4V, VEX_WIG;
4687 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4688 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4689 PD, VEX_4V, VEX_L, VEX_WIG;
4690 }
4691 }
4692 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4693 let ExeDomain = SSEPackedSingle in
4694 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4695 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4696 let ExeDomain = SSEPackedDouble in
4697 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4698 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4699 }
4701 //===---------------------------------------------------------------------===//
4702 // SSE3 Instructions
4703 //===---------------------------------------------------------------------===//
4705 // Horizontal ops
4706 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4707 X86MemOperand x86memop, SDNode OpNode,
4708 X86FoldableSchedWrite sched, PatFrag ld_frag,
4709 bit Is2Addr = 1> {
4710 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4711 !if(Is2Addr,
4712 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4713 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4714 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4715 Sched<[sched]>;
4717 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4718 !if(Is2Addr,
4719 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4720 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4721 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4722 Sched<[sched.Folded, sched.ReadAfterFold]>;
4723 }
4724 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4725 X86MemOperand x86memop, SDNode OpNode,
4726 X86FoldableSchedWrite sched, PatFrag ld_frag,
4727 bit Is2Addr = 1> {
4728 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4729 !if(Is2Addr,
4730 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4731 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4732 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4733 Sched<[sched]>;
4735 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4736 !if(Is2Addr,
4737 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4738 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4739 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4740 Sched<[sched.Folded, sched.ReadAfterFold]>;
4741 }
4743 let Predicates = [HasAVX] in {
4744 let ExeDomain = SSEPackedSingle in {
4745 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4746 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4747 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4748 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4749 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4750 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4751 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4752 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4753 }
4754 let ExeDomain = SSEPackedDouble in {
4755 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4756 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4757 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4758 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4759 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4760 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4761 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4762 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4763 }
4764 }
4766 let Constraints = "$src1 = $dst" in {
4767 let ExeDomain = SSEPackedSingle in {
4768 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4769 WriteFHAdd, memopv4f32>;
4770 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4771 WriteFHAdd, memopv4f32>;
4772 }
4773 let ExeDomain = SSEPackedDouble in {
4774 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4775 WriteFHAdd, memopv2f64>;
4776 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4777 WriteFHAdd, memopv2f64>;
4778 }
4779 }
4781 //===---------------------------------------------------------------------===//
4782 // SSSE3 - Packed Absolute Instructions
4783 //===---------------------------------------------------------------------===//
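4784 // PABSB/PABSW/PABSD compute the per-element absolute value; they are matched from the target-independent abs node.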
4785 /// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4786 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4787 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4788 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4789 (ins VR128:$src),
4790 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4791 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4792 Sched<[sched.XMM]>;
4794 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4795 (ins i128mem:$src),
4796 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4797 [(set VR128:$dst,
4798 (vt (OpNode (ld_frag addr:$src))))]>,
4799 Sched<[sched.XMM.Folded]>;
4800 }
4802 /// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4803 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4804 SDNode OpNode, X86SchedWriteWidths sched> {
4805 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4806 (ins VR256:$src),
4807 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4808 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4809 Sched<[sched.YMM]>;
4811 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4812 (ins i256mem:$src),
4813 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4814 [(set VR256:$dst,
4815 (vt (OpNode (load addr:$src))))]>,
4816 Sched<[sched.YMM.Folded]>;
4817 }
4819 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4820 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4821 load>, VEX, VEX_WIG;
4822 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4823 load>, VEX, VEX_WIG;
4825 let Predicates = [HasAVX, NoVLX] in {
4826 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4827 load>, VEX, VEX_WIG;
4829 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4830 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4831 VEX, VEX_L, VEX_WIG;
4832 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4833 VEX, VEX_L, VEX_WIG;
4835 let Predicates = [HasAVX2, NoVLX] in {
4836 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4837 VEX, VEX_L, VEX_WIG;
4840 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4841 memop>;
4842 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4843 memop>;
4844 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4845 memop>;
4847 //===---------------------------------------------------------------------===//
4848 // SSSE3 - Packed Binary Operator Instructions
4849 //===---------------------------------------------------------------------===//
4851 /// SS3I_binop_rm - Simple SSSE3 bin op
4852 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4853 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4854 PatFrag memop_frag, X86MemOperand x86memop,
4855 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4856 let isCommutable = 1 in
4857 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4858 (ins RC:$src1, RC:$src2),
4859 !if(Is2Addr,
4860 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4861 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4862 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4863 Sched<[sched]>;
4864 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4865 (ins RC:$src1, x86memop:$src2),
4866 !if(Is2Addr,
4867 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4868 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4869 [(set RC:$dst,
4870 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4871 Sched<[sched.Folded, sched.ReadAfterFold]>;
4872 }
4874 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
4875 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4876 Intrinsic IntId128, X86FoldableSchedWrite sched,
4877 PatFrag ld_frag, bit Is2Addr = 1> {
4878 let isCommutable = 1 in
4879 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4880 (ins VR128:$src1, VR128:$src2),
4881 !if(Is2Addr,
4882 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4883 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4884 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4885 Sched<[sched]>;
4886 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4887 (ins VR128:$src1, i128mem:$src2),
4888 !if(Is2Addr,
4889 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4890 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4891 [(set VR128:$dst,
4892 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4893 Sched<[sched.Folded, sched.ReadAfterFold]>;
4894 }
4896 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4897 Intrinsic IntId256,
4898 X86FoldableSchedWrite sched> {
4899 let isCommutable = 1 in
4900 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4901 (ins VR256:$src1, VR256:$src2),
4902 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4903 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4904 Sched<[sched]>;
4905 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4906 (ins VR256:$src1, i256mem:$src2),
4907 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4908 [(set VR256:$dst,
4909 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4910 Sched<[sched.Folded, sched.ReadAfterFold]>;
4911 }
4913 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4914 let isCommutable = 0 in {
4915 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4916 VR128, load, i128mem,
4917 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4918 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4919 v16i8, VR128, load, i128mem,
4920 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4922 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4923 VR128, load, i128mem,
4924 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4927 let ImmT = NoImm, Predicates = [HasAVX] in {
4928 let isCommutable = 0 in {
4929 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4930 load, i128mem,
4931 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4932 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4933 load, i128mem,
4934 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4935 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4936 load, i128mem,
4937 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4938 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4939 load, i128mem,
4940 SchedWritePHAdd.XMM, 0>, VEX_4V;
4941 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4942 int_x86_ssse3_psign_b_128,
4943 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4944 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4945 int_x86_ssse3_psign_w_128,
4946 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4947 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4948 int_x86_ssse3_psign_d_128,
4949 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4950 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4951 int_x86_ssse3_phadd_sw_128,
4952 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4953 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4954 int_x86_ssse3_phsub_sw_128,
4955 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4959 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4960 let isCommutable = 0 in {
4961 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4962 VR256, load, i256mem,
4963 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4964 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4965 v32i8, VR256, load, i256mem,
4966 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4968 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4969 VR256, load, i256mem,
4970 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4973 let ImmT = NoImm, Predicates = [HasAVX2] in {
4974 let isCommutable = 0 in {
4975 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4976 VR256, load, i256mem,
4977 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4978 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4979 load, i256mem,
4980 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4981 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4982 VR256, load, i256mem,
4983 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4984 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4985 load, i256mem,
4986 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
4987 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4988 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4989 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4990 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4991 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4992 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4993 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4994 int_x86_avx2_phadd_sw,
4995 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4996 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4997 int_x86_avx2_phsub_sw,
4998 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
5002 // None of these have i8 immediate fields.
5003 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
5004 let isCommutable = 0 in {
5005 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
5006 memop, i128mem, SchedWritePHAdd.XMM>;
5007 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
5008 memop, i128mem, SchedWritePHAdd.XMM>;
5009 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
5010 memop, i128mem, SchedWritePHAdd.XMM>;
5011 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
5012 memop, i128mem, SchedWritePHAdd.XMM>;
5013 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
5014 SchedWriteVecALU.XMM, memop>;
5015 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
5016 SchedWriteVecALU.XMM, memop>;
5017 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
5018 SchedWriteVecALU.XMM, memop>;
5019 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
5020 memop, i128mem, SchedWriteVarShuffle.XMM>;
5021 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
5022 int_x86_ssse3_phadd_sw_128,
5023 SchedWritePHAdd.XMM, memop>;
5024 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
5025 int_x86_ssse3_phsub_sw_128,
5026 SchedWritePHAdd.XMM, memop>;
5027 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
5028 v16i8, VR128, memop, i128mem,
5029 SchedWriteVecIMul.XMM>;
5031 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
5032 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
5035 //===---------------------------------------------------------------------===//
5036 // SSSE3 - Packed Align Instruction Patterns
5037 //===---------------------------------------------------------------------===//
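5038 // PALIGNR extracts a byte-aligned window, selected by the immediate, from the concatenation of the two source vectors.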
5039 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
5040 PatFrag memop_frag, X86MemOperand x86memop,
5041 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
5042 let hasSideEffects = 0 in {
5043 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
5044 (ins RC:$src1, RC:$src2, u8imm:$src3),
5045 !if(Is2Addr,
5046 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5047 !strconcat(asm,
5048 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5049 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
5050 Sched<[sched]>;
5052 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
5053 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5054 !if(Is2Addr,
5055 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5056 !strconcat(asm,
5057 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5058 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
5059 (memop_frag addr:$src2),
5060 (i8 imm:$src3))))]>,
5061 Sched<[sched.Folded, sched.ReadAfterFold]>;
5062 }
5063 }
5065 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
5066 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
5067 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
5068 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
5069 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
5070 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5071 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
5072 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
5073 SchedWriteShuffle.XMM>;
5075 //===---------------------------------------------------------------------===//
5076 // SSE3 - Thread synchronization
5077 //===---------------------------------------------------------------------===//
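5078 // The MONITOR pseudo below is lowered by a custom inserter that places the address operand in EAX/RAX before emitting the real MONITOR instruction.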
5079 let SchedRW = [WriteSystem] in {
5080 let usesCustomInserter = 1 in {
5081 def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
5082 [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
5083 Requires<[HasSSE3]>;
5084 }
5086 let Uses = [EAX, ECX, EDX] in
5087 def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
5088 TB, Requires<[HasSSE3]>;
5090 let Uses = [ECX, EAX] in
5091 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
5092 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
5093 } // SchedRW
5095 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
5096 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
5098 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
5099 Requires<[Not64BitMode]>;
5100 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
5101 Requires<[In64BitMode]>;
5103 //===----------------------------------------------------------------------===//
5104 // SSE4.1 - Packed Move with Sign/Zero Extend
5105 //===----------------------------------------------------------------------===//
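5106 // PMOVSX*/PMOVZX* sign- or zero-extend the low packed elements of the source into wider destination elements; the memory forms load only the bytes they consume.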
5107 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5108 RegisterClass OutRC, RegisterClass InRC,
5109 X86FoldableSchedWrite sched> {
5110 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
5111 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5112 Sched<[sched]>;
5114 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
5115 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5116 Sched<[sched.Folded]>;
5117 }
5119 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
5120 X86MemOperand MemOp, X86MemOperand MemYOp,
5121 Predicate prd> {
5122 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5123 SchedWriteShuffle.XMM>;
5124 let Predicates = [HasAVX, prd] in
5125 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5126 VR128, VR128, SchedWriteShuffle.XMM>,
5127 VEX, VEX_WIG;
5128 let Predicates = [HasAVX2, prd] in
5129 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5130 VR256, VR128, WriteShuffle256>,
5131 VEX, VEX_L, VEX_WIG;
5134 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5135 X86MemOperand MemYOp, Predicate prd> {
5136 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5137 MemOp, MemYOp, prd>;
5138 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5139 !strconcat("pmovzx", OpcodeStr),
5140 MemOp, MemYOp, prd>;
5143 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5144 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5145 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5147 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5148 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5150 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5152 // Patterns that we also need for any_extend.
5153 // Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
5154 multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
5155 // Register-Register patterns
5156 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5157 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5158 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5161 let Predicates = [HasAVX2, NoVLX] in {
5162 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5163 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5165 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5166 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5169 // AVX2 Register-Memory patterns
5170 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5171 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5172 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5173 def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5174 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5175 def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5176 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5179 let Predicates = [HasAVX2, NoVLX] in {
5180 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5181 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5182 def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5183 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5184 def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5185 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5187 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5188 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5189 def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
5190 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5191 def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
5192 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5193 }
5194 }
5197 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5198 SDNode ExtOp, SDNode InVecOp> :
5199 SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
5201 // Register-Register patterns
5202 let Predicates = [HasAVX2, NoVLX] in {
5203 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5204 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5205 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5206 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5208 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5209 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5212 // Simple Register-Memory patterns
5213 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5214 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5215 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5217 let Predicates = [HasAVX2, NoVLX] in {
5218 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5219 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5220 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5221 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5223 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5224 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5225 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5226 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5228 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5229 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5232 // AVX2 Register-Memory patterns
5233 let Predicates = [HasAVX2, NoVLX] in {
5234 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5235 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5236 def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5237 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5238 def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
5239 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5240 def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
5241 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5243 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5244 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5245 def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5246 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5247 def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
5248 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5249 def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
5250 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5252 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5253 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5254 def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5255 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5256 def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
5257 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5258 def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
5259 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5260 }
5261 }
5263 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5264 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5265 defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;
5267 // SSE4.1/AVX patterns.
5268 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5269 SDNode ExtOp> {
5270 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5271 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5272 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5274 let Predicates = [HasAVX, NoVLX] in {
5275 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5276 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5277 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5278 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5280 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5281 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5282 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5283 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5285 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5286 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5288 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5289 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5290 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5292 let Predicates = [HasAVX, NoVLX] in {
5293 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5294 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5295 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5296 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5298 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5299 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5300 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5301 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5303 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5304 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5306 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5307 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5308 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5309 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5310 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5311 def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5312 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5313 def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5314 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5315 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5316 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5318 let Predicates = [HasAVX, NoVLX] in {
5319 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5320 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5321 def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5322 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5323 def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5324 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5325 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5326 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5328 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5329 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5330 def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5331 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5332 def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5333 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5334 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5335 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5337 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5338 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5339 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5340 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5341 def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5342 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5343 def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5344 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5345 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5346 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5348 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5349 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5350 def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
5351 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5352 def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5353 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5354 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5355 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5357 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5358 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5359 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5360 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5361 def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
5362 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5363 def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
5364 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5365 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5366 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5367 }
5368 }
5370 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5371 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5373 let Predicates = [UseSSE41] in {
5374 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5375 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5378 //===----------------------------------------------------------------------===//
5379 // SSE4.1 - Extract Instructions
5380 //===----------------------------------------------------------------------===//
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5383 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5384 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5385 (ins VR128:$src1, u8imm:$src2),
5386 !strconcat(OpcodeStr,
5387 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                    imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
5391 let hasSideEffects = 0, mayStore = 1 in
5392 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5393 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5394 !strconcat(OpcodeStr,
5395 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5396 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5397 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5400 let Predicates = [HasAVX, NoBWI] in
5401 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
5403 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5406 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5407 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5408 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5409 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5410 (ins VR128:$src1, u8imm:$src2),
5411 !strconcat(OpcodeStr,
5412 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5413 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5415 let hasSideEffects = 0, mayStore = 1 in
5416 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5417 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5418 !strconcat(OpcodeStr,
5419 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5420 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5421 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5424 let Predicates = [HasAVX, NoBWI] in
5425 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
5427 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5430 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5431 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5432 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5433 (ins VR128:$src1, u8imm:$src2),
5434 !strconcat(OpcodeStr,
5435 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR32:$dst,
                    (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5438 Sched<[WriteVecExtract]>;
5439 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5440 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5441 !strconcat(OpcodeStr,
5442 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5443 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5444 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5447 let Predicates = [HasAVX, NoDQI] in
5448 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5450 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
5453 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5454 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5455 (ins VR128:$src1, u8imm:$src2),
5456 !strconcat(OpcodeStr,
5457 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR64:$dst,
                    (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5460 Sched<[WriteVecExtract]>;
5461 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5462 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5463 !strconcat(OpcodeStr,
5464 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5465 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5466 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5469 let Predicates = [HasAVX, NoDQI] in
5470 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5472 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
5476 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5477 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5478 (ins VR128:$src1, u8imm:$src2),
5479 !strconcat(OpcodeStr,
5480 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5481 [(set GR32orGR64:$dst,
5482 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5483 Sched<[WriteVecExtract]>;
5484 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5485 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5486 !strconcat(OpcodeStr,
5487 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5488 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5489 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5492 let ExeDomain = SSEPackedSingle in {
5493 let Predicates = [UseAVX] in
5494 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5495 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5498 // Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
5510 //===----------------------------------------------------------------------===//
5511 // SSE4.1 - Insert Instructions
5512 //===----------------------------------------------------------------------===//
5514 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5515 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5516 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5518 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5520 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5522 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5523 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5524 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5525 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5527 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5529 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5531 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5532 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5535 let Predicates = [HasAVX, NoBWI] in
5536 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
5537 let Constraints = "$src1 = $dst" in
5538 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
5540 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5541 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5542 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5544 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5546 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5548 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5549 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5550 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5551 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5553 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5555 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5557 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5558 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5561 let Predicates = [HasAVX, NoDQI] in
5562 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5563 let Constraints = "$src1 = $dst" in
5564 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5566 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5567 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5568 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5570 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5572 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5574 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5575 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5576 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5577 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5579 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5581 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5583 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5584 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5587 let Predicates = [HasAVX, NoDQI] in
5588 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5589 let Constraints = "$src1 = $dst" in
5590 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
// insertps has a few different modes; the first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector. The
// next one matches the intrinsic and could zero arbitrary elements in the
// target vector.
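// For reference when reading the immediates below: the INSERTPS imm8 uses
// bits [7:6] to select the source element (register form only), bits [5:4]
// to select the destination element, and bits [3:0] as a zero mask for the
// destination. A hand-written example (not taken from this file):
//   insertps $0x10, %xmm1, %xmm0   ; xmm0[1] = xmm1[0], nothing zeroed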
5596 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5597 let isCommutable = 1 in
5598 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5599 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5601 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5603 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5605 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
5606 Sched<[SchedWriteFShuffle.XMM]>;
5607 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5608 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5610 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5612 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5614 (X86insertps VR128:$src1,
5615 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5617 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5620 let ExeDomain = SSEPackedSingle in {
5621 let Predicates = [UseAVX] in
5622 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5624 let Constraints = "$src1 = $dst" in
5625 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5628 let Predicates = [UseAVX] in {
5629 // If we're inserting an element from a vbroadcast of a load, fold the
5630 // load into the X86insertps instruction.
5631 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
5632 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
5633 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
5634 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
5635 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
5636 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
5639 //===----------------------------------------------------------------------===//
5640 // SSE4.1 - Round Instructions
5641 //===----------------------------------------------------------------------===//
5643 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5644 X86MemOperand x86memop, RegisterClass RC,
5645 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5646 X86FoldableSchedWrite sched> {
  // Vector intrinsic operation, reg
5649 def r : SS4AIi8<opc, MRMSrcReg,
5650 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5651 !strconcat(OpcodeStr,
5652 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
              Sched<[sched]>;
5656 // Vector intrinsic operation, mem
5657 def m : SS4AIi8<opc, MRMSrcMem,
5658 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5659 !strconcat(OpcodeStr,
5660 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst,
                    (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
              Sched<[sched.Folded]>;
}
5666 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5667 string OpcodeStr, X86FoldableSchedWrite sched> {
5668 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5669 def SSr : SS4AIi8<opcss, MRMSrcReg,
5670 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5671 !strconcat(OpcodeStr,
5672 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5673 []>, Sched<[sched]>;
  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
5677 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5678 !strconcat(OpcodeStr,
5679 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5680 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5681 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5683 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5684 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5685 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5686 !strconcat(OpcodeStr,
5687 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5688 []>, Sched<[sched]>;
  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5692 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5693 !strconcat(OpcodeStr,
5694 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5695 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5696 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5699 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5700 string OpcodeStr, X86FoldableSchedWrite sched> {
5701 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5702 def SSr : SS4AIi8<opcss, MRMSrcReg,
5703 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5704 !strconcat(OpcodeStr,
5705 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5706 []>, Sched<[sched]>;
  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
5710 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5711 !strconcat(OpcodeStr,
5712 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5713 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5714 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5716 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5717 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5718 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5719 !strconcat(OpcodeStr,
5720 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5721 []>, Sched<[sched]>;
  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5725 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5726 !strconcat(OpcodeStr,
5727 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5728 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5729 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
5732 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5733 string OpcodeStr, X86FoldableSchedWrite sched,
5734 ValueType VT32, ValueType VT64,
5735 SDNode OpNode, bit Is2Addr = 1> {
5736 let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
5737 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5738 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
5741 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5742 !strconcat(OpcodeStr,
5743 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;
5747 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5748 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
5751 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5752 !strconcat(OpcodeStr,
5753 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
5756 Sched<[sched.Folded, sched.ReadAfterFold]>;
5757 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5759 let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
5760 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5761 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
5764 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5765 !strconcat(OpcodeStr,
5766 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;
5770 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5771 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
5774 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5775 !strconcat(OpcodeStr,
5776 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
5779 Sched<[sched.Folded, sched.ReadAfterFold]>;
  } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
5783 // FP round - roundss, roundps, roundsd, roundpd
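// For reference: the 4-bit immediate used by the patterns below encodes the
// rounding control. Bit 3 suppresses the precision (inexact) exception,
// bit 2 selects MXCSR.RC instead of the immediate mode, and bits [1:0] give
// the immediate mode (00 nearest, 01 down, 10 up, 11 truncate). Hence
// 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = nearbyint (MXCSR, quiet) and
// 0x4 = rint (MXCSR, inexact reported).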
5784 let Predicates = [HasAVX, NoVLX] in {
5785 let ExeDomain = SSEPackedSingle in {
5787 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                   loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
5790 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5791 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
                                    VEX, VEX_L, VEX_WIG;
}
5795 let ExeDomain = SSEPackedDouble in {
5796 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                   loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
5799 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5800 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
                                    VEX, VEX_L, VEX_WIG;
}
}
5804 let Predicates = [HasAVX, NoAVX512] in {
5805 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5806 v4f32, v2f64, X86RndScales, 0>,
5807 VEX_4V, VEX_LIG, VEX_WIG;
5808 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                  VEX_4V, VEX_LIG, VEX_WIG;
}
5812 let Predicates = [UseAVX] in {
5813 def : Pat<(ffloor FR32:$src),
5814 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
5815 def : Pat<(f32 (fnearbyint FR32:$src)),
5816 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
5817 def : Pat<(f32 (fceil FR32:$src)),
5818 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
5819 def : Pat<(f32 (frint FR32:$src)),
5820 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
5821 def : Pat<(f32 (ftrunc FR32:$src)),
5822 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
5824 def : Pat<(f64 (ffloor FR64:$src)),
5825 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
5826 def : Pat<(f64 (fnearbyint FR64:$src)),
5827 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
5828 def : Pat<(f64 (fceil FR64:$src)),
5829 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
5830 def : Pat<(f64 (frint FR64:$src)),
5831 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
5832 def : Pat<(f64 (ftrunc FR64:$src)),
5833 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
5836 let Predicates = [UseAVX, OptForSize] in {
5837 def : Pat<(ffloor (loadf32 addr:$src)),
5838 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
5839 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
5840 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
5841 def : Pat<(f32 (fceil (loadf32 addr:$src))),
5842 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
5843 def : Pat<(f32 (frint (loadf32 addr:$src))),
5844 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
5845 def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
5846 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
5848 def : Pat<(f64 (ffloor (loadf64 addr:$src))),
5849 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
5850 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
5851 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
5852 def : Pat<(f64 (fceil (loadf64 addr:$src))),
5853 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
5854 def : Pat<(f64 (frint (loadf64 addr:$src))),
5855 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
5856 def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
5857 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
5860 let Predicates = [HasAVX, NoVLX] in {
5861 def : Pat<(v4f32 (ffloor VR128:$src)),
5862 (VROUNDPSr VR128:$src, (i32 0x9))>;
5863 def : Pat<(v4f32 (fnearbyint VR128:$src)),
5864 (VROUNDPSr VR128:$src, (i32 0xC))>;
5865 def : Pat<(v4f32 (fceil VR128:$src)),
5866 (VROUNDPSr VR128:$src, (i32 0xA))>;
5867 def : Pat<(v4f32 (frint VR128:$src)),
5868 (VROUNDPSr VR128:$src, (i32 0x4))>;
5869 def : Pat<(v4f32 (ftrunc VR128:$src)),
5870 (VROUNDPSr VR128:$src, (i32 0xB))>;
5872 def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
5873 (VROUNDPSm addr:$src, (i32 0x9))>;
5874 def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
5875 (VROUNDPSm addr:$src, (i32 0xC))>;
5876 def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
5877 (VROUNDPSm addr:$src, (i32 0xA))>;
5878 def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
5879 (VROUNDPSm addr:$src, (i32 0x4))>;
5880 def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
5881 (VROUNDPSm addr:$src, (i32 0xB))>;
5883 def : Pat<(v2f64 (ffloor VR128:$src)),
5884 (VROUNDPDr VR128:$src, (i32 0x9))>;
5885 def : Pat<(v2f64 (fnearbyint VR128:$src)),
5886 (VROUNDPDr VR128:$src, (i32 0xC))>;
5887 def : Pat<(v2f64 (fceil VR128:$src)),
5888 (VROUNDPDr VR128:$src, (i32 0xA))>;
5889 def : Pat<(v2f64 (frint VR128:$src)),
5890 (VROUNDPDr VR128:$src, (i32 0x4))>;
5891 def : Pat<(v2f64 (ftrunc VR128:$src)),
5892 (VROUNDPDr VR128:$src, (i32 0xB))>;
5894 def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
5895 (VROUNDPDm addr:$src, (i32 0x9))>;
5896 def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
5897 (VROUNDPDm addr:$src, (i32 0xC))>;
5898 def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
5899 (VROUNDPDm addr:$src, (i32 0xA))>;
5900 def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
5901 (VROUNDPDm addr:$src, (i32 0x4))>;
5902 def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
5903 (VROUNDPDm addr:$src, (i32 0xB))>;
5905 def : Pat<(v8f32 (ffloor VR256:$src)),
5906 (VROUNDPSYr VR256:$src, (i32 0x9))>;
5907 def : Pat<(v8f32 (fnearbyint VR256:$src)),
5908 (VROUNDPSYr VR256:$src, (i32 0xC))>;
5909 def : Pat<(v8f32 (fceil VR256:$src)),
5910 (VROUNDPSYr VR256:$src, (i32 0xA))>;
5911 def : Pat<(v8f32 (frint VR256:$src)),
5912 (VROUNDPSYr VR256:$src, (i32 0x4))>;
5913 def : Pat<(v8f32 (ftrunc VR256:$src)),
5914 (VROUNDPSYr VR256:$src, (i32 0xB))>;
5916 def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
5917 (VROUNDPSYm addr:$src, (i32 0x9))>;
5918 def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
5919 (VROUNDPSYm addr:$src, (i32 0xC))>;
5920 def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
5921 (VROUNDPSYm addr:$src, (i32 0xA))>;
5922 def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
5923 (VROUNDPSYm addr:$src, (i32 0x4))>;
5924 def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
5925 (VROUNDPSYm addr:$src, (i32 0xB))>;
5927 def : Pat<(v4f64 (ffloor VR256:$src)),
5928 (VROUNDPDYr VR256:$src, (i32 0x9))>;
5929 def : Pat<(v4f64 (fnearbyint VR256:$src)),
5930 (VROUNDPDYr VR256:$src, (i32 0xC))>;
5931 def : Pat<(v4f64 (fceil VR256:$src)),
5932 (VROUNDPDYr VR256:$src, (i32 0xA))>;
5933 def : Pat<(v4f64 (frint VR256:$src)),
5934 (VROUNDPDYr VR256:$src, (i32 0x4))>;
5935 def : Pat<(v4f64 (ftrunc VR256:$src)),
5936 (VROUNDPDYr VR256:$src, (i32 0xB))>;
5938 def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
5939 (VROUNDPDYm addr:$src, (i32 0x9))>;
5940 def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
5941 (VROUNDPDYm addr:$src, (i32 0xC))>;
5942 def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
5943 (VROUNDPDYm addr:$src, (i32 0xA))>;
5944 def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
5945 (VROUNDPDYm addr:$src, (i32 0x4))>;
5946 def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
5947 (VROUNDPDYm addr:$src, (i32 0xB))>;
5950 let ExeDomain = SSEPackedSingle in
5951 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5952 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
5953 let ExeDomain = SSEPackedDouble in
5954 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5955 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
5957 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5959 let Constraints = "$src1 = $dst" in
5960 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5961 v4f32, v2f64, X86RndScales>;
5963 let Predicates = [UseSSE41] in {
5964 def : Pat<(ffloor FR32:$src),
5965 (ROUNDSSr FR32:$src, (i32 0x9))>;
5966 def : Pat<(f32 (fnearbyint FR32:$src)),
5967 (ROUNDSSr FR32:$src, (i32 0xC))>;
5968 def : Pat<(f32 (fceil FR32:$src)),
5969 (ROUNDSSr FR32:$src, (i32 0xA))>;
5970 def : Pat<(f32 (frint FR32:$src)),
5971 (ROUNDSSr FR32:$src, (i32 0x4))>;
5972 def : Pat<(f32 (ftrunc FR32:$src)),
5973 (ROUNDSSr FR32:$src, (i32 0xB))>;
5975 def : Pat<(f64 (ffloor FR64:$src)),
5976 (ROUNDSDr FR64:$src, (i32 0x9))>;
5977 def : Pat<(f64 (fnearbyint FR64:$src)),
5978 (ROUNDSDr FR64:$src, (i32 0xC))>;
5979 def : Pat<(f64 (fceil FR64:$src)),
5980 (ROUNDSDr FR64:$src, (i32 0xA))>;
5981 def : Pat<(f64 (frint FR64:$src)),
5982 (ROUNDSDr FR64:$src, (i32 0x4))>;
5983 def : Pat<(f64 (ftrunc FR64:$src)),
5984 (ROUNDSDr FR64:$src, (i32 0xB))>;
5987 let Predicates = [UseSSE41, OptForSize] in {
5988 def : Pat<(ffloor (loadf32 addr:$src)),
5989 (ROUNDSSm addr:$src, (i32 0x9))>;
5990 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
5991 (ROUNDSSm addr:$src, (i32 0xC))>;
5992 def : Pat<(f32 (fceil (loadf32 addr:$src))),
5993 (ROUNDSSm addr:$src, (i32 0xA))>;
5994 def : Pat<(f32 (frint (loadf32 addr:$src))),
5995 (ROUNDSSm addr:$src, (i32 0x4))>;
5996 def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
5997 (ROUNDSSm addr:$src, (i32 0xB))>;
5999 def : Pat<(f64 (ffloor (loadf64 addr:$src))),
6000 (ROUNDSDm addr:$src, (i32 0x9))>;
6001 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
6002 (ROUNDSDm addr:$src, (i32 0xC))>;
6003 def : Pat<(f64 (fceil (loadf64 addr:$src))),
6004 (ROUNDSDm addr:$src, (i32 0xA))>;
6005 def : Pat<(f64 (frint (loadf64 addr:$src))),
6006 (ROUNDSDm addr:$src, (i32 0x4))>;
6007 def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
6008 (ROUNDSDm addr:$src, (i32 0xB))>;
6011 let Predicates = [UseSSE41] in {
6012 def : Pat<(v4f32 (ffloor VR128:$src)),
6013 (ROUNDPSr VR128:$src, (i32 0x9))>;
6014 def : Pat<(v4f32 (fnearbyint VR128:$src)),
6015 (ROUNDPSr VR128:$src, (i32 0xC))>;
6016 def : Pat<(v4f32 (fceil VR128:$src)),
6017 (ROUNDPSr VR128:$src, (i32 0xA))>;
6018 def : Pat<(v4f32 (frint VR128:$src)),
6019 (ROUNDPSr VR128:$src, (i32 0x4))>;
6020 def : Pat<(v4f32 (ftrunc VR128:$src)),
6021 (ROUNDPSr VR128:$src, (i32 0xB))>;
6023 def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
6024 (ROUNDPSm addr:$src, (i32 0x9))>;
6025 def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
6026 (ROUNDPSm addr:$src, (i32 0xC))>;
6027 def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
6028 (ROUNDPSm addr:$src, (i32 0xA))>;
6029 def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
6030 (ROUNDPSm addr:$src, (i32 0x4))>;
6031 def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
6032 (ROUNDPSm addr:$src, (i32 0xB))>;
6034 def : Pat<(v2f64 (ffloor VR128:$src)),
6035 (ROUNDPDr VR128:$src, (i32 0x9))>;
6036 def : Pat<(v2f64 (fnearbyint VR128:$src)),
6037 (ROUNDPDr VR128:$src, (i32 0xC))>;
6038 def : Pat<(v2f64 (fceil VR128:$src)),
6039 (ROUNDPDr VR128:$src, (i32 0xA))>;
6040 def : Pat<(v2f64 (frint VR128:$src)),
6041 (ROUNDPDr VR128:$src, (i32 0x4))>;
6042 def : Pat<(v2f64 (ftrunc VR128:$src)),
6043 (ROUNDPDr VR128:$src, (i32 0xB))>;
6045 def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
6046 (ROUNDPDm addr:$src, (i32 0x9))>;
6047 def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
6048 (ROUNDPDm addr:$src, (i32 0xC))>;
6049 def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
6050 (ROUNDPDm addr:$src, (i32 0xA))>;
6051 def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
6052 (ROUNDPDm addr:$src, (i32 0x4))>;
6053 def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
6054 (ROUNDPDm addr:$src, (i32 0xB))>;
6057 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
6058 v4f32, 0x01, UseSSE41>;
6059 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
6060 v4f32, 0x02, UseSSE41>;
6061 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
6062 v2f64, 0x01, UseSSE41>;
6063 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
6064 v2f64, 0x02, UseSSE41>;
6066 //===----------------------------------------------------------------------===//
6067 // SSE4.1 - Packed Bit Test
6068 //===----------------------------------------------------------------------===//
// ptest instruction - we'll lower to this in X86ISelLowering, primarily from
// the Intel intrinsic that corresponds to it.
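// For reference: ptest performs a bitwise AND without storing a result;
// ZF is set when (src1 AND src2) is all zeros and CF is set when
// (src2 AND NOT src1) is all zeros, which is what the X86ptest node models.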
6072 let Defs = [EFLAGS], Predicates = [HasAVX] in {
6073 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
6074 "vptest\t{$src2, $src1|$src1, $src2}",
6075 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
6076 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
6077 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
6078 "vptest\t{$src2, $src1|$src1, $src2}",
6079 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
6080 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
6083 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
6084 "vptest\t{$src2, $src1|$src1, $src2}",
6085 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
6086 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
6087 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
6088 "vptest\t{$src2, $src1|$src1, $src2}",
6089 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
6090 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
6091 VEX, VEX_L, VEX_WIG;
6094 let Defs = [EFLAGS] in {
6095 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
6096 "ptest\t{$src2, $src1|$src1, $src2}",
6097 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
6098 Sched<[SchedWriteVecTest.XMM]>;
6099 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
6100 "ptest\t{$src2, $src1|$src1, $src2}",
6101 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
6102 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
6105 // The bit test instructions below are AVX only
6106 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
6107 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
6108 X86FoldableSchedWrite sched> {
6109 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
6110 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
6111 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
6112 Sched<[sched]>, VEX;
6113 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
6114 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
6115 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
6116 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
6119 let Defs = [EFLAGS], Predicates = [HasAVX] in {
6120 let ExeDomain = SSEPackedSingle in {
6121 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
6122 SchedWriteFTest.XMM>;
6123 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
6124 SchedWriteFTest.YMM>, VEX_L;
6126 let ExeDomain = SSEPackedDouble in {
6127 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
6128 SchedWriteFTest.XMM>;
6129 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
6130 SchedWriteFTest.YMM>, VEX_L;
6134 //===----------------------------------------------------------------------===//
6135 // SSE4.1 - Misc Instructions
6136 //===----------------------------------------------------------------------===//
6138 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
6139 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
6140 "popcnt{w}\t{$src, $dst|$dst, $src}",
6141 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
6142 Sched<[WritePOPCNT]>, OpSize16, XS;
6143 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
6144 "popcnt{w}\t{$src, $dst|$dst, $src}",
6145 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
6146 (implicit EFLAGS)]>,
6147 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
6149 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
6150 "popcnt{l}\t{$src, $dst|$dst, $src}",
6151 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
6152 Sched<[WritePOPCNT]>, OpSize32, XS;
6154 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
6155 "popcnt{l}\t{$src, $dst|$dst, $src}",
6156 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
6157 (implicit EFLAGS)]>,
6158 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
6160 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
6161 "popcnt{q}\t{$src, $dst|$dst, $src}",
6162 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
6163 Sched<[WritePOPCNT]>, XS;
6164 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
6165 "popcnt{q}\t{$src, $dst|$dst, $src}",
6166 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
6167 (implicit EFLAGS)]>,
6168 Sched<[WritePOPCNT.Folded]>, XS;
6171 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
6172 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
6173 SDNode OpNode, PatFrag ld_frag,
6174 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}
6188 // PHMIN has the same profile as PSAD, thus we use the same scheduling
6189 // model, although the naming is misleading.
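// For reference: phminposuw writes the minimum of the eight unsigned words
// to word 0 of the destination, the index of that minimum to word 1, and
// zeroes the remaining elements.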
6190 let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;
6198 /// SS48I_binop_rm - Simple SSE41 binary operator.
6199 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6200 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6201 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6203 let isCommutable = 1 in
6204 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
6205 (ins RC:$src1, RC:$src2),
6207 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6208 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6209 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6211 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
6212 (ins RC:$src1, x86memop:$src2),
6214 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6215 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6217 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6218 Sched<[sched.Folded, sched.ReadAfterFold]>;
6221 let Predicates = [HasAVX, NoVLX] in {
6222 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
6223 load, i128mem, SchedWriteVecALU.XMM, 0>,
6225 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
6226 load, i128mem, SchedWriteVecALU.XMM, 0>,
6228 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
6229 load, i128mem, SchedWriteVecALU.XMM, 0>,
6231 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
6232 load, i128mem, SchedWriteVecALU.XMM, 0>,
6234 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
6235 load, i128mem, SchedWriteVecIMul.XMM, 0>,
6238 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
6239 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
6240 load, i128mem, SchedWriteVecALU.XMM, 0>,
6242 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
6243 load, i128mem, SchedWriteVecALU.XMM, 0>,
6245 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
6246 load, i128mem, SchedWriteVecALU.XMM, 0>,
6248 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
6249 load, i128mem, SchedWriteVecALU.XMM, 0>,
6253 let Predicates = [HasAVX2, NoVLX] in {
6254 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
6255 load, i256mem, SchedWriteVecALU.YMM, 0>,
6256 VEX_4V, VEX_L, VEX_WIG;
6257 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
6258 load, i256mem, SchedWriteVecALU.YMM, 0>,
6259 VEX_4V, VEX_L, VEX_WIG;
6260 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
6261 load, i256mem, SchedWriteVecALU.YMM, 0>,
6262 VEX_4V, VEX_L, VEX_WIG;
6263 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
6264 load, i256mem, SchedWriteVecALU.YMM, 0>,
6265 VEX_4V, VEX_L, VEX_WIG;
6266 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
6267 load, i256mem, SchedWriteVecIMul.YMM, 0>,
6268 VEX_4V, VEX_L, VEX_WIG;
6270 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
6271 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
6272 load, i256mem, SchedWriteVecALU.YMM, 0>,
6273 VEX_4V, VEX_L, VEX_WIG;
6274 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
6275 load, i256mem, SchedWriteVecALU.YMM, 0>,
6276 VEX_4V, VEX_L, VEX_WIG;
6277 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
6278 load, i256mem, SchedWriteVecALU.YMM, 0>,
6279 VEX_4V, VEX_L, VEX_WIG;
6280 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
6281 load, i256mem, SchedWriteVecALU.YMM, 0>,
6282 VEX_4V, VEX_L, VEX_WIG;
6285 let Constraints = "$src1 = $dst" in {
6286 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
6287 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6288 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
6289 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6290 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
6291 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6292 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
6293 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6294 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
6295 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6296 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
6297 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6298 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
6299 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6300 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
6301 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6302 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
6303 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
6306 let Predicates = [HasAVX, NoVLX] in
6307 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
6308 load, i128mem, SchedWritePMULLD.XMM, 0>,
6310 let Predicates = [HasAVX] in
6311 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
6312 load, i128mem, SchedWriteVecALU.XMM, 0>,
6315 let Predicates = [HasAVX2, NoVLX] in
6316 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
6317 load, i256mem, SchedWritePMULLD.YMM, 0>,
6318 VEX_4V, VEX_L, VEX_WIG;
6319 let Predicates = [HasAVX2] in
6320 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
6321 load, i256mem, SchedWriteVecALU.YMM, 0>,
6322 VEX_4V, VEX_L, VEX_WIG;
6324 let Constraints = "$src1 = $dst" in {
6325 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
6326 memop, i128mem, SchedWritePMULLD.XMM, 1>;
6327 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
6328 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6331 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
6332 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
6333 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
6334 X86MemOperand x86memop, bit Is2Addr,
6335 X86FoldableSchedWrite sched> {
6336 let isCommutable = 1 in
6337 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6338 (ins RC:$src1, RC:$src2, u8imm:$src3),
6340 !strconcat(OpcodeStr,
6341 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6342 !strconcat(OpcodeStr,
6343 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6344 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
6346 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6347 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6349 !strconcat(OpcodeStr,
6350 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6351 !strconcat(OpcodeStr,
6352 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6354 (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
6355 Sched<[sched.Folded, sched.ReadAfterFold]>;
6358 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
6359 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6360 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6361 X86MemOperand x86memop, bit Is2Addr,
6362 X86FoldableSchedWrite sched> {
6363 let isCommutable = 1 in
6364 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6365 (ins RC:$src1, RC:$src2, u8imm:$src3),
6367 !strconcat(OpcodeStr,
6368 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6369 !strconcat(OpcodeStr,
6370 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6371 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
6373 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6374 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6376 !strconcat(OpcodeStr,
6377 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6378 !strconcat(OpcodeStr,
6379 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6381 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
6382 Sched<[sched.Folded, sched.ReadAfterFold]>;
6385 def BlendCommuteImm2 : SDNodeXForm<imm, [{
6386 uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;
6390 def BlendCommuteImm4 : SDNodeXForm<imm, [{
6391 uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;
6395 def BlendCommuteImm8 : SDNodeXForm<imm, [{
6396 uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
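// Commuting a blend swaps which source each immediate bit selects, so the
// commuted immediate is the original XORed with the all-ones mask for the
// element count. A hand-written example (AT&T syntax, not from a test):
//   vblendps $0b0101, %xmm1, %xmm0, %xmm2
// computes the same result as
//   vblendps $0b1010, %xmm0, %xmm1, %xmm2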
6400 let Predicates = [HasAVX] in {
6401 let isCommutable = 0 in {
6402 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6403 VR128, load, i128mem, 0,
6404 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6407 let ExeDomain = SSEPackedSingle in
6408 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6409 VR128, load, f128mem, 0,
6410 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6411 let ExeDomain = SSEPackedDouble in
6412 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6413 VR128, load, f128mem, 0,
6414 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
6415 let ExeDomain = SSEPackedSingle in
6416 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6417 VR256, load, i256mem, 0,
6418 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6421 let Predicates = [HasAVX2] in {
6422 let isCommutable = 0 in {
6423 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6424 VR256, load, i256mem, 0,
6425 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6429 let Constraints = "$src1 = $dst" in {
6430 let isCommutable = 0 in {
6431 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6432 VR128, memop, i128mem, 1,
6433 SchedWriteMPSAD.XMM>;
6436 let ExeDomain = SSEPackedSingle in
6437 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6438 VR128, memop, f128mem, 1,
6439 SchedWriteDPPS.XMM>;
6440 let ExeDomain = SSEPackedDouble in
6441 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6442 VR128, memop, f128mem, 1,
6443 SchedWriteDPPD.XMM>;
6446 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
6447 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6448 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6449 X86MemOperand x86memop, bit Is2Addr, Domain d,
6450 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6451 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6452 let isCommutable = 1 in
6453 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6454 (ins RC:$src1, RC:$src2, u8imm:$src3),
6456 !strconcat(OpcodeStr,
6457 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6458 !strconcat(OpcodeStr,
6459 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6460 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
6462 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6463 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6465 !strconcat(OpcodeStr,
6466 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6467 !strconcat(OpcodeStr,
6468 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6470 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
6471 Sched<[sched.Folded, sched.ReadAfterFold]>;
6474 // Pattern to commute if load is in first source.
6475 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
6476 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6477 (commuteXForm imm:$src3))>;
6480 let Predicates = [HasAVX] in {
6481 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6482 VR128, load, f128mem, 0, SSEPackedSingle,
6483 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6485 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6486 VR256, load, f256mem, 0, SSEPackedSingle,
6487 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6488 VEX_4V, VEX_L, VEX_WIG;
6489 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6490 VR128, load, f128mem, 0, SSEPackedDouble,
6491 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6493 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6494 VR256, load, f256mem, 0, SSEPackedDouble,
6495 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6496 VEX_4V, VEX_L, VEX_WIG;
6497 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6498 VR128, load, i128mem, 0, SSEPackedInt,
6499 SchedWriteBlend.XMM, BlendCommuteImm8>,
6503 let Predicates = [HasAVX2] in {
6504 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6505 VR256, load, i256mem, 0, SSEPackedInt,
6506 SchedWriteBlend.YMM, BlendCommuteImm8>,
6507 VEX_4V, VEX_L, VEX_WIG;
6510 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6511 VR128, memop, f128mem, 1, SSEPackedSingle,
6512 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6513 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6514 VR128, memop, f128mem, 1, SSEPackedDouble,
6515 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6516 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6517 VR128, memop, i128mem, 1, SSEPackedInt,
6518 SchedWriteBlend.XMM, BlendCommuteImm8>;
6520 // For insertion into the zero index (low half) of a 256-bit vector, it is
6521 // more efficient to generate a blend with immediate instead of an insert*128.
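// For example, inserting an xmm value into the low half of a ymm register
// becomes a ymm blend whose immediate selects the low lanes from the widened
// xmm value (0x3 for v4f64, 0xf for v8f32) and keeps the high lanes of the
// original ymm.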
6522 let Predicates = [HasAVX] in {
6523 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6524 (VBLENDPDYrri VR256:$src1,
6525 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6526 VR128:$src2, sub_xmm), 0x3)>;
6527 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6528 (VBLENDPSYrri VR256:$src1,
6529 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6530 VR128:$src2, sub_xmm), 0xf)>;
/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
6534 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6535 X86MemOperand x86memop, ValueType VT,
6536 PatFrag mem_frag, SDNode OpNode,
6537 X86FoldableSchedWrite sched> {
6538 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6539 (ins RC:$src1, RC:$src2, RC:$src3),
6540 !strconcat(OpcodeStr,
6541 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6542 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6543 SSEPackedInt>, TAPD, VEX_4V,
6546 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6547 (ins RC:$src1, x86memop:$src2, RC:$src3),
6548 !strconcat(OpcodeStr,
6549 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6551 (OpNode RC:$src3, (mem_frag addr:$src2),
6552 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6553 Sched<[sched.Folded, sched.ReadAfterFold,
6555 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6558 sched.ReadAfterFold]>;
6561 let Predicates = [HasAVX] in {
6562 let ExeDomain = SSEPackedDouble in {
6563 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6564 v2f64, loadv2f64, X86Blendv,
6565 SchedWriteFVarBlend.XMM>;
6566 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6567 v4f64, loadv4f64, X86Blendv,
6568 SchedWriteFVarBlend.YMM>, VEX_L;
6569 } // ExeDomain = SSEPackedDouble
6570 let ExeDomain = SSEPackedSingle in {
6571 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6572 v4f32, loadv4f32, X86Blendv,
6573 SchedWriteFVarBlend.XMM>;
6574 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6575 v8f32, loadv8f32, X86Blendv,
6576 SchedWriteFVarBlend.YMM>, VEX_L;
6577 } // ExeDomain = SSEPackedSingle
6578 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6579 v16i8, loadv16i8, X86Blendv,
6580 SchedWriteVarBlend.XMM>;
6583 let Predicates = [HasAVX2] in {
6584 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6585 v32i8, loadv32i8, X86Blendv,
6586 SchedWriteVarBlend.YMM>, VEX_L;
6589 let Predicates = [HasAVX] in {
6590 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6591 (v4i32 VR128:$src2))),
6592 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6593 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6594 (v2i64 VR128:$src2))),
6595 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6596 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6597 (v8i32 VR256:$src2))),
6598 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6599 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6600 (v4i64 VR256:$src2))),
6601 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
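// In the patterns below, blend immediate 1 takes only element 0 from the
// second source, matching the movss/movsd semantics; when the load appears
// as the first operand, the operands are swapped and the inverted immediate
// (0xe for blendps, 2 for blendpd) is used instead.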
6607 let Predicates = [HasAVX, OptForSpeed] in {
6608 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6609 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6610 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6611 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6613 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6614 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6615 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6616 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6617 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6618 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6620 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6621 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6622 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6623 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6624 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6625 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6627 // Move low f32 and clear high bits.
6628 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6629 (SUBREG_TO_REG (i32 0),
6630 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6631 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6632 (i8 1))), sub_xmm)>;
6633 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6634 (SUBREG_TO_REG (i32 0),
6635 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6636 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6637 (i8 3))), sub_xmm)>;
6639 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
6640 (SUBREG_TO_REG (i32 0),
6641 (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
6642 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
6643 (i8 1))), sub_xmm)>;
6644 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
6645 (SUBREG_TO_REG (i32 0),
6646 (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
6647 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
6648 (i8 0xf))), sub_xmm)>;
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
6654 let Predicates = [UseSSE41, OptForSpeed] in {
6655 // With SSE41 we can use blends for these patterns.
6656 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6657 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6658 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6659 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6661 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6662 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6663 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6664 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6665 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6666 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6668 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6669 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6670 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6671 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6672 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6673 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6677 /// SS41I_ternary - SSE 4.1 ternary operator
6678 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6679 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6680 PatFrag mem_frag, X86MemOperand x86memop,
6681 SDNode OpNode, X86FoldableSchedWrite sched> {
6682 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6683 (ins VR128:$src1, VR128:$src2),
6684 !strconcat(OpcodeStr,
6685 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6687 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6690 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6691 (ins VR128:$src1, x86memop:$src2),
6692 !strconcat(OpcodeStr,
6693 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6695 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6696 Sched<[sched.Folded, sched.ReadAfterFold]>;
6700 let ExeDomain = SSEPackedDouble in
6701 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6702 X86Blendv, SchedWriteFVarBlend.XMM>;
6703 let ExeDomain = SSEPackedSingle in
6704 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6705 X86Blendv, SchedWriteFVarBlend.XMM>;
6706 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6707 X86Blendv, SchedWriteVarBlend.XMM>;
6709 // Aliases with the implicit xmm0 argument
6710 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6711 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6712 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6713 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6714 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6715 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6716 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6717 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6718 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6719 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6720 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6721 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6723 let Predicates = [UseSSE41] in {
6724 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6725 (v4i32 VR128:$src2))),
6726 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6727 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6728 (v2i64 VR128:$src2))),
6729 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6732 let AddedComplexity = 400 in { // Prefer non-temporal versions
6734 let Predicates = [HasAVX, NoVLX] in
6735 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6736 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6737 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6738 let Predicates = [HasAVX2, NoVLX] in
6739 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6740 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6741 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6742 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6743 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6744 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6746 let Predicates = [HasAVX2, NoVLX] in {
6747 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6748 (VMOVNTDQAYrm addr:$src)>;
6749 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6750 (VMOVNTDQAYrm addr:$src)>;
6751 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6752 (VMOVNTDQAYrm addr:$src)>;
6753 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6754 (VMOVNTDQAYrm addr:$src)>;
6755 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6756 (VMOVNTDQAYrm addr:$src)>;
6757 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6758 (VMOVNTDQAYrm addr:$src)>;
6761 let Predicates = [HasAVX, NoVLX] in {
6762 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6763 (VMOVNTDQArm addr:$src)>;
6764 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6765 (VMOVNTDQArm addr:$src)>;
6766 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6767 (VMOVNTDQArm addr:$src)>;
6768 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6769 (VMOVNTDQArm addr:$src)>;
6770 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6771 (VMOVNTDQArm addr:$src)>;
6772 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6773 (VMOVNTDQArm addr:$src)>;
6776 let Predicates = [UseSSE41] in {
6777 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6778 (MOVNTDQArm addr:$src)>;
6779 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6780 (MOVNTDQArm addr:$src)>;
6781 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6782 (MOVNTDQArm addr:$src)>;
6783 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6784 (MOVNTDQArm addr:$src)>;
6785 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6786 (MOVNTDQArm addr:$src)>;
6787 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6788 (MOVNTDQArm addr:$src)>;
6791 } // AddedComplexity
6793 //===----------------------------------------------------------------------===//
6794 // SSE4.2 - Compare Instructions
6795 //===----------------------------------------------------------------------===//
6797 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
6798 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6799 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6800 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6801 bit Is2Addr = 1> {
6802 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6803 (ins RC:$src1, RC:$src2),
6804 !if(Is2Addr,
6805 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6806 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6807 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6808 Sched<[sched]>;
6809 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6810 (ins RC:$src1, x86memop:$src2),
6811 !if(Is2Addr,
6812 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6813 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6814 [(set RC:$dst,
6815 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6816 Sched<[sched.Folded, sched.ReadAfterFold]>;
6817 }
6819 let Predicates = [HasAVX] in
6820 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6821 load, i128mem, SchedWriteVecALU.XMM, 0>,
6822 VEX_4V, VEX_WIG;
6824 let Predicates = [HasAVX2] in
6825 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6826 load, i256mem, SchedWriteVecALU.YMM, 0>,
6827 VEX_4V, VEX_L, VEX_WIG;
6829 let Constraints = "$src1 = $dst" in
6830 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6831 memop, i128mem, SchedWriteVecALU.XMM>;
6833 //===----------------------------------------------------------------------===//
6834 // SSE4.2 - String/text Processing Instructions
6835 //===----------------------------------------------------------------------===//
6837 multiclass pcmpistrm_SS42AI<string asm> {
6838 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6839 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6840 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6841 []>, Sched<[WritePCmpIStrM]>;
6843 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6844 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6845 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6846 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6849 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6850 let Predicates = [HasAVX] in
6851 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6852 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
6855 multiclass SS42AI_pcmpestrm<string asm> {
6856 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6857 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6858 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6859 []>, Sched<[WritePCmpEStrM]>;
6861 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6862 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6863 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6864 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6867 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6868 let Predicates = [HasAVX] in
6869 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6870 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6873 multiclass SS42AI_pcmpistri<string asm> {
6874 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6875 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6876 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6877 []>, Sched<[WritePCmpIStrI]>;
6879 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6880 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6881 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6882 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6885 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6886 let Predicates = [HasAVX] in
6887 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6888 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6891 multiclass SS42AI_pcmpestri<string asm> {
6892 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6893 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6894 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6895 []>, Sched<[WritePCmpEStrI]>;
6897 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6898 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6899 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6900 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6903 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6904 let Predicates = [HasAVX] in
6905 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6906 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6909 //===----------------------------------------------------------------------===//
6910 // SSE4.2 - CRC Instructions
6911 //===----------------------------------------------------------------------===//
6913 // No CRC instructions have AVX equivalents
6915 // crc intrinsic instruction
6916 // These instructions only have rm forms; the only difference is the size
6917 // of the register and memory operands.
6918 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6919 RegisterClass RCIn, SDPatternOperator Int> :
6920 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6921 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6922 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6923 Sched<[WriteCRC32]>;
6925 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6926 X86MemOperand x86memop, SDPatternOperator Int> :
6927 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6928 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6929 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6930 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6932 let Constraints = "$src1 = $dst" in {
6933 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6934 int_x86_sse42_crc32_32_8>;
6935 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6936 int_x86_sse42_crc32_32_8>;
6937 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6938 int_x86_sse42_crc32_32_16>, OpSize16;
6939 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6940 int_x86_sse42_crc32_32_16>, OpSize16;
6941 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6942 int_x86_sse42_crc32_32_32>, OpSize32;
6943 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6944 int_x86_sse42_crc32_32_32>, OpSize32;
6945 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6946 int_x86_sse42_crc32_64_64>, REX_W;
6947 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6948 int_x86_sse42_crc32_64_64>, REX_W;
6949 let hasSideEffects = 0 in {
6950 let mayLoad = 1 in
6951 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6952 null_frag>, REX_W;
6953 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6954 null_frag>, REX_W;
6955 }
6956 }
6958 //===----------------------------------------------------------------------===//
6959 // SHA-NI Instructions
6960 //===----------------------------------------------------------------------===//
6962 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6963 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6964 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6965 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6966 (ins VR128:$src1, VR128:$src2),
6967 !if(UsesXMM0,
6968 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6969 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6970 [!if(UsesXMM0,
6971 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6972 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6973 T8, Sched<[sched]>;
6975 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6976 (ins VR128:$src1, i128mem:$src2),
6977 !if(UsesXMM0,
6978 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6979 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6980 [!if(UsesXMM0,
6981 (set VR128:$dst, (IntId VR128:$src1,
6982 (memop addr:$src2), XMM0)),
6983 (set VR128:$dst, (IntId VR128:$src1,
6984 (memop addr:$src2))))]>, T8,
6985 Sched<[sched.Folded, sched.ReadAfterFold]>;
6986 }
6988 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6989 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6990 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6991 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6992 [(set VR128:$dst,
6993 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6994 (i8 imm:$src3)))]>, TA,
6995 Sched<[SchedWriteVecIMul.XMM]>;
6996 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6997 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6998 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6999 [(set VR128:$dst,
7000 (int_x86_sha1rnds4 VR128:$src1,
7001 (memop addr:$src2),
7002 (i8 imm:$src3)))]>, TA,
7003 Sched<[SchedWriteVecIMul.XMM.Folded,
7004 SchedWriteVecIMul.XMM.ReadAfterFold]>;
7006 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
7007 SchedWriteVecIMul.XMM>;
7008 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
7009 SchedWriteVecIMul.XMM>;
7010 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
7011 SchedWriteVecIMul.XMM>;
7014 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
7015 SchedWriteVecIMul.XMM, 1>;
7017 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
7018 SchedWriteVecIMul.XMM>;
7019 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
7020 SchedWriteVecIMul.XMM>;
7023 // Aliases with explicit %xmm0
7024 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
7025 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
7026 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
7027 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
7029 //===----------------------------------------------------------------------===//
7030 // AES-NI Instructions
7031 //===----------------------------------------------------------------------===//
7033 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
7034 Intrinsic IntId, PatFrag ld_frag,
7035 bit Is2Addr = 0, RegisterClass RC = VR128,
7036 X86MemOperand MemOp = i128mem> {
7037 let AsmString = OpcodeStr##
7038 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
7039 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
7040 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
7041 (ins RC:$src1, RC:$src2), "",
7042 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
7043 Sched<[WriteAESDecEnc]>;
7044 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
7045 (ins RC:$src1, MemOp:$src2), "",
7046 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
7047 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
7051 // Perform One Round of an AES Encryption/Decryption Flow
7052 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
7053 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
7054 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
7055 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
7056 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
7057 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
7058 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
7059 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
7060 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
7063 let Predicates = [NoVLX, HasVAES] in {
7064 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
7065 int_x86_aesni_aesenc_256, load, 0, VR256,
7066 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7067 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
7068 int_x86_aesni_aesenclast_256, load, 0, VR256,
7069 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7070 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
7071 int_x86_aesni_aesdec_256, load, 0, VR256,
7072 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7073 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
7074 int_x86_aesni_aesdeclast_256, load, 0, VR256,
7075 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7078 let Constraints = "$src1 = $dst" in {
7079 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
7080 int_x86_aesni_aesenc, memop, 1>;
7081 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
7082 int_x86_aesni_aesenclast, memop, 1>;
7083 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
7084 int_x86_aesni_aesdec, memop, 1>;
7085 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
7086 int_x86_aesni_aesdeclast, memop, 1>;
7089 // Perform the AES InvMixColumn Transformation
7090 let Predicates = [HasAVX, HasAES] in {
7091 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7092 (ins VR128:$src1),
7093 "vaesimc\t{$src1, $dst|$dst, $src1}",
7094 [(set VR128:$dst,
7095 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
7096 VEX, VEX_WIG;
7097 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7098 (ins i128mem:$src1),
7099 "vaesimc\t{$src1, $dst|$dst, $src1}",
7100 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
7101 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
7102 }
7103 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7104 (ins VR128:$src1),
7105 "aesimc\t{$src1, $dst|$dst, $src1}",
7106 [(set VR128:$dst,
7107 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
7108 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7109 (ins i128mem:$src1),
7110 "aesimc\t{$src1, $dst|$dst, $src1}",
7111 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
7112 Sched<[WriteAESIMC.Folded]>;
7114 // AES Round Key Generation Assist
7115 let Predicates = [HasAVX, HasAES] in {
7116 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7117 (ins VR128:$src1, u8imm:$src2),
7118 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7119 [(set VR128:$dst,
7120 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7121 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
7122 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7123 (ins i128mem:$src1, u8imm:$src2),
7124 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7125 [(set VR128:$dst,
7126 (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
7127 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
7128 }
7129 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7130 (ins VR128:$src1, u8imm:$src2),
7131 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7132 [(set VR128:$dst,
7133 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7134 Sched<[WriteAESKeyGen]>;
7135 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7136 (ins i128mem:$src1, u8imm:$src2),
7137 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7138 [(set VR128:$dst,
7139 (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
7140 Sched<[WriteAESKeyGen.Folded]>;
7142 //===----------------------------------------------------------------------===//
7143 // PCLMUL Instructions
7144 //===----------------------------------------------------------------------===//
7146 // Immediate transform to help with commuting.
7147 def PCLMULCommuteImm : SDNodeXForm<imm, [{
7148 uint8_t Imm = N->getZExtValue();
7149 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
7150 }]>;
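// Example of the transform above: imm[0] selects the quadword of the first
// source and imm[4] the quadword of the second, so commuting the operands
// swaps the two nibbles, e.g. 0x01 <-> 0x10, while 0x00 and 0x11 are unchanged.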
7152 // SSE carry-less Multiplication instructions
7153 let Predicates = [NoAVX, HasPCLMUL] in {
7154 let Constraints = "$src1 = $dst" in {
7155 let isCommutable = 1 in
7156 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7157 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7158 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7159 [(set VR128:$dst,
7160 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
7161 Sched<[WriteCLMul]>;
7163 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7164 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7165 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7166 [(set VR128:$dst,
7167 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
7168 imm:$src3))]>,
7169 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
7170 } // Constraints = "$src1 = $dst"
7172 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
7173 (i8 imm:$src3)),
7174 (PCLMULQDQrm VR128:$src1, addr:$src2,
7175 (PCLMULCommuteImm imm:$src3))>;
7176 } // Predicates = [NoAVX, HasPCLMUL]
7178 // SSE aliases
7179 foreach HI = ["hq","lq"] in
7180 foreach LO = ["hq","lq"] in {
7181 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
7182 (PCLMULQDQrr VR128:$dst, VR128:$src,
7183 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
7184 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
7185 (PCLMULQDQrm VR128:$dst, i128mem:$src,
7186 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
7187 }
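// The computed immediate is ((LO == "hq") << 4) | (HI == "hq"), so these
// aliases expand to: pclmullqlqdq -> 0x00, pclmulhqlqdq -> 0x01,
// pclmullqhqdq -> 0x10, pclmulhqhqdq -> 0x11.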
7189 // AVX carry-less Multiplication instructions
7190 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
7191 PatFrag LdFrag, Intrinsic IntId> {
7192 let isCommutable = 1 in
7193 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
7194 (ins RC:$src1, RC:$src2, u8imm:$src3),
7195 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7196 [(set RC:$dst,
7197 (IntId RC:$src1, RC:$src2, imm:$src3))]>,
7198 Sched<[WriteCLMul]>;
7200 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
7201 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
7202 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7203 [(set RC:$dst,
7204 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
7205 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
7207 // We can commute a load in the first operand by swapping the sources and
7208 // rotating the immediate.
7209 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
7210 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
7211 (PCLMULCommuteImm imm:$src3))>;
7212 }
7214 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
7215 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
7216 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
7218 let Predicates = [NoVLX, HasVPCLMULQDQ] in
7219 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
7220 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
7222 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
7223 X86MemOperand MemOp, string Hi, string Lo> {
7224 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7225 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
7226 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7227 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7228 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
7229 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7232 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
7233 X86MemOperand MemOp> {
7234 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
7235 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
7236 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
7237 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
7241 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
7242 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
7244 //===----------------------------------------------------------------------===//
7245 // SSE4A Instructions
7246 //===----------------------------------------------------------------------===//
7248 let Predicates = [HasSSE4A] in {
7250 let ExeDomain = SSEPackedInt in {
7251 let Constraints = "$src = $dst" in {
7252 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
7253 (ins VR128:$src, u8imm:$len, u8imm:$idx),
7254 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7255 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
7257 PD, Sched<[SchedWriteVecALU.XMM]>;
7258 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7259 (ins VR128:$src, VR128:$mask),
7260 "extrq\t{$mask, $src|$src, $mask}",
7261 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
7263 PD, Sched<[SchedWriteVecALU.XMM]>;
7265 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
7266 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
7267 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7268 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7269 imm:$len, imm:$idx))]>,
7270 XD, Sched<[SchedWriteVecALU.XMM]>;
7271 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7272 (ins VR128:$src, VR128:$mask),
7273 "insertq\t{$mask, $src|$src, $mask}",
7274 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7276 XD, Sched<[SchedWriteVecALU.XMM]>;
7278 } // ExeDomain = SSEPackedInt
7280 // Non-temporal (unaligned) scalar stores.
7281 let AddedComplexity = 400 in { // Prefer non-temporal versions
7282 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
7283 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7284 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
7286 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7287 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
7290 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7291 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7293 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7294 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7296 } // AddedComplexity
7297 } // HasSSE4A
7299 //===----------------------------------------------------------------------===//
7300 // AVX Instructions
7301 //===----------------------------------------------------------------------===//
7303 //===----------------------------------------------------------------------===//
7304 // VBROADCAST - Load from memory and broadcast to all elements of the
7305 // destination operand
7307 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7308 X86MemOperand x86memop, ValueType VT,
7309 PatFrag ld_frag, SchedWrite Sched> :
7310 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7311 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7312 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
7313 Sched<[Sched]>, VEX;
7315 // AVX2 adds register forms
7316 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7317 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7318 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7319 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7320 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7321 Sched<[Sched]>, VEX;
7323 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7324 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7325 f32mem, v4f32, loadf32,
7326 SchedWriteFShuffle.XMM.Folded>;
7327 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7328 f32mem, v8f32, loadf32,
7329 SchedWriteFShuffle.XMM.Folded>, VEX_L;
7331 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7332 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7333 v4f64, loadf64,
7334 SchedWriteFShuffle.XMM.Folded>, VEX_L;
7336 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7337 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7338 v4f32, v4f32, SchedWriteFShuffle.XMM>;
7339 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7340 v8f32, v4f32, WriteFShuffle256>, VEX_L;
7342 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7343 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7344 v4f64, v2f64, WriteFShuffle256>, VEX_L;
7346 let Predicates = [HasAVX, NoVLX] in {
7347 def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7348 (VBROADCASTSSrm addr:$src)>;
7349 def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7350 (VBROADCASTSSYrm addr:$src)>;
7351 def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
7352 (VBROADCASTSDYrm addr:$src)>;
7355 //===----------------------------------------------------------------------===//
7356 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7357 // halves of a 256-bit vector.
7359 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7360 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7361 (ins i128mem:$src),
7362 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7363 Sched<[WriteShuffleLd]>, VEX, VEX_L;
7365 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7366 ExeDomain = SSEPackedSingle in
7367 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7368 (ins f128mem:$src),
7369 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7370 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7372 let Predicates = [HasAVX2, NoVLX] in {
7373 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7374 (VBROADCASTI128 addr:$src)>;
7375 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7376 (VBROADCASTI128 addr:$src)>;
7377 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7378 (VBROADCASTI128 addr:$src)>;
7379 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7380 (VBROADCASTI128 addr:$src)>;
7383 let Predicates = [HasAVX, NoVLX] in {
7384 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7385 (VBROADCASTF128 addr:$src)>;
7386 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7387 (VBROADCASTF128 addr:$src)>;
7390 let Predicates = [HasAVX1Only] in {
7391 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7392 (VBROADCASTF128 addr:$src)>;
7393 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7394 (VBROADCASTF128 addr:$src)>;
7395 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7396 (VBROADCASTF128 addr:$src)>;
7397 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7398 (VBROADCASTF128 addr:$src)>;
7401 //===----------------------------------------------------------------------===//
7402 // VINSERTF128 - Insert packed floating-point values
7404 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7405 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7406 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7407 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7408 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7410 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7411 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7412 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7413 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7416 // To create a 256-bit all-ones value, we should produce VCMPTRUEPS
7417 // with a YMM register containing zero.
7418 // FIXME: Avoid producing vxorps to clear the fake inputs.
7419 let Predicates = [HasAVX1Only] in {
7420 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
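// The 0xf immediate encodes an always-true comparison predicate, so comparing
// a zeroed YMM register with itself yields all-ones in every element.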
7423 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7424 PatFrag memop_frag> {
7425 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7427 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7428 (INSERT_get_vinsert128_imm VR256:$ins))>;
7429 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7430 (From (memop_frag addr:$src2)),
7432 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7433 (INSERT_get_vinsert128_imm VR256:$ins))>;
7436 let Predicates = [HasAVX, NoVLX] in {
7437 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7438 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7441 let Predicates = [HasAVX1Only] in {
7442 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7443 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7444 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7445 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7448 //===----------------------------------------------------------------------===//
7449 // VEXTRACTF128 - Extract packed floating-point values
7451 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7452 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7453 (ins VR256:$src1, u8imm:$src2),
7454 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7455 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7457 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7458 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7459 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7460 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7463 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7464 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7465 (To (!cast<Instruction>(InstrStr#rr)
7467 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7468 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7469 (iPTR imm))), addr:$dst),
7470 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7471 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7475 let Predicates = [HasAVX, NoVLX] in {
7476 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7477 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7480 let Predicates = [HasAVX1Only] in {
7481 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7482 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7483 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7484 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7487 //===----------------------------------------------------------------------===//
7488 // VMASKMOV - Conditional SIMD Packed Loads and Stores
7490 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7491 Intrinsic IntLd, Intrinsic IntLd256,
7492 Intrinsic IntSt, Intrinsic IntSt256> {
7493 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7494 (ins VR128:$src1, f128mem:$src2),
7495 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7496 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7497 VEX_4V, Sched<[WriteFMaskedLoad]>;
7498 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7499 (ins VR256:$src1, f256mem:$src2),
7500 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7501 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7502 VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
7503 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7504 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7505 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7506 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7507 VEX_4V, Sched<[WriteFMaskedStore]>;
7508 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7509 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7510 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7511 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7512 VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
7515 let ExeDomain = SSEPackedSingle in
7516 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7517 int_x86_avx_maskload_ps,
7518 int_x86_avx_maskload_ps_256,
7519 int_x86_avx_maskstore_ps,
7520 int_x86_avx_maskstore_ps_256>;
7521 let ExeDomain = SSEPackedDouble in
7522 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7523 int_x86_avx_maskload_pd,
7524 int_x86_avx_maskload_pd_256,
7525 int_x86_avx_maskstore_pd,
7526 int_x86_avx_maskstore_pd_256>;
7528 //===----------------------------------------------------------------------===//
7529 // VPERMIL - Permute Single and Double Floating-Point Values
7532 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7533 RegisterClass RC, X86MemOperand x86memop_f,
7534 X86MemOperand x86memop_i,
7535 ValueType f_vt, ValueType i_vt,
7536 X86FoldableSchedWrite sched,
7537 X86FoldableSchedWrite varsched> {
7538 let Predicates = [HasAVX, NoVLX] in {
7539 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7540 (ins RC:$src1, RC:$src2),
7541 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7542 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7544 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7545 (ins RC:$src1, x86memop_i:$src2),
7546 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7547 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7548 (i_vt (load addr:$src2)))))]>, VEX_4V,
7549 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7551 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7552 (ins RC:$src1, u8imm:$src2),
7553 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7554 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
7556 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7557 (ins x86memop_f:$src1, u8imm:$src2),
7558 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7560 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
7561 Sched<[sched.Folded]>;
7562 }// Predicates = [HasAVX, NoVLX]
7565 let ExeDomain = SSEPackedSingle in {
7566 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7567 v4f32, v4i32, SchedWriteFShuffle.XMM,
7568 SchedWriteFVarShuffle.XMM>;
7569 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7570 v8f32, v8i32, SchedWriteFShuffle.YMM,
7571 SchedWriteFVarShuffle.YMM>, VEX_L;
7573 let ExeDomain = SSEPackedDouble in {
7574 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7575 v2f64, v2i64, SchedWriteFShuffle.XMM,
7576 SchedWriteFVarShuffle.XMM>;
7577 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7578 v4f64, v4i64, SchedWriteFShuffle.YMM,
7579 SchedWriteFVarShuffle.YMM>, VEX_L;
7582 //===----------------------------------------------------------------------===//
7583 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7586 let ExeDomain = SSEPackedSingle in {
7587 let isCommutable = 1 in
7588 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7589 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7590 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7591 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7592 (i8 imm:$src3))))]>, VEX_4V, VEX_L,
7593 Sched<[WriteFShuffle256]>;
7594 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7595 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7596 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7597 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7598 (i8 imm:$src3)))]>, VEX_4V, VEX_L,
7599 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7602 // Immediate transform to help with commuting.
7603 def Perm2XCommuteImm : SDNodeXForm<imm, [{
7604 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7605 }]>;
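// Example: bits 1 and 5 of the vperm2f128/vperm2i128 immediate select which
// source operand each 128-bit half comes from, so XOR'ing the immediate with
// 0x22 flips both selectors and compensates for swapping the two sources.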
7607 let Predicates = [HasAVX] in {
7608 // Pattern with load in other operand.
7609 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7610 VR256:$src1, (i8 imm:$imm))),
7611 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7614 let Predicates = [HasAVX1Only] in {
7615 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7616 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7617 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7618 (loadv4i64 addr:$src2), (i8 imm:$imm))),
7619 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
7620 // Pattern with load in other operand.
7621 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7622 VR256:$src1, (i8 imm:$imm))),
7623 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7626 //===----------------------------------------------------------------------===//
7627 // VZERO - Zero YMM registers
7628 // Note: These instructions do not affect YMM16-YMM31.
7631 let SchedRW = [WriteSystem] in {
7632 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7633 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7634 // Zero All YMM registers
7635 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7636 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7637 Requires<[HasAVX]>, VEX_WIG;
7639 // Zero Upper bits of YMM registers
7640 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7641 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7642 Requires<[HasAVX]>, VEX_WIG;
7646 //===----------------------------------------------------------------------===//
7647 // Half precision conversion instructions
7650 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7651 X86FoldableSchedWrite sched> {
7652 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7653 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7654 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7655 T8PD, VEX, Sched<[sched]>;
7656 let hasSideEffects = 0, mayLoad = 1 in
7657 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7658 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7659 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
7660 T8PD, VEX, Sched<[sched.Folded]>;
7663 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7664 SchedWrite RR, SchedWrite MR> {
7665 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7666 (ins RC:$src1, i32u8imm:$src2),
7667 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7668 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
7669 TAPD, VEX, Sched<[RR]>;
7670 let hasSideEffects = 0, mayStore = 1 in
7671 def mr : Ii8<0x1D, MRMDestMem, (outs),
7672 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7673 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7674 TAPD, VEX, Sched<[MR]>;
7677 let Predicates = [HasF16C, NoVLX] in {
7678 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
7679 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
7680 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7681 WriteCvtPS2PHSt>;
7682 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7683 WriteCvtPS2PHYSt>, VEX_L;
7685 // Pattern match vcvtph2ps of a scalar i64 load.
7686 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
7687 (VCVTPH2PSrm addr:$src)>;
7688 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
7689 (VCVTPH2PSrm addr:$src)>;
7690 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
7691 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
7692 (VCVTPH2PSrm addr:$src)>;
7694 def : Pat<(store (f64 (extractelt
7695 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7696 (iPTR 0))), addr:$dst),
7697 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7698 def : Pat<(store (i64 (extractelt
7699 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7700 (iPTR 0))), addr:$dst),
7701 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7702 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
7703 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
7706 // Patterns for matching conversions from float to half-float and vice versa.
7707 let Predicates = [HasF16C, NoVLX] in {
7708 // Use MXCSR.RC for rounding instead of explicitly specifying the default
7709 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
7710 // configurations we support (the default). However, falling back to MXCSR is
7711 // more consistent with other instructions, which are always controlled by it.
7712 // It's encoded as 0b100.
7713 def : Pat<(fp_to_f16 FR32:$src),
7714 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
7715 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
7717 def : Pat<(f16_to_fp GR16:$src),
7718 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7719 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
7721 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
7722 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7723 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
7726 //===----------------------------------------------------------------------===//
7727 // AVX2 Instructions
7728 //===----------------------------------------------------------------------===//
7730 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
7731 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7732 ValueType OpVT, X86FoldableSchedWrite sched,
7733 RegisterClass RC,
7734 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7735 let isCommutable = 1 in
7736 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7737 (ins RC:$src1, RC:$src2, u8imm:$src3),
7738 !strconcat(OpcodeStr,
7739 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7740 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
7741 Sched<[sched]>, VEX_4V;
7742 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7743 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7744 !strconcat(OpcodeStr,
7745 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7746 [(set RC:$dst,
7747 (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
7748 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7750 // Pattern to commute if load is in first source.
7751 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
7752 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7753 (commuteXForm imm:$src3))>;
7754 }
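// Swapping the two sources of a blend selects the opposite operand in every
// element, so commuteXForm (BlendCommuteImm4/BlendCommuteImm8 below) inverts
// the used bits of the blend mask.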
7756 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7757 SchedWriteBlend.XMM, VR128, i128mem,
7758 BlendCommuteImm4>;
7759 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7760 SchedWriteBlend.YMM, VR256, i256mem,
7761 BlendCommuteImm8>, VEX_L;
7763 // For insertion into the zero index (low half) of a 256-bit vector, it is
7764 // more efficient to generate a blend with immediate instead of an insert*128.
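// In the patterns below, the 0xf immediate makes the blend take the low four
// dwords from the operand holding the new 128-bit value and the high four
// from the original 256-bit source.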
7765 let Predicates = [HasAVX2] in {
7766 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7767 (VPBLENDDYrri VR256:$src1,
7768 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7769 VR128:$src2, sub_xmm), 0xf)>;
7770 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7771 (VPBLENDDYrri VR256:$src1,
7772 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7773 VR128:$src2, sub_xmm), 0xf)>;
7774 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7775 (VPBLENDDYrri VR256:$src1,
7776 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7777 VR128:$src2, sub_xmm), 0xf)>;
7778 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7779 (VPBLENDDYrri VR256:$src1,
7780 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7781 VR128:$src2, sub_xmm), 0xf)>;
7784 let Predicates = [HasAVX1Only] in {
7785 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7786 (VBLENDPSYrri VR256:$src1,
7787 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7788 VR128:$src2, sub_xmm), 0xf)>;
7789 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7790 (VBLENDPSYrri VR256:$src1,
7791 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7792 VR128:$src2, sub_xmm), 0xf)>;
7793 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7794 (VBLENDPSYrri VR256:$src1,
7795 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7796 VR128:$src2, sub_xmm), 0xf)>;
7797 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7798 (VBLENDPSYrri VR256:$src1,
7799 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7800 VR128:$src2, sub_xmm), 0xf)>;
7803 //===----------------------------------------------------------------------===//
7804 // VPBROADCAST - Load from memory and broadcast to all elements of the
7805 // destination operand
7807 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7808 X86MemOperand x86memop, PatFrag ld_frag,
7809 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7810 let Predicates = [HasAVX2, prd] in {
7811 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7812 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7813 [(set VR128:$dst,
7814 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7815 Sched<[SchedWriteShuffle.XMM]>, VEX;
7816 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7817 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7818 [(set VR128:$dst,
7819 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
7820 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7821 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7822 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7823 [(set VR256:$dst,
7824 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7825 Sched<[WriteShuffle256]>, VEX, VEX_L;
7826 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7827 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7828 [(set VR256:$dst,
7829 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
7830 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7832 // Provide aliases for broadcast from the same register class that
7833 // automatically does the extract.
7834 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7835 (!cast<Instruction>(NAME#"Yrr")
7836 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7840 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
7841 v16i8, v32i8, NoVLX_Or_NoBWI>;
7842 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
7843 v8i16, v16i16, NoVLX_Or_NoBWI>;
7844 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
7845 v4i32, v8i32, NoVLX>;
7846 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
7847 v2i64, v4i64, NoVLX>;
7849 let Predicates = [HasAVX2, NoVLX] in {
7850 // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
7851 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
7852 (VPBROADCASTQrm addr:$src)>;
7853 def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
7854 (VPBROADCASTQYrm addr:$src)>;
7856 def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
7857 (VPBROADCASTDrm addr:$src)>;
7858 def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
7859 (VPBROADCASTDYrm addr:$src)>;
7860 def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
7861 (VPBROADCASTQrm addr:$src)>;
7862 def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
7863 (VPBROADCASTQYrm addr:$src)>;
7865 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7866 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7867 // This means we'll encounter truncated i32 loads; match that here.
7868 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7869 (VPBROADCASTWrm addr:$src)>;
7870 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7871 (VPBROADCASTWYrm addr:$src)>;
7872 def : Pat<(v8i16 (X86VBroadcast
7873 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7874 (VPBROADCASTWrm addr:$src)>;
7875 def : Pat<(v16i16 (X86VBroadcast
7876 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7877 (VPBROADCASTWYrm addr:$src)>;
7880 let Predicates = [HasAVX2, NoVLX] in {
7881 // Provide aliases for broadcast from the same register class that
7882 // automatically does the extract.
7883 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
7884 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
7885 sub_xmm)))>;
7886 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
7887 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
7888 sub_xmm)))>;
7889 }
7891 let Predicates = [HasAVX2, NoVLX] in {
7892 // Provide fallback in case the load node that is used in the patterns above
7893 // is used by additional users, which prevents the pattern selection.
7894 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7895 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7896 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7897 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7898 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7899 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7902 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7903 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7904 (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
7905 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7906 GR8:$src, sub_8bit)),
7907 VR128)))>;
7908 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7909 (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
7910 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7911 GR8:$src, sub_8bit)),
7912 VR128)))>;
7914 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7915 (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
7916 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7917 GR16:$src, sub_16bit)),
7918 VR128)))>;
7919 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7920 (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
7921 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7922 GR16:$src, sub_16bit)),
7923 VR128)))>;
7924 }
7925 let Predicates = [HasAVX2, NoVLX] in {
7926 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7927 (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
7928 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7929 (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
7930 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7931 (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
7932 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7933 (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
7936 // AVX1 broadcast patterns
7937 let Predicates = [HasAVX1Only] in {
7938 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
7939 (VBROADCASTSSYrm addr:$src)>;
7940 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
7941 (VBROADCASTSDYrm addr:$src)>;
7942 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
7943 (VBROADCASTSSrm addr:$src)>;
7946 // Provide fallback in case the load node that is used in the patterns above
7947 // is used by additional users, which prevents the pattern selection.
7948 let Predicates = [HasAVX, NoVLX] in {
7949 // 128bit broadcasts:
7950 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7951 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7952 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
7953 (VMOVDDUPrm addr:$src)>;
7955 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7956 (VMOVDDUPrr VR128:$src)>;
7957 def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
7958 (VMOVDDUPrm addr:$src)>;
7959 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
7960 (VMOVDDUPrm addr:$src)>;
7963 let Predicates = [HasAVX1Only] in {
7964 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7965 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7966 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7967 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7968 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7969 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7970 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7971 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7972 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7973 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7975 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7976 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
7977 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7978 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7979 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
7980 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
7981 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7982 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7983 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
7984 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
7986 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7987 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
7988 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
7989 (VMOVDDUPrm addr:$src)>;
7992 //===----------------------------------------------------------------------===//
7993 // VPERM - Permute instructions
7996 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7997 ValueType OpVT, X86FoldableSchedWrite Sched,
7998 X86MemOperand memOp> {
7999 let Predicates = [HasAVX2, NoVLX] in {
8000 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8001 (ins VR256:$src1, VR256:$src2),
8002 !strconcat(OpcodeStr,
8003 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8004 [(set VR256:$dst,
8005 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
8006 Sched<[Sched]>, VEX_4V, VEX_L;
8007 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8008 (ins VR256:$src1, memOp:$src2),
8009 !strconcat(OpcodeStr,
8010 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8011 [(set VR256:$dst,
8012 (OpVT (X86VPermv VR256:$src1,
8013 (load addr:$src2))))]>,
8014 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
8018 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
8019 let ExeDomain = SSEPackedSingle in
8020 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
8022 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8023 ValueType OpVT, X86FoldableSchedWrite Sched,
8024 X86MemOperand memOp> {
8025 let Predicates = [HasAVX2, NoVLX] in {
8026 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
8027 (ins VR256:$src1, u8imm:$src2),
8028 !strconcat(OpcodeStr,
8029 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8030 [(set VR256:$dst,
8031 (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
8032 Sched<[Sched]>, VEX, VEX_L;
8033 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
8034 (ins memOp:$src1, u8imm:$src2),
8035 !strconcat(OpcodeStr,
8036 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8037 [(set VR256:$dst,
8038 (OpVT (X86VPermi (mem_frag addr:$src1),
8039 (i8 imm:$src2))))]>,
8040 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
8044 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
8045 WriteShuffle256, i256mem>, VEX_W;
8046 let ExeDomain = SSEPackedDouble in
8047 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
8048 WriteFShuffle256, f256mem>, VEX_W;
8050 //===----------------------------------------------------------------------===//
8051 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
8053 let isCommutable = 1 in
8054 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
8055 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
8056 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8057 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8058 (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
8059 VEX_4V, VEX_L;
8060 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
8061 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
8062 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8063 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
8064 (i8 imm:$src3)))]>,
8065 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
8067 let Predicates = [HasAVX2] in
8068 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
8069 VR256:$src1, (i8 imm:$imm))),
8070 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
8073 //===----------------------------------------------------------------------===//
8074 // VINSERTI128 - Insert packed integer values
8076 let hasSideEffects = 0 in {
8077 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
8078 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
8079 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8080 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
8082 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
8083 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
8084 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8085 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
8088 let Predicates = [HasAVX2, NoVLX] in {
8089 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
8090 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
8091 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
8092 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
8095 //===----------------------------------------------------------------------===//
8096 // VEXTRACTI128 - Extract packed integer values
8098 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8099 (ins VR256:$src1, u8imm:$src2),
8100 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8101 Sched<[WriteShuffle256]>, VEX, VEX_L;
8102 let hasSideEffects = 0, mayStore = 1 in
8103 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8104 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
8105 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8106 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
8108 let Predicates = [HasAVX2, NoVLX] in {
8109 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
8110 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
8111 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
8112 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
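
// Illustrative uses of the masked forms above (AT&T syntax, a sketch only):
// each element is loaded or stored only when the sign bit of the matching
// mask element is set.
//   vpmaskmovd (%rdi), %ymm1, %ymm0      # masked 8 x i32 load,  mask in %ymm1
//   vpmaskmovd %ymm2, %ymm1, (%rdi)      # masked 8 x i32 store, mask in %ymm1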
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
  // masked store
  def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
                (VT (bitconvert (ZeroVT immAllZerosV))))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
           (!cast<Instruction>(BlendStr#"rr")
               RC:$src0,
               (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
               RC:$mask)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}

let Predicates = [HasAVX1Only] in {
  // Integer masked load/store is not available with AVX1 only; use the
  // ps/pd forms instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}

let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
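
// For a masked load whose passthru operand is neither undef nor zero, the
// BlendStr pattern above merges the loaded lanes with the passthru value.
// A rough sketch of the selected sequence (AT&T syntax, illustrative only):
//   vmaskmovps (%rdi), %xmm1, %xmm2            # load lanes under mask %xmm1
//   vblendvps  %xmm1, %xmm2, %xmm0, %xmm0      # merge with passthru in %xmm0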
//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide a fallback in case the load node used in the patterns above has
// additional users, which would otherwise prevent those patterns from being
// selected.
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}
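
// These patterns splat a 128-bit register into both halves of a 256-bit
// register: the source is first placed in the low half via INSERT_SUBREG on
// an IMPLICIT_DEF, then VINSERT*128 copies it into the high half. Roughly
// (illustrative only; %xmm0 aliases the low half of %ymm0):
//   vinserti128 $1, %xmm0, %ymm0, %ymm0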
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}
let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          SDNode IntrinNode, ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (load addr:$src2)))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (load addr:$src2)))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
  def : Pat<(vt128 (IntrinNode VR128:$src1, VR128:$src2)),
            (!cast<Instruction>(NAME#"rr") VR128:$src1, VR128:$src2)>;
  def : Pat<(vt128 (IntrinNode VR128:$src1, (load addr:$src2))),
            (!cast<Instruction>(NAME#"rm") VR128:$src1, addr:$src2)>;
  def : Pat<(vt256 (IntrinNode VR256:$src1, VR256:$src2)),
            (!cast<Instruction>(NAME#"Yrr") VR256:$src1, VR256:$src2)>;
  def : Pat<(vt256 (IntrinNode VR256:$src1, (load addr:$src2))),
            (!cast<Instruction>(NAME#"Yrm") VR256:$src1, addr:$src2)>;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, X86vsrav, v4i32, v8i32>;
}
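
// Illustrative uses (AT&T syntax, a sketch only): each destination element
// is shifted by the count held in the corresponding element of the count
// operand.
//   vpsllvd %ymm2, %ymm1, %ymm0      # data in %ymm1, per-element counts in %ymm2
//   vpsravd (%rdi), %ymm1, %ymm0     # arithmetic right shift, counts from memory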
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
             (ins VR128:$src1, memop128:$src2, VR128:$mask),
             !strconcat(OpcodeStr,
               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
             [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                   (GatherNode128 VR128:$src1, VR128:$mask,
                                  vectoraddr:$src2))]>,
             VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
             (ins RC256:$src1, memop256:$src2, RC256:$mask),
             !strconcat(OpcodeStr,
               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
             [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                   (GatherNode256 RC256:$src1, RC256:$mask,
                                  vectoraddr:$src2))]>,
             VEX, VEX_L, Sched<[WriteLoad]>;
}
let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                                  mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                                  mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                                  mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                                  mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                                    mgatherv4i32, VR256, vx128mem, vx256mem,
                                    v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                                    mgatherv4i64, VR256, vx128mem, vy256mem,
                                    v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                                    mgatherv8i32, VR256, vx128mem, vy256mem,
                                    v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                                    mgatherv4i64, VR128, vx64mem, vy128mem,
                                    v4i32, v4i32>;
    }
  }
}
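
// Illustrative use of a gather (AT&T syntax, a sketch only): elements are
// loaded from base + scaled per-element indices wherever the sign bit of
// the corresponding mask element is set; the mask register is cleared as
// elements complete, which is why $mask_wb is an output above.
//   vpgatherdd %ymm2, (%rdi,%ymm1,4), %ymm0    # indices in %ymm1, mask in %ymm2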
//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}
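
// GF2P8MULB multiplies each byte of the two sources in GF(2^8), reduced by
// the polynomial x^8 + x^4 + x^3 + x + 1. Illustrative use of the VEX form
// (AT&T syntax, a sketch only):
//   vgf2p8mulb %xmm2, %xmm1, %xmm0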
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        imm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, load, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB    : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                 i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                           X86GF2P8affineinvqb>, TAPD;
defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                           X86GF2P8affineqb>, TAPD;
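
// Illustrative uses of the affine forms (AT&T syntax, a sketch only): each
// destination byte is an affine transform A*x + b over GF(2), where A is an
// 8x8 bit matrix supplied by one source, x a byte from the other source, and
// b the immediate; the "inv" variant first inverts the byte in GF(2^8).
//   vgf2p8affineqb    $0x00, %xmm2, %xmm1, %xmm0
//   vgf2p8affineinvqb $0x1f, %xmm2, %xmm1, %xmm0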