1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; The two llc invocations below compile this file with -disable-peephole for
; two different -mcpu values and pipe the assembly to FileCheck.
; NOTE(review): -disable-peephole is presumably there so that the memory
; operands seen in the expected assembly are not produced by the peephole
; pass -- confirm against the commit that introduced this test.
2 ; RUN: llc < %s -disable-peephole -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
3 ; RUN: llc < %s -disable-peephole -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
5 target triple = "x86_64-unknown-unknown"
; Scalar single-precision FMA intrinsics; each takes and returns <4 x float>.
7 declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
8 declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
9 declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
10 declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
; Scalar double-precision FMA intrinsics; each takes and returns <2 x double>.
12 declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
13 declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
14 declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
15 declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
; fmadd_aab_ss: calls the vfmadd.ss intrinsic with operands (a, a, b), where a
; and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, an FMA that reads its remaining
; operand directly from memory at (%rsi), and one scalar store to (%rdi).
; NOTE(review): the FMA assertion spells the operands literally, while the
; *_aba_ss tests in this file match the printed operand-decoding comment --
; confirm this is what the generator script currently emits.
17 define void @fmadd_aab_ss(float* %a, float* %b) {
18 ; CHECK-LABEL: fmadd_aab_ss:
20 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
21 ; CHECK-NEXT: vfmadd213ss (%rsi), %xmm0, %xmm0
22 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
24 %a.val = load float, float* %a
25 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
26 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
27 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
28 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
30 %b.val = load float, float* %b
31 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
32 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
33 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
34 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, a, b).
36 %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)
; Only lane 0 of the result is kept and written back to %a.
38 %sr = extractelement <4 x float> %vr, i32 0
39 store float %sr, float* %a
; fmadd_aba_ss: calls the vfmadd.ss intrinsic with operands (a, b, a), where a
; and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = (xmm0 * mem) + xmm0, and one scalar store to (%rdi).
43 define void @fmadd_aba_ss(float* %a, float* %b) {
44 ; CHECK-LABEL: fmadd_aba_ss:
46 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
47 ; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
48 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
50 %a.val = load float, float* %a
51 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
52 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
53 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
54 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
56 %b.val = load float, float* %b
57 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
58 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
59 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
60 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, b, a).
62 %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)
; Only lane 0 of the result is kept and written back to %a.
64 %sr = extractelement <4 x float> %vr, i32 0
65 store float %sr, float* %a
; fmsub_aab_ss: calls the vfmsub.ss intrinsic with operands (a, a, b), where a
; and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, an FMA that reads its remaining
; operand directly from memory at (%rsi), and one scalar store to (%rdi).
; NOTE(review): the FMA assertion spells the operands literally, while the
; *_aba_ss tests in this file match the printed operand-decoding comment --
; confirm this is what the generator script currently emits.
69 define void @fmsub_aab_ss(float* %a, float* %b) {
70 ; CHECK-LABEL: fmsub_aab_ss:
72 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
73 ; CHECK-NEXT: vfmsub213ss (%rsi), %xmm0, %xmm0
74 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
76 %a.val = load float, float* %a
77 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
78 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
79 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
80 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
82 %b.val = load float, float* %b
83 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
84 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
85 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
86 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, a, b).
88 %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)
; Only lane 0 of the result is kept and written back to %a.
90 %sr = extractelement <4 x float> %vr, i32 0
91 store float %sr, float* %a
; fmsub_aba_ss: calls the vfmsub.ss intrinsic with operands (a, b, a), where a
; and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = (xmm0 * mem) - xmm0, and one scalar store to (%rdi).
95 define void @fmsub_aba_ss(float* %a, float* %b) {
96 ; CHECK-LABEL: fmsub_aba_ss:
98 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
99 ; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
100 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
102 %a.val = load float, float* %a
103 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
104 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
105 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
106 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
108 %b.val = load float, float* %b
109 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
110 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
111 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
112 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, b, a).
114 %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)
; Only lane 0 of the result is kept and written back to %a.
116 %sr = extractelement <4 x float> %vr, i32 0
117 store float %sr, float* %a
; fnmadd_aab_ss: calls the vfnmadd.ss intrinsic with operands (a, a, b), where
; a and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, an FMA that reads its remaining
; operand directly from memory at (%rsi), and one scalar store to (%rdi).
; NOTE(review): the FMA assertion spells the operands literally, while the
; *_aba_ss tests in this file match the printed operand-decoding comment --
; confirm this is what the generator script currently emits.
121 define void @fnmadd_aab_ss(float* %a, float* %b) {
122 ; CHECK-LABEL: fnmadd_aab_ss:
124 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
125 ; CHECK-NEXT: vfnmadd213ss (%rsi), %xmm0, %xmm0
126 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
128 %a.val = load float, float* %a
129 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
130 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
131 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
132 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
134 %b.val = load float, float* %b
135 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
136 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
137 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
138 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, a, b).
140 %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)
; Only lane 0 of the result is kept and written back to %a.
142 %sr = extractelement <4 x float> %vr, i32 0
143 store float %sr, float* %a
; fnmadd_aba_ss: calls the vfnmadd.ss intrinsic with operands (a, b, a), where
; a and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = -(xmm0 * mem) + xmm0, and one scalar store to (%rdi).
147 define void @fnmadd_aba_ss(float* %a, float* %b) {
148 ; CHECK-LABEL: fnmadd_aba_ss:
150 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
151 ; CHECK-NEXT: vfnmadd231ss {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
152 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
154 %a.val = load float, float* %a
155 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
156 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
157 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
158 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
160 %b.val = load float, float* %b
161 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
162 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
163 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
164 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, b, a).
166 %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)
; Only lane 0 of the result is kept and written back to %a.
168 %sr = extractelement <4 x float> %vr, i32 0
169 store float %sr, float* %a
; fnmsub_aab_ss: calls the vfnmsub.ss intrinsic with operands (a, a, b), where
; a and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, an FMA that reads its remaining
; operand directly from memory at (%rsi), and one scalar store to (%rdi).
; NOTE(review): the FMA assertion spells the operands literally, while the
; *_aba_ss tests in this file match the printed operand-decoding comment --
; confirm this is what the generator script currently emits.
173 define void @fnmsub_aab_ss(float* %a, float* %b) {
174 ; CHECK-LABEL: fnmsub_aab_ss:
176 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
177 ; CHECK-NEXT: vfnmsub213ss (%rsi), %xmm0, %xmm0
178 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
180 %a.val = load float, float* %a
181 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
182 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
183 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
184 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
186 %b.val = load float, float* %b
187 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
188 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
189 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
190 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, a, b).
192 %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)
; Only lane 0 of the result is kept and written back to %a.
194 %sr = extractelement <4 x float> %vr, i32 0
195 store float %sr, float* %a
; fnmsub_aba_ss: calls the vfnmsub.ss intrinsic with operands (a, b, a), where
; a and b are vectors whose lane 0 is loaded from %a / %b and whose upper lanes
; are zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = -(xmm0 * mem) - xmm0, and one scalar store to (%rdi).
199 define void @fnmsub_aba_ss(float* %a, float* %b) {
200 ; CHECK-LABEL: fnmsub_aba_ss:
202 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
203 ; CHECK-NEXT: vfnmsub231ss {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
204 ; CHECK-NEXT: vmovss %xmm0, (%rdi)
; Build <a.val, 0.0, 0.0, 0.0>: lane 0 from memory, lanes 1-3 explicitly zeroed.
206 %a.val = load float, float* %a
207 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
208 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
209 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
210 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
; Build <b.val, 0.0, 0.0, 0.0> the same way.
212 %b.val = load float, float* %b
213 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
214 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
215 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
216 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
; Operand order under test: (a, b, a).
218 %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)
; Only lane 0 of the result is kept and written back to %a.
220 %sr = extractelement <4 x float> %vr, i32 0
221 store float %sr, float* %a
; fmadd_aab_sd: calls the vfmadd.sd intrinsic with operands (a, a, b), where a
; and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and whose
; lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 213-form FMA whose decoded
; operands read xmm0 = (xmm0 * xmm0) + mem, and one scalar store to (%rdi).
225 define void @fmadd_aab_sd(double* %a, double* %b) {
226 ; CHECK-LABEL: fmadd_aab_sd:
228 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
229 ; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
230 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
232 %a.val = load double, double* %a
233 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
234 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
236 %b.val = load double, double* %b
237 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
238 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, a, b).
240 %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)
; Only lane 0 of the result is kept and written back to %a.
242 %sr = extractelement <2 x double> %vr, i32 0
243 store double %sr, double* %a
; fmadd_aba_sd: calls the vfmadd.sd intrinsic with operands (a, b, a), where a
; and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and whose
; lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = (xmm0 * mem) + xmm0, and one scalar store to (%rdi).
247 define void @fmadd_aba_sd(double* %a, double* %b) {
248 ; CHECK-LABEL: fmadd_aba_sd:
250 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
251 ; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
252 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
254 %a.val = load double, double* %a
255 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
256 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
258 %b.val = load double, double* %b
259 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
260 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, b, a).
262 %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)
; Only lane 0 of the result is kept and written back to %a.
264 %sr = extractelement <2 x double> %vr, i32 0
265 store double %sr, double* %a
; fmsub_aab_sd: calls the vfmsub.sd intrinsic with operands (a, a, b), where a
; and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and whose
; lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 213-form FMA whose decoded
; operands read xmm0 = (xmm0 * xmm0) - mem, and one scalar store to (%rdi).
269 define void @fmsub_aab_sd(double* %a, double* %b) {
270 ; CHECK-LABEL: fmsub_aab_sd:
272 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
273 ; CHECK-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
274 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
276 %a.val = load double, double* %a
277 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
278 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
280 %b.val = load double, double* %b
281 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
282 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, a, b).
284 %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)
; Only lane 0 of the result is kept and written back to %a.
286 %sr = extractelement <2 x double> %vr, i32 0
287 store double %sr, double* %a
; fmsub_aba_sd: calls the vfmsub.sd intrinsic with operands (a, b, a), where a
; and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and whose
; lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = (xmm0 * mem) - xmm0, and one scalar store to (%rdi).
291 define void @fmsub_aba_sd(double* %a, double* %b) {
292 ; CHECK-LABEL: fmsub_aba_sd:
294 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
295 ; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
296 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
298 %a.val = load double, double* %a
299 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
300 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
302 %b.val = load double, double* %b
303 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
304 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, b, a).
306 %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)
; Only lane 0 of the result is kept and written back to %a.
308 %sr = extractelement <2 x double> %vr, i32 0
309 store double %sr, double* %a
; fnmadd_aab_sd: calls the vfnmadd.sd intrinsic with operands (a, a, b), where
; a and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and
; whose lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 213-form FMA whose decoded
; operands read xmm0 = -(xmm0 * xmm0) + mem, and one scalar store to (%rdi).
313 define void @fnmadd_aab_sd(double* %a, double* %b) {
314 ; CHECK-LABEL: fnmadd_aab_sd:
316 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
317 ; CHECK-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
318 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
320 %a.val = load double, double* %a
321 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
322 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
324 %b.val = load double, double* %b
325 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
326 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, a, b).
328 %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)
; Only lane 0 of the result is kept and written back to %a.
330 %sr = extractelement <2 x double> %vr, i32 0
331 store double %sr, double* %a
; fnmadd_aba_sd: calls the vfnmadd.sd intrinsic with operands (a, b, a), where
; a and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and
; whose lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = -(xmm0 * mem) + xmm0, and one scalar store to (%rdi).
335 define void @fnmadd_aba_sd(double* %a, double* %b) {
336 ; CHECK-LABEL: fnmadd_aba_sd:
338 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
339 ; CHECK-NEXT: vfnmadd231sd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
340 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
342 %a.val = load double, double* %a
343 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
344 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
346 %b.val = load double, double* %b
347 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
348 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, b, a).
350 %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)
; Only lane 0 of the result is kept and written back to %a.
352 %sr = extractelement <2 x double> %vr, i32 0
353 store double %sr, double* %a
; fnmsub_aab_sd: calls the vfnmsub.sd intrinsic with operands (a, a, b), where
; a and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and
; whose lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 213-form FMA whose decoded
; operands read xmm0 = -(xmm0 * xmm0) - mem, and one scalar store to (%rdi).
357 define void @fnmsub_aab_sd(double* %a, double* %b) {
358 ; CHECK-LABEL: fnmsub_aab_sd:
360 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
361 ; CHECK-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
362 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
364 %a.val = load double, double* %a
365 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
366 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
368 %b.val = load double, double* %b
369 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
370 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, a, b).
372 %vr = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)
; Only lane 0 of the result is kept and written back to %a.
374 %sr = extractelement <2 x double> %vr, i32 0
375 store double %sr, double* %a
; fnmsub_aba_sd: calls the vfnmsub.sd intrinsic with operands (a, b, a), where
; a and b are <2 x double> vectors whose lane 0 is loaded from %a / %b and
; whose lane 1 is zero. Lane 0 of the result is stored back through %a.
; The expected assembly is one scalar load, a 231-form FMA whose decoded
; operands read xmm0 = -(xmm0 * mem) - xmm0, and one scalar store to (%rdi).
379 define void @fnmsub_aba_sd(double* %a, double* %b) {
380 ; CHECK-LABEL: fnmsub_aba_sd:
382 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
383 ; CHECK-NEXT: vfnmsub231sd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
384 ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
; Build <a.val, 0.0>: lane 0 from memory, lane 1 explicitly zeroed.
386 %a.val = load double, double* %a
387 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
388 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
; Build <b.val, 0.0> the same way.
390 %b.val = load double, double* %b
391 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
392 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
; Operand order under test: (a, b, a).
394 %vr = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)
; Only lane 0 of the result is kept and written back to %a.
396 %sr = extractelement <2 x double> %vr, i32 0
397 store double %sr, double* %a