; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=NOFMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefix=FMA4
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX512

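; Verify that fma(-a, b, c) is folded into a single FNMADD on FMA/FMA4 targets.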
define float @f1(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f1:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f1:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f1:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %2,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

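; Verify that fma(-a, b, c) is folded into a single FNMADD on FMA/FMA4 targets.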
define double @f2(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f2:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f2:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f2:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddsd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %2,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

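; Verify that fma(a, b, -c) is folded into a single FMSUB on FMA/FMA4 targets.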
define float @f3(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f3:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f3:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f3:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %3,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

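; Verify that fma(a, b, -c) is folded into a single FMSUB on FMA/FMA4 targets.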
define double @f4(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f4:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f4:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f4:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %3,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

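; Verify that fma(-a, b, -c) is folded into a single FNMSUB on FMA/FMA4 targets.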
define float @f5(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f5:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f5:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f5:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                              metadata !"round.dynamic",
                                                              metadata !"fpexcept.strict") #0
  ret float %result
}

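; Verify that fma(-a, b, -c) is folded into a single FNMSUB on FMA/FMA4 targets.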
define double @f6(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f6:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f6:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f6:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                               metadata !"round.dynamic",
                                                               metadata !"fpexcept.strict") #0
  ret double %result
}

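; Verify that fneg(fma(a, b, c)) lowers to the FMA (or libcall) followed by a sign-bit xor.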
define float @f7(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f7:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f7:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f7:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f7:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %3
  ret float %result
}

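; Verify that fneg(fma(a, b, c)) lowers to the FMA (or libcall) followed by a sign-bit xor.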
define double @f8(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f8:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f8:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f8:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %3
  ret double %result
}

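; Verify that fneg(fma(-a, b, -c)) selects FNMSUB and then negates the result.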
define float @f9(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f9:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f9:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f9:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f9:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = fneg float %0
  %4 = fneg float %2
  %5 = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  %result = fneg float %5
  ret float %result
}

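; Verify that fneg(fma(-a, b, -c)) selects FNMSUB and then negates the result.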
define double @f10(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f10:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f10:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f10:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = fneg double %0
  %4 = fneg double %2
  %5 = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %result = fneg double %5
  ret double %result
}

; Verify constrained fmul and fadd aren't fused.
define float @f11(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f11:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f11:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f11:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret float %4
}

; Verify constrained fmul and fadd aren't fused.
define double @f12(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f12:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f12:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f12:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret double %4
}

; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f15() #0 {
; NOFMA-LABEL: f15:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f15:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f15:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fmuladd.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f16() #0 {
; NOFMA-LABEL: f16:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; NOFMA-NEXT:    movapd %xmm1, %xmm0
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f16:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f16:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fmuladd.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 {
; NOFMA-LABEL: f17:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f17:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f17:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call float @llvm.experimental.constrained.fma.f32(
                                               float 3.5,
                                               float 3.5,
                                               float 3.5,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret float %result
}

; Verify that fma(42.1) isn't simplified when the rounding mode is
; unknown.
define double @f18() #0 {
; NOFMA-LABEL: f18:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; NOFMA-NEXT:    movaps %xmm0, %xmm1
; NOFMA-NEXT:    movaps %xmm0, %xmm2
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f18:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f18:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %result = call double @llvm.experimental.constrained.fma.f64(
                                               double 42.1,
                                               double 42.1,
                                               double 42.1,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %result
}

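; Verify that fma(-a, b, c) is folded into a single FNMADD for vectors.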
define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f19:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f19:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f19:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

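; Verify that fma(-a, b, c) is folded into a single FNMADD for vectors.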
define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f20:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f20:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f20:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

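; Verify that fma(a, b, -c) is folded into a single FMSUB for vectors.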
define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f21:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f21:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f21:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

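; Verify that fma(a, b, -c) is folded into a single FMSUB for vectors.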
define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f22:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f22:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f22:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

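; Verify that fma(-a, b, -c) is folded into a single FNMSUB for vectors.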
define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f23:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f23:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f23:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
                                                                      metadata !"round.dynamic",
                                                                      metadata !"fpexcept.strict") #0
  ret <4 x float> %result
}

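; Verify that fma(-a, b, -c) is folded into a single FNMSUB for vectors.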
define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f24:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f24:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f24:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
                                                                       metadata !"round.dynamic",
                                                                       metadata !"fpexcept.strict") #0
  ret <2 x double> %result
}

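; Verify that fneg(fma(a, b, c)) negates the vector FMA result with a sign-bit xor.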
define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f25:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f25:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f25:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f25:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2,
                                                                 metadata !"round.dynamic",
                                                                 metadata !"fpexcept.strict") #0
  %result = fneg <4 x float> %3
  ret <4 x float> %result
}

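; Verify that fneg(fma(a, b, c)) negates the vector FMA result with a sign-bit xor.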
define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f26:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f26:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f26:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  %result = fneg <2 x double> %3
  ret <2 x double> %result
}

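; Verify that fneg(fma(-a, b, -c)) selects a vector FNMSUB and then negates the result.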
define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f27:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 96
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    pxor %xmm3, %xmm0
; NOFMA-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pxor %xmm3, %xmm2
; NOFMA-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT:    callq fmaf@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT:    punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $88, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f27:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f27:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f27:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
                                                                 metadata !"round.dynamic",
                                                                 metadata !"fpexcept.strict") #0
  %result = fneg <4 x float> %5
  ret <4 x float> %result
}

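; Verify that fneg(fma(-a, b, -c)) selects a vector FNMSUB and then negates the result.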
define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f28:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    subq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 80
; NOFMA-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT:    pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT:    # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT:    callq fma@PLT
; NOFMA-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; NOFMA-NEXT:    movdqa %xmm1, %xmm0
; NOFMA-NEXT:    addq $72, %rsp
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f28:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f28:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  %result = fneg <2 x double> %5
  ret <2 x double> %result
}

attributes #0 = { strictfp }

declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)