; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=CHECK,NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=CHECK,FMA4

; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.
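; The IR pattern is an fmul whose result feeds both an fsub and an fadd, with a
; shufflevector recombining them so that even lanes take (A*B)+C and odd lanes
; take (A*B)-C, matching the lane behavior of the FMSUBADD instructions.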

define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}
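; Note on operand order in the checks: FMA3 has only destructive three-operand
; forms, so the 213 variant overwrites a source (xmm0 = (xmm1 * xmm0) -/+ xmm2),
; whereas FMA4 is a non-destructive four-operand encoding.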

define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubps %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %subadd
}
define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %subadd
}
define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubps %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %subadd
}
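; The 512-bit cases below also check legalization: without AVX512 the operation
; is split into two 256-bit vfmsubadd instructions (FMA3_256 and FMA4), while
; FMA3_512 can use a single zmm instruction.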

define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; NOFMA-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %subadd
}
define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubps %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubps %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; NOFMA-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %subadd
}
; This should not be matched to fmsubadd because the mul is on the wrong side of the fsub.
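; %Sub computes C-(A*B) rather than (A*B)-C, so no FMSUBADD lane pattern
; applies and all configurations fall back to mul+add/sub+blend (hence the
; shared CHECK prefix).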
define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; CHECK-LABEL: mul_subadd_bad_commute:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %C, %AB
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }
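
; Fusion here relies on the "unsafe-fp-math"="true" attribute above, which lets
; the backend contract the separate fmul and fadd/fsub into a fused operation.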