; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.
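; In each case the multiply result feeds both an fsub and an fadd against the
; same addend, blended so that even lanes take the subtraction and odd lanes
; take the addition; the checks expect a single vfmaddsub (or vfmsubadd when
; the lane roles are swapped) instead of separate multiply and addsub ops.
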
define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

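; The buildvector_* tests below form the same addsub pattern through
; extractelement/insertelement chains instead of a shufflevector, so the
; fusion is exercised on a build_vector of scalar fsub/fadd results rather
; than on a vector-wide blend.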
define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

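; The buildvector_mul_subadd_* tests below swap the lane roles: even lanes are
; fadd and odd lanes are fsub, so the expected fusion is FMSUBADD. Note that
; the value names (%sub*, %add*) are carried over from the addsub variants and
; do not match the operations they now perform.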
define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT: vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }