; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.
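; Each mul_addsub test multiplies two vectors, computes both the lane-wise
; difference and the lane-wise sum with a third vector, and blends the two
; results with a shufflevector that takes the fsub values in the even lanes
; and the fadd values in the odd lanes. That is exactly the semantics of
; vaddsubps/vaddsubpd, and with FMA3 or FMA4 enabled the backend is expected
; to fold the multiply in as well and emit a single vfmaddsub instruction.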

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd128:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_pd128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps128:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_ps128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd256:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_pd256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps256:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_ps256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}
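
; The 512-bit tests also cover type legalization: plain FMA3 and FMA4 targets
; split the operation into two 256-bit vfmaddsub ops (FMA3_256/FMA4), while
; +avx512f allows a single vfmaddsub on the full zmm register (FMA3_512).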

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd512:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps512:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}
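
; The buildvector_mul_addsub tests express the same pattern without a
; shufflevector: every lane is extracted, combined with fsub (even lanes) or
; fadd (odd lanes), and reassembled with insertelement. The combine has to
; recognize this build_vector form as well and fuse it to vfmaddsub.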

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}
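
; The 512-bit buildvector tests deliberately leave lanes undef (element 5,
; and element 12 in the <16 x float> case) to check that the fusion still
; fires when the build vector is only partially defined.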

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT: vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}
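
; The buildvector_mul_subadd tests swap the two operations: fadd feeds the
; even lanes and fsub the odd lanes, which matches vfmsubadd instead of
; vfmaddsub. Note that the value names are carried over from the addsub
; tests, so here the %sub* values are actually fadds and the %add* values
; fsubs. Plain AVX has no instruction with this lane ordering, so the NOFMA
; codegen scalarizes the whole pattern.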

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT: retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm4
; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm5
; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm6
; NOFMA-NEXT: vshufpd {{.*#+}} xmm7 = xmm4[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm8 = xmm5[1,0]
; NOFMA-NEXT: vaddss %xmm7, %xmm8, %xmm7
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm4
; NOFMA-NEXT: vaddsd %xmm4, %xmm3, %xmm5
; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT: retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}
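
; The 512-bit subadd tests combine the checks above: splitting into two
; 256-bit vfmsubadd ops (or one zmm op with +avx512f) together with the
; partially-undef build vectors.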

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm2
; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0]
; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7
; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm8
; NOFMA-NEXT: vshufpd {{.*#+}} xmm9 = xmm6[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm7[1,0]
; NOFMA-NEXT: vaddss %xmm10, %xmm9, %xmm9
; NOFMA-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3]
; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm9
; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm1[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm11 = xmm5[1,0]
; NOFMA-NEXT: vaddss %xmm11, %xmm10, %xmm10
; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm11
; NOFMA-NEXT: vshufpd {{.*#+}} xmm12 = xmm11[1,0]
; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm13
; NOFMA-NEXT: vshufpd {{.*#+}} xmm14 = xmm13[1,0]
; NOFMA-NEXT: vaddss %xmm14, %xmm12, %xmm12
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0]
; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3]
; NOFMA-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsd %xmm4, %xmm0, %xmm2
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm6
; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm7
; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm8
; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm1
; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm5
; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm9
; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0]
; NOFMA-NEXT: vsubsd %xmm4, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm5[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm1, %xmm1
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT: vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512: # %bb.0: # %bb
; FMA3_512-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4: # %bb.0: # %bb
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT: vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT: retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}
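
; Folding the multiply into the addsub/subadd requires relaxed FP math;
; attributes #0 provides this via unsafe-fp-math for every function above.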
attributes #0 = { nounwind "unsafe-fp-math"="true" }