; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
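
;
; Each RUN line compiles this file with llc for a different x86 feature level
; (SSE2, SSE4.1, AVX, AVX2, AVX-512) and verifies the emitted assembly with
; FileCheck under the prefixes listed on that RUN line. Every check block
; reduces the input vector pairwise, multiplying the high half into the low
; half until a single scalar remains.
;
; vXf32 reductions with a scalar accumulator operand.
;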
define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v2f32:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; AVX-LABEL: test_v2f32:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-LABEL: test_v2f32:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE41-LABEL: test_v4f32:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; AVX-LABEL: test_v4f32:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE41-LABEL: test_v8f32:
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; AVX-LABEL: test_v8f32:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2-NEXT: mulps %xmm4, %xmm2
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE41-LABEL: test_v16f32:
; SSE41-NEXT: mulps %xmm4, %xmm2
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; AVX-LABEL: test_v16f32:
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

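;
; vXf32 reductions with a constant 1.0 start value (the fmul identity).
;
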
define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE41-LABEL: test_v2f32_zero:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; AVX-LABEL: test_v2f32_zero:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f32_zero:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v4f32_zero:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v4f32_zero:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32_zero:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v8f32_zero:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v8f32_zero:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32_zero:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v16f32_zero:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v16f32_zero:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32_zero:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

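;
; vXf32 reductions with an undef start value.
;
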
define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE41-LABEL: test_v2f32_undef:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; AVX-LABEL: test_v2f32_undef:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f32_undef:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v4f32_undef:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v4f32_undef:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32_undef:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v8f32_undef:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v8f32_undef:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32_undef:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE41-LABEL: test_v16f32_undef:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v16f32_undef:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32_undef:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

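;
; vXf64 reductions with a scalar accumulator operand.
;
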
define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; AVX-LABEL: test_v2f64:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX512-LABEL: test_v2f64:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; AVX-LABEL: test_v4f64:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; AVX-LABEL: test_v8f64:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE-NEXT: mulpd %xmm4, %xmm0
; AVX-LABEL: test_v16f64:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm0
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

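;
; vXf64 reductions with a constant 1.0 start value (the fmul identity).
;
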
define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v2f64_zero:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f64_zero:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v4f64_zero:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64_zero:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v8f64_zero:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64_zero:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; AVX-LABEL: test_v16f64_zero:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64_zero:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

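;
; vXf64 reductions with an undef start value.
;
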
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v2f64_undef:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f64_undef:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v4f64_undef:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64_undef:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v8f64_undef:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64_undef:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; AVX-LABEL: test_v16f64_undef:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64_undef:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

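;
; Declarations of the llvm.experimental.vector.reduce.fmul intrinsics used above.
;
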
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)