; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm2
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE41-LABEL: test_v2f32:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm2
; SSE41-NEXT: mulss %xmm2, %xmm0
; AVX-LABEL: test_v2f32:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f32:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1)
define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v4f32:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; AVX-LABEL: test_v4f32:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1)
define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v8f32:
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; AVX-LABEL: test_v8f32:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1)
define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2-NEXT: mulps %xmm4, %xmm2
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v16f32:
; SSE41-NEXT: mulps %xmm4, %xmm2
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; AVX-LABEL: test_v16f32:
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1)
define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE41-LABEL: test_v2f32_zero:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; AVX-LABEL: test_v2f32_zero:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f32_zero:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v4f32_zero:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v4f32_zero:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32_zero:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v8f32_zero:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v8f32_zero:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32_zero:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v16f32_zero:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v16f32_zero:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32_zero:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE41-LABEL: test_v2f32_undef:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; AVX-LABEL: test_v2f32_undef:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f32_undef:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v4f32_undef:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v4f32_undef:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v4f32_undef:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v8f32_undef:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v8f32_undef:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f32_undef:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE41-LABEL: test_v16f32_undef:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: test_v16f32_undef:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f32_undef:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; AVX-LABEL: test_v2f64:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f64:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1)
define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; AVX-LABEL: test_v4f64:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1)
define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; AVX-LABEL: test_v8f64:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1)
define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: mulsd %xmm4, %xmm1
; SSE-NEXT: mulsd %xmm1, %xmm0
; AVX-LABEL: test_v16f64:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1)
define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v2f64_zero:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f64_zero:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v4f64_zero:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64_zero:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v8f64_zero:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64_zero:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; AVX-LABEL: test_v16f64_zero:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64_zero:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v2f64_undef:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test_v2f64_undef:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v4f64_undef:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v4f64_undef:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: test_v8f64_undef:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v8f64_undef:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; AVX-LABEL: test_v16f64_undef:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX512-LABEL: test_v16f64_undef:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)