; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm2
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm2
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm4, %xmm2
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm4, %xmm2
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: mulsd %xmm4, %xmm1
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)