; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm2
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm4, %xmm2
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}
;
; vXf32 (zero)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}
;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}
;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: addpd %xmm2, %xmm4
; SSE-NEXT: addpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: addsd %xmm4, %xmm1
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}
;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}
;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>)