1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
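;
; These tests exercise llvm.vector.reduce.fadd with an explicit start value.
; Without a 'reassoc' fast-math flag the intrinsic is the ordered (sequential)
; reduction, so the backend must emit a serial chain of scalar adds rather
; than a tree reduction. As an illustrative sketch only (not part of the
; test), the v4f32 form is equivalent to:
;   acc = fadd float %a0, %a1[0]
;   acc = fadd float acc, %a1[1]
;   acc = fadd float acc, %a1[2]
;   acc = fadd float acc, %a1[3]
;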
14 define float @test_v2f32(float %a0, <2 x float> %a1) {
15 ; SSE2-LABEL: test_v2f32:
17 ; SSE2-NEXT: addss %xmm1, %xmm0
18 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
19 ; SSE2-NEXT: addss %xmm1, %xmm0
22 ; SSE41-LABEL: test_v2f32:
24 ; SSE41-NEXT: addss %xmm1, %xmm0
25 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
26 ; SSE41-NEXT: addss %xmm1, %xmm0
29 ; AVX-LABEL: test_v2f32:
31 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
32 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
33 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
36 ; AVX512-LABEL: test_v2f32:
38 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
39 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
40 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
42 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
43 ret float %1
44 }
46 define float @test_v4f32(float %a0, <4 x float> %a1) {
47 ; SSE2-LABEL: test_v4f32:
49 ; SSE2-NEXT: addss %xmm1, %xmm0
50 ; SSE2-NEXT: movaps %xmm1, %xmm2
51 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
52 ; SSE2-NEXT: addss %xmm2, %xmm0
53 ; SSE2-NEXT: movaps %xmm1, %xmm2
54 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
55 ; SSE2-NEXT: addss %xmm2, %xmm0
56 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
57 ; SSE2-NEXT: addss %xmm1, %xmm0
60 ; SSE41-LABEL: test_v4f32:
62 ; SSE41-NEXT: addss %xmm1, %xmm0
63 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
64 ; SSE41-NEXT: addss %xmm2, %xmm0
65 ; SSE41-NEXT: movaps %xmm1, %xmm2
66 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
67 ; SSE41-NEXT: addss %xmm2, %xmm0
68 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
69 ; SSE41-NEXT: addss %xmm1, %xmm0
72 ; AVX-LABEL: test_v4f32:
74 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
75 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
76 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
77 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
78 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
79 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
80 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
83 ; AVX512-LABEL: test_v4f32:
85 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
86 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
87 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
88 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
89 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
90 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
91 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
93 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
94 ret float %1
95 }
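;
; Each scalar add above first brings one lane down to element 0: lane 1 via
; shufps/movshdup (vmovshdup on AVX), lane 2 via unpckhpd (vpermilpd on AVX),
; and lane 3 via the shufps/vpermilps [3,3,3,3] splat.
;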
97 define float @test_v8f32(float %a0, <8 x float> %a1) {
98 ; SSE2-LABEL: test_v8f32:
100 ; SSE2-NEXT: addss %xmm1, %xmm0
101 ; SSE2-NEXT: movaps %xmm1, %xmm3
102 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
103 ; SSE2-NEXT: addss %xmm3, %xmm0
104 ; SSE2-NEXT: movaps %xmm1, %xmm3
105 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
106 ; SSE2-NEXT: addss %xmm3, %xmm0
107 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
108 ; SSE2-NEXT: addss %xmm1, %xmm0
109 ; SSE2-NEXT: addss %xmm2, %xmm0
110 ; SSE2-NEXT: movaps %xmm2, %xmm1
111 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
112 ; SSE2-NEXT: addss %xmm1, %xmm0
113 ; SSE2-NEXT: movaps %xmm2, %xmm1
114 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
115 ; SSE2-NEXT: addss %xmm1, %xmm0
116 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
117 ; SSE2-NEXT: addss %xmm2, %xmm0
120 ; SSE41-LABEL: test_v8f32:
122 ; SSE41-NEXT: addss %xmm1, %xmm0
123 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
124 ; SSE41-NEXT: addss %xmm3, %xmm0
125 ; SSE41-NEXT: movaps %xmm1, %xmm3
126 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
127 ; SSE41-NEXT: addss %xmm3, %xmm0
128 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
129 ; SSE41-NEXT: addss %xmm1, %xmm0
130 ; SSE41-NEXT: addss %xmm2, %xmm0
131 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
132 ; SSE41-NEXT: addss %xmm1, %xmm0
133 ; SSE41-NEXT: movaps %xmm2, %xmm1
134 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
135 ; SSE41-NEXT: addss %xmm1, %xmm0
136 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
137 ; SSE41-NEXT: addss %xmm2, %xmm0
140 ; AVX-LABEL: test_v8f32:
142 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
143 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
144 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
145 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
146 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
147 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
148 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
149 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
150 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
151 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
152 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
153 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
154 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
155 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
156 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
157 ; AVX-NEXT: vzeroupper
160 ; AVX512-LABEL: test_v8f32:
162 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
163 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
164 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
165 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
166 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
167 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
168 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
169 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
170 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
171 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
172 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
173 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
174 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
175 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
176 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
177 ; AVX512-NEXT: vzeroupper
179 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
180 ret float %1
181 }
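;
; For 256-bit inputs the chain stays strictly in order: the low 128-bit half
; is reduced lane by lane, vextractf128 $1 then brings down the upper half for
; the same treatment, and vzeroupper is emitted before returning because ymm
; registers were used.
;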
183 define float @test_v16f32(float %a0, <16 x float> %a1) {
184 ; SSE2-LABEL: test_v16f32:
186 ; SSE2-NEXT: addss %xmm1, %xmm0
187 ; SSE2-NEXT: movaps %xmm1, %xmm5
188 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
189 ; SSE2-NEXT: addss %xmm5, %xmm0
190 ; SSE2-NEXT: movaps %xmm1, %xmm5
191 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
192 ; SSE2-NEXT: addss %xmm5, %xmm0
193 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
194 ; SSE2-NEXT: addss %xmm1, %xmm0
195 ; SSE2-NEXT: addss %xmm2, %xmm0
196 ; SSE2-NEXT: movaps %xmm2, %xmm1
197 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
198 ; SSE2-NEXT: addss %xmm1, %xmm0
199 ; SSE2-NEXT: movaps %xmm2, %xmm1
200 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
201 ; SSE2-NEXT: addss %xmm1, %xmm0
202 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
203 ; SSE2-NEXT: addss %xmm2, %xmm0
204 ; SSE2-NEXT: addss %xmm3, %xmm0
205 ; SSE2-NEXT: movaps %xmm3, %xmm1
206 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
207 ; SSE2-NEXT: addss %xmm1, %xmm0
208 ; SSE2-NEXT: movaps %xmm3, %xmm1
209 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
210 ; SSE2-NEXT: addss %xmm1, %xmm0
211 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
212 ; SSE2-NEXT: addss %xmm3, %xmm0
213 ; SSE2-NEXT: addss %xmm4, %xmm0
214 ; SSE2-NEXT: movaps %xmm4, %xmm1
215 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
216 ; SSE2-NEXT: addss %xmm1, %xmm0
217 ; SSE2-NEXT: movaps %xmm4, %xmm1
218 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
219 ; SSE2-NEXT: addss %xmm1, %xmm0
220 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
221 ; SSE2-NEXT: addss %xmm4, %xmm0
224 ; SSE41-LABEL: test_v16f32:
226 ; SSE41-NEXT: addss %xmm1, %xmm0
227 ; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
228 ; SSE41-NEXT: addss %xmm5, %xmm0
229 ; SSE41-NEXT: movaps %xmm1, %xmm5
230 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
231 ; SSE41-NEXT: addss %xmm5, %xmm0
232 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
233 ; SSE41-NEXT: addss %xmm1, %xmm0
234 ; SSE41-NEXT: addss %xmm2, %xmm0
235 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
236 ; SSE41-NEXT: addss %xmm1, %xmm0
237 ; SSE41-NEXT: movaps %xmm2, %xmm1
238 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
239 ; SSE41-NEXT: addss %xmm1, %xmm0
240 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
241 ; SSE41-NEXT: addss %xmm2, %xmm0
242 ; SSE41-NEXT: addss %xmm3, %xmm0
243 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
244 ; SSE41-NEXT: addss %xmm1, %xmm0
245 ; SSE41-NEXT: movaps %xmm3, %xmm1
246 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
247 ; SSE41-NEXT: addss %xmm1, %xmm0
248 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
249 ; SSE41-NEXT: addss %xmm3, %xmm0
250 ; SSE41-NEXT: addss %xmm4, %xmm0
251 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
252 ; SSE41-NEXT: addss %xmm1, %xmm0
253 ; SSE41-NEXT: movaps %xmm4, %xmm1
254 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
255 ; SSE41-NEXT: addss %xmm1, %xmm0
256 ; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
257 ; SSE41-NEXT: addss %xmm4, %xmm0
260 ; AVX-LABEL: test_v16f32:
262 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
263 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
264 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
265 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
266 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
267 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
268 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
269 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
270 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
271 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
272 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
273 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
274 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
275 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
276 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
277 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
278 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
279 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
280 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
281 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
282 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
283 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
284 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
285 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
286 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
287 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
288 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
289 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
290 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
291 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
292 ; AVX-NEXT: vzeroupper
295 ; AVX512-LABEL: test_v16f32:
297 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
298 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
299 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
300 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
301 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
302 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
303 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
304 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
305 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
306 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
307 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
308 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
309 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
310 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
311 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
312 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
313 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
314 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
315 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
316 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
317 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
318 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
319 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
320 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
321 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
322 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
323 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
324 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
325 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
326 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
327 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
328 ; AVX512-NEXT: vzeroupper
330 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
331 ret float %1
332 }
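;
; In the v16f32 AVX512 code above, vextractf32x4 $2 and $3 reach the upper
; 128-bit quarters of the zmm register.
;
; The *_zero tests below start the reduction from -0.0, the identity value for
; fadd, so the add of the start value folds away; with +fast-hops the
; AVX1-FAST run can then use vhaddps for the leading pair of lanes.
;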
338 define float @test_v2f32_zero(<2 x float> %a0) {
339 ; SSE2-LABEL: test_v2f32_zero:
341 ; SSE2-NEXT: movaps %xmm0, %xmm1
342 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
343 ; SSE2-NEXT: addss %xmm0, %xmm1
344 ; SSE2-NEXT: movaps %xmm1, %xmm0
347 ; SSE41-LABEL: test_v2f32_zero:
349 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
350 ; SSE41-NEXT: addss %xmm1, %xmm0
353 ; AVX1-SLOW-LABEL: test_v2f32_zero:
354 ; AVX1-SLOW: # %bb.0:
355 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
356 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
357 ; AVX1-SLOW-NEXT: retq
359 ; AVX1-FAST-LABEL: test_v2f32_zero:
360 ; AVX1-FAST: # %bb.0:
361 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
362 ; AVX1-FAST-NEXT: retq
364 ; AVX2-LABEL: test_v2f32_zero:
366 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
367 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
370 ; AVX512-LABEL: test_v2f32_zero:
372 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
373 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
375 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0)
376 ret float %1
377 }
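;
; With only two lanes, the whole v2f32 zero-start reduction above collapses to
; a single shuffle-and-add, or one vhaddps on the fast-hops run.
;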
379 define float @test_v4f32_zero(<4 x float> %a0) {
380 ; SSE2-LABEL: test_v4f32_zero:
382 ; SSE2-NEXT: movaps %xmm0, %xmm1
383 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
384 ; SSE2-NEXT: addss %xmm0, %xmm1
385 ; SSE2-NEXT: movaps %xmm0, %xmm2
386 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
387 ; SSE2-NEXT: addss %xmm1, %xmm2
388 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
389 ; SSE2-NEXT: addss %xmm2, %xmm0
392 ; SSE41-LABEL: test_v4f32_zero:
394 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
395 ; SSE41-NEXT: addss %xmm0, %xmm1
396 ; SSE41-NEXT: movaps %xmm0, %xmm2
397 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
398 ; SSE41-NEXT: addss %xmm1, %xmm2
399 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
400 ; SSE41-NEXT: addss %xmm2, %xmm0
403 ; AVX1-SLOW-LABEL: test_v4f32_zero:
404 ; AVX1-SLOW: # %bb.0:
405 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
406 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
407 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
408 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
409 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
410 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
411 ; AVX1-SLOW-NEXT: retq
413 ; AVX1-FAST-LABEL: test_v4f32_zero:
414 ; AVX1-FAST: # %bb.0:
415 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
416 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
417 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
418 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
419 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
420 ; AVX1-FAST-NEXT: retq
422 ; AVX2-LABEL: test_v4f32_zero:
424 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
425 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1
426 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
427 ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
428 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
429 ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
432 ; AVX512-LABEL: test_v4f32_zero:
434 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
435 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
436 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
437 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
438 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
439 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
441 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0)
442 ret float %1
443 }
445 define float @test_v8f32_zero(<8 x float> %a0) {
446 ; SSE2-LABEL: test_v8f32_zero:
448 ; SSE2-NEXT: movaps %xmm0, %xmm2
449 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
450 ; SSE2-NEXT: addss %xmm0, %xmm2
451 ; SSE2-NEXT: movaps %xmm0, %xmm3
452 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
453 ; SSE2-NEXT: addss %xmm2, %xmm3
454 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
455 ; SSE2-NEXT: addss %xmm3, %xmm0
456 ; SSE2-NEXT: addss %xmm1, %xmm0
457 ; SSE2-NEXT: movaps %xmm1, %xmm2
458 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
459 ; SSE2-NEXT: addss %xmm2, %xmm0
460 ; SSE2-NEXT: movaps %xmm1, %xmm2
461 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
462 ; SSE2-NEXT: addss %xmm2, %xmm0
463 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
464 ; SSE2-NEXT: addss %xmm1, %xmm0
467 ; SSE41-LABEL: test_v8f32_zero:
469 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
470 ; SSE41-NEXT: addss %xmm0, %xmm2
471 ; SSE41-NEXT: movaps %xmm0, %xmm3
472 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
473 ; SSE41-NEXT: addss %xmm2, %xmm3
474 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
475 ; SSE41-NEXT: addss %xmm3, %xmm0
476 ; SSE41-NEXT: addss %xmm1, %xmm0
477 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
478 ; SSE41-NEXT: addss %xmm2, %xmm0
479 ; SSE41-NEXT: movaps %xmm1, %xmm2
480 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
481 ; SSE41-NEXT: addss %xmm2, %xmm0
482 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
483 ; SSE41-NEXT: addss %xmm1, %xmm0
486 ; AVX1-SLOW-LABEL: test_v8f32_zero:
487 ; AVX1-SLOW: # %bb.0:
488 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
489 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
490 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
491 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
492 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
493 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
494 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
495 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1
496 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
497 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
498 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
499 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
500 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
501 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
502 ; AVX1-SLOW-NEXT: vzeroupper
503 ; AVX1-SLOW-NEXT: retq
505 ; AVX1-FAST-LABEL: test_v8f32_zero:
506 ; AVX1-FAST: # %bb.0:
507 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
508 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
509 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
510 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
511 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
512 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
513 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1
514 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
515 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
516 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
517 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
518 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
519 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
520 ; AVX1-FAST-NEXT: vzeroupper
521 ; AVX1-FAST-NEXT: retq
523 ; AVX2-LABEL: test_v8f32_zero:
525 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
526 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1
527 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
528 ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
529 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
530 ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
531 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
532 ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1
533 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
534 ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
535 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
536 ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
537 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
538 ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
539 ; AVX2-NEXT: vzeroupper
542 ; AVX512-LABEL: test_v8f32_zero:
544 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
545 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
546 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
547 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
548 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
549 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
550 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
551 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
552 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
553 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
554 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
555 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
556 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
557 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
558 ; AVX512-NEXT: vzeroupper
560 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0)
561 ret float %1
562 }
564 define float @test_v16f32_zero(<16 x float> %a0) {
565 ; SSE2-LABEL: test_v16f32_zero:
567 ; SSE2-NEXT: movaps %xmm0, %xmm4
568 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
569 ; SSE2-NEXT: addss %xmm0, %xmm4
570 ; SSE2-NEXT: movaps %xmm0, %xmm5
571 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
572 ; SSE2-NEXT: addss %xmm4, %xmm5
573 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
574 ; SSE2-NEXT: addss %xmm5, %xmm0
575 ; SSE2-NEXT: addss %xmm1, %xmm0
576 ; SSE2-NEXT: movaps %xmm1, %xmm4
577 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
578 ; SSE2-NEXT: addss %xmm4, %xmm0
579 ; SSE2-NEXT: movaps %xmm1, %xmm4
580 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
581 ; SSE2-NEXT: addss %xmm4, %xmm0
582 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
583 ; SSE2-NEXT: addss %xmm1, %xmm0
584 ; SSE2-NEXT: addss %xmm2, %xmm0
585 ; SSE2-NEXT: movaps %xmm2, %xmm1
586 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
587 ; SSE2-NEXT: addss %xmm1, %xmm0
588 ; SSE2-NEXT: movaps %xmm2, %xmm1
589 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
590 ; SSE2-NEXT: addss %xmm1, %xmm0
591 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
592 ; SSE2-NEXT: addss %xmm2, %xmm0
593 ; SSE2-NEXT: addss %xmm3, %xmm0
594 ; SSE2-NEXT: movaps %xmm3, %xmm1
595 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
596 ; SSE2-NEXT: addss %xmm1, %xmm0
597 ; SSE2-NEXT: movaps %xmm3, %xmm1
598 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
599 ; SSE2-NEXT: addss %xmm1, %xmm0
600 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
601 ; SSE2-NEXT: addss %xmm3, %xmm0
604 ; SSE41-LABEL: test_v16f32_zero:
606 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
607 ; SSE41-NEXT: addss %xmm0, %xmm4
608 ; SSE41-NEXT: movaps %xmm0, %xmm5
609 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
610 ; SSE41-NEXT: addss %xmm4, %xmm5
611 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
612 ; SSE41-NEXT: addss %xmm5, %xmm0
613 ; SSE41-NEXT: addss %xmm1, %xmm0
614 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
615 ; SSE41-NEXT: addss %xmm4, %xmm0
616 ; SSE41-NEXT: movaps %xmm1, %xmm4
617 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
618 ; SSE41-NEXT: addss %xmm4, %xmm0
619 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
620 ; SSE41-NEXT: addss %xmm1, %xmm0
621 ; SSE41-NEXT: addss %xmm2, %xmm0
622 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
623 ; SSE41-NEXT: addss %xmm1, %xmm0
624 ; SSE41-NEXT: movaps %xmm2, %xmm1
625 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
626 ; SSE41-NEXT: addss %xmm1, %xmm0
627 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
628 ; SSE41-NEXT: addss %xmm2, %xmm0
629 ; SSE41-NEXT: addss %xmm3, %xmm0
630 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
631 ; SSE41-NEXT: addss %xmm1, %xmm0
632 ; SSE41-NEXT: movaps %xmm3, %xmm1
633 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
634 ; SSE41-NEXT: addss %xmm1, %xmm0
635 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
636 ; SSE41-NEXT: addss %xmm3, %xmm0
639 ; AVX1-SLOW-LABEL: test_v16f32_zero:
640 ; AVX1-SLOW: # %bb.0:
641 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
642 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
643 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
644 ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
645 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
646 ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
647 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
648 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2
649 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
650 ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
651 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
652 ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
653 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
654 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
655 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
656 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
657 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
658 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
659 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
660 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
661 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
662 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
663 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
664 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
665 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
666 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
667 ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
668 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
669 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
670 ; AVX1-SLOW-NEXT: vzeroupper
671 ; AVX1-SLOW-NEXT: retq
673 ; AVX1-FAST-LABEL: test_v16f32_zero:
674 ; AVX1-FAST: # %bb.0:
675 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2
676 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
677 ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
678 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
679 ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
680 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
681 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2
682 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
683 ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
684 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
685 ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
686 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
687 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0
688 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
689 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
690 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
691 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
692 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
693 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
694 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
695 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
696 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
697 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
698 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
699 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
700 ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
701 ; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
702 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
703 ; AVX1-FAST-NEXT: vzeroupper
704 ; AVX1-FAST-NEXT: retq
706 ; AVX2-LABEL: test_v16f32_zero:
708 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
709 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2
710 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
711 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
712 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
713 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
714 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
715 ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2
716 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
717 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
718 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
719 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
720 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
721 ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0
722 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
723 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
724 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
725 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
726 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
727 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
728 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
729 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
730 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
731 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
732 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
733 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
734 ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
735 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
736 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
737 ; AVX2-NEXT: vzeroupper
740 ; AVX512-LABEL: test_v16f32_zero:
742 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
743 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
744 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
745 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
746 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
747 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
748 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
749 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
750 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
751 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
752 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
753 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
754 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
755 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
756 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
757 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
758 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
759 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
760 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
761 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
762 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
763 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
764 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
765 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
766 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
767 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
768 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
769 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
770 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
771 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
772 ; AVX512-NEXT: vzeroupper
774 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0)
775 ret float %1
776 }
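;
; The *_undef tests below pass undef as the start value; as the checks show,
; the accumulator is simply materialized with a constant-pool load (the LCPI
; rip-relative operands) and the rest of the chain is unchanged.
;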
782 define float @test_v2f32_undef(<2 x float> %a0) {
783 ; SSE2-LABEL: test_v2f32_undef:
785 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
786 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
789 ; SSE41-LABEL: test_v2f32_undef:
791 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
792 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
795 ; AVX-LABEL: test_v2f32_undef:
797 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
798 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
801 ; AVX512-LABEL: test_v2f32_undef:
803 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
804 ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
806 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
807 ret float %1
808 }
810 define float @test_v4f32_undef(<4 x float> %a0) {
811 ; SSE2-LABEL: test_v4f32_undef:
813 ; SSE2-NEXT: movaps %xmm0, %xmm1
814 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
815 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
816 ; SSE2-NEXT: movaps %xmm0, %xmm2
817 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
818 ; SSE2-NEXT: addss %xmm1, %xmm2
819 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
820 ; SSE2-NEXT: addss %xmm2, %xmm0
823 ; SSE41-LABEL: test_v4f32_undef:
825 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
826 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
827 ; SSE41-NEXT: movaps %xmm0, %xmm2
828 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
829 ; SSE41-NEXT: addss %xmm1, %xmm2
830 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
831 ; SSE41-NEXT: addss %xmm2, %xmm0
834 ; AVX-LABEL: test_v4f32_undef:
836 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
837 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
838 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
839 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
840 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
841 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
844 ; AVX512-LABEL: test_v4f32_undef:
846 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
847 ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
848 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
849 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
850 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
851 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
853 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
854 ret float %1
855 }
857 define float @test_v8f32_undef(<8 x float> %a0) {
858 ; SSE2-LABEL: test_v8f32_undef:
860 ; SSE2-NEXT: movaps %xmm0, %xmm2
861 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
862 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
863 ; SSE2-NEXT: movaps %xmm0, %xmm3
864 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
865 ; SSE2-NEXT: addss %xmm2, %xmm3
866 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
867 ; SSE2-NEXT: addss %xmm3, %xmm0
868 ; SSE2-NEXT: addss %xmm1, %xmm0
869 ; SSE2-NEXT: movaps %xmm1, %xmm2
870 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
871 ; SSE2-NEXT: addss %xmm2, %xmm0
872 ; SSE2-NEXT: movaps %xmm1, %xmm2
873 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
874 ; SSE2-NEXT: addss %xmm2, %xmm0
875 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
876 ; SSE2-NEXT: addss %xmm1, %xmm0
879 ; SSE41-LABEL: test_v8f32_undef:
881 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
882 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
883 ; SSE41-NEXT: movaps %xmm0, %xmm3
884 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
885 ; SSE41-NEXT: addss %xmm2, %xmm3
886 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
887 ; SSE41-NEXT: addss %xmm3, %xmm0
888 ; SSE41-NEXT: addss %xmm1, %xmm0
889 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
890 ; SSE41-NEXT: addss %xmm2, %xmm0
891 ; SSE41-NEXT: movaps %xmm1, %xmm2
892 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
893 ; SSE41-NEXT: addss %xmm2, %xmm0
894 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
895 ; SSE41-NEXT: addss %xmm1, %xmm0
898 ; AVX-LABEL: test_v8f32_undef:
900 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
901 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
902 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
903 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
904 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
905 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
906 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
907 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1
908 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
909 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
910 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
911 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
912 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
913 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
914 ; AVX-NEXT: vzeroupper
917 ; AVX512-LABEL: test_v8f32_undef:
919 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
920 ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
921 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
922 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
923 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
924 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
925 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
926 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
927 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
928 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
929 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
930 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
931 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
932 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
933 ; AVX512-NEXT: vzeroupper
935 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
936 ret float %1
937 }
939 define float @test_v16f32_undef(<16 x float> %a0) {
940 ; SSE2-LABEL: test_v16f32_undef:
942 ; SSE2-NEXT: movaps %xmm0, %xmm4
943 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
944 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
945 ; SSE2-NEXT: movaps %xmm0, %xmm5
946 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
947 ; SSE2-NEXT: addss %xmm4, %xmm5
948 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
949 ; SSE2-NEXT: addss %xmm5, %xmm0
950 ; SSE2-NEXT: addss %xmm1, %xmm0
951 ; SSE2-NEXT: movaps %xmm1, %xmm4
952 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
953 ; SSE2-NEXT: addss %xmm4, %xmm0
954 ; SSE2-NEXT: movaps %xmm1, %xmm4
955 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
956 ; SSE2-NEXT: addss %xmm4, %xmm0
957 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
958 ; SSE2-NEXT: addss %xmm1, %xmm0
959 ; SSE2-NEXT: addss %xmm2, %xmm0
960 ; SSE2-NEXT: movaps %xmm2, %xmm1
961 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
962 ; SSE2-NEXT: addss %xmm1, %xmm0
963 ; SSE2-NEXT: movaps %xmm2, %xmm1
964 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
965 ; SSE2-NEXT: addss %xmm1, %xmm0
966 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
967 ; SSE2-NEXT: addss %xmm2, %xmm0
968 ; SSE2-NEXT: addss %xmm3, %xmm0
969 ; SSE2-NEXT: movaps %xmm3, %xmm1
970 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
971 ; SSE2-NEXT: addss %xmm1, %xmm0
972 ; SSE2-NEXT: movaps %xmm3, %xmm1
973 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
974 ; SSE2-NEXT: addss %xmm1, %xmm0
975 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
976 ; SSE2-NEXT: addss %xmm3, %xmm0
979 ; SSE41-LABEL: test_v16f32_undef:
981 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
982 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
983 ; SSE41-NEXT: movaps %xmm0, %xmm5
984 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
985 ; SSE41-NEXT: addss %xmm4, %xmm5
986 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
987 ; SSE41-NEXT: addss %xmm5, %xmm0
988 ; SSE41-NEXT: addss %xmm1, %xmm0
989 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
990 ; SSE41-NEXT: addss %xmm4, %xmm0
991 ; SSE41-NEXT: movaps %xmm1, %xmm4
992 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
993 ; SSE41-NEXT: addss %xmm4, %xmm0
994 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
995 ; SSE41-NEXT: addss %xmm1, %xmm0
996 ; SSE41-NEXT: addss %xmm2, %xmm0
997 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
998 ; SSE41-NEXT: addss %xmm1, %xmm0
999 ; SSE41-NEXT: movaps %xmm2, %xmm1
1000 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
1001 ; SSE41-NEXT: addss %xmm1, %xmm0
1002 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
1003 ; SSE41-NEXT: addss %xmm2, %xmm0
1004 ; SSE41-NEXT: addss %xmm3, %xmm0
1005 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
1006 ; SSE41-NEXT: addss %xmm1, %xmm0
1007 ; SSE41-NEXT: movaps %xmm3, %xmm1
1008 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
1009 ; SSE41-NEXT: addss %xmm1, %xmm0
1010 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
1011 ; SSE41-NEXT: addss %xmm3, %xmm0
1014 ; AVX-LABEL: test_v16f32_undef:
1016 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
1017 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1018 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
1019 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
1020 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
1021 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
1022 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
1023 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2
1024 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
1025 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
1026 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
1027 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
1028 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1029 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
1030 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
1031 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1032 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
1033 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1034 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
1035 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
1036 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
1037 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1038 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
1039 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1040 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
1041 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1042 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
1043 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
1044 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
1045 ; AVX-NEXT: vzeroupper
1048 ; AVX512-LABEL: test_v16f32_undef:
1050 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1051 ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1052 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
1053 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1054 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
1055 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1056 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
1057 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1058 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
1059 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
1060 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
1061 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
1062 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
1063 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1064 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
1065 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1066 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
1067 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
1068 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
1069 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
1070 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
1071 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1072 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
1073 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
1074 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
1075 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1076 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
1077 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
1078 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1079 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
1080 ; AVX512-NEXT: vzeroupper
1082 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
1083 ret float %1
1084 }
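;
; The f64 tests below follow the same ordered pattern using addsd/vaddsd; a
; 128-bit register holds only two doubles, so the odd lane is reached with a
; single unpckhpd (vpermilpd on AVX) per chunk.
;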
1090 define double @test_v2f64(double %a0, <2 x double> %a1) {
1091 ; SSE-LABEL: test_v2f64:
1093 ; SSE-NEXT: addsd %xmm1, %xmm0
1094 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1095 ; SSE-NEXT: addsd %xmm1, %xmm0
1098 ; AVX-LABEL: test_v2f64:
1100 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1101 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1102 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1105 ; AVX512-LABEL: test_v2f64:
1107 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1108 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1109 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1111 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
1112 ret double %1
1113 }
1115 define double @test_v4f64(double %a0, <4 x double> %a1) {
1116 ; SSE-LABEL: test_v4f64:
1118 ; SSE-NEXT: addsd %xmm1, %xmm0
1119 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1120 ; SSE-NEXT: addsd %xmm1, %xmm0
1121 ; SSE-NEXT: addsd %xmm2, %xmm0
1122 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1123 ; SSE-NEXT: addsd %xmm2, %xmm0
1126 ; AVX-LABEL: test_v4f64:
1128 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1129 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1130 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1131 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1132 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1133 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1134 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1135 ; AVX-NEXT: vzeroupper
1138 ; AVX512-LABEL: test_v4f64:
1140 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1141 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1142 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1143 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
1144 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1145 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1146 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1147 ; AVX512-NEXT: vzeroupper
1149 %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
1150 ret double %1
1151 }
1153 define double @test_v8f64(double %a0, <8 x double> %a1) {
1154 ; SSE-LABEL: test_v8f64:
1156 ; SSE-NEXT: addsd %xmm1, %xmm0
1157 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1158 ; SSE-NEXT: addsd %xmm1, %xmm0
1159 ; SSE-NEXT: addsd %xmm2, %xmm0
1160 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1161 ; SSE-NEXT: addsd %xmm2, %xmm0
1162 ; SSE-NEXT: addsd %xmm3, %xmm0
1163 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1164 ; SSE-NEXT: addsd %xmm3, %xmm0
1165 ; SSE-NEXT: addsd %xmm4, %xmm0
1166 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1167 ; SSE-NEXT: addsd %xmm4, %xmm0
1170 ; AVX-LABEL: test_v8f64:
1172 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1173 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
1174 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1175 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1176 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1177 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1178 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1179 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1180 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1181 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1182 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1183 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1184 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1185 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1186 ; AVX-NEXT: vzeroupper
1189 ; AVX512-LABEL: test_v8f64:
1191 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1192 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1193 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1194 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1195 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1196 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1197 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1198 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
1199 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1200 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1201 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1202 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
1203 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1204 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1205 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1206 ; AVX512-NEXT: vzeroupper
1208 %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
1209 ret double %1
1210 }
1212 define double @test_v16f64(double %a0, <16 x double> %a1) {
1213 ; SSE2-LABEL: test_v16f64:
1215 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
1216 ; SSE2-NEXT: addsd %xmm1, %xmm0
1217 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1218 ; SSE2-NEXT: addsd %xmm1, %xmm0
1219 ; SSE2-NEXT: addsd %xmm2, %xmm0
1220 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1221 ; SSE2-NEXT: addsd %xmm2, %xmm0
1222 ; SSE2-NEXT: addsd %xmm3, %xmm0
1223 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1224 ; SSE2-NEXT: addsd %xmm3, %xmm0
1225 ; SSE2-NEXT: addsd %xmm4, %xmm0
1226 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1227 ; SSE2-NEXT: addsd %xmm4, %xmm0
1228 ; SSE2-NEXT: addsd %xmm5, %xmm0
1229 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
1230 ; SSE2-NEXT: addsd %xmm5, %xmm0
1231 ; SSE2-NEXT: addsd %xmm6, %xmm0
1232 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
1233 ; SSE2-NEXT: addsd %xmm6, %xmm0
1234 ; SSE2-NEXT: addsd %xmm7, %xmm0
1235 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
1236 ; SSE2-NEXT: addsd %xmm7, %xmm0
1237 ; SSE2-NEXT: addsd %xmm8, %xmm0
1238 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
1239 ; SSE2-NEXT: addsd %xmm8, %xmm0
1242 ; SSE41-LABEL: test_v16f64:
1244 ; SSE41-NEXT: addsd %xmm1, %xmm0
1245 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1246 ; SSE41-NEXT: addsd %xmm1, %xmm0
1247 ; SSE41-NEXT: addsd %xmm2, %xmm0
1248 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1249 ; SSE41-NEXT: addsd %xmm2, %xmm0
1250 ; SSE41-NEXT: addsd %xmm3, %xmm0
1251 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1252 ; SSE41-NEXT: addsd %xmm3, %xmm0
1253 ; SSE41-NEXT: addsd %xmm4, %xmm0
1254 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1255 ; SSE41-NEXT: addsd %xmm4, %xmm0
1256 ; SSE41-NEXT: addsd %xmm5, %xmm0
1257 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
1258 ; SSE41-NEXT: addsd %xmm5, %xmm0
1259 ; SSE41-NEXT: addsd %xmm6, %xmm0
1260 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
1261 ; SSE41-NEXT: addsd %xmm6, %xmm0
1262 ; SSE41-NEXT: addsd %xmm7, %xmm0
1263 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
1264 ; SSE41-NEXT: addsd %xmm7, %xmm0
1265 ; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
1266 ; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
1269 ; AVX-LABEL: test_v16f64:
1271 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1272 ; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
1273 ; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0
1274 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1275 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1276 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1277 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1278 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1279 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1280 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1281 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1282 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1283 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1284 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1285 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1286 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
1287 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1288 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
1289 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1290 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1291 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1292 ; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
1293 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
1294 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1295 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
1296 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1297 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1298 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1299 ; AVX-NEXT: vzeroupper
1302 ; AVX512-LABEL: test_v16f64:
1304 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1305 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
1306 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1307 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3
1308 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1309 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1310 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1311 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
1312 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1313 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1314 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
1315 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
1316 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1317 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1318 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1319 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
1320 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1321 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1322 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1
1323 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1324 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1325 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1326 ; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1
1327 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1328 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1329 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1330 ; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1
1331 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1332 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1333 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1334 ; AVX512-NEXT: vzeroupper
1336 %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
1337 ret double %1
1338 }
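;
; Note on v16f64 above: the SysV x86-64 calling convention passes at most
; eight vector arguments in xmm0-xmm7, and the scalar start value occupies
; xmm0, so the tail of the <16 x double> argument arrives on the stack, hence
; the (%rsp) operands in the SSE checks.
;
; The f64 *_zero tests below again start from -0.0, allowing a vhaddpd in the
; AVX1-FAST runs.
;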
1344 define double @test_v2f64_zero(<2 x double> %a0) {
1345 ; SSE-LABEL: test_v2f64_zero:
1347 ; SSE-NEXT: movapd %xmm0, %xmm1
1348 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1349 ; SSE-NEXT: addsd %xmm0, %xmm1
1350 ; SSE-NEXT: movapd %xmm1, %xmm0
1353 ; AVX1-SLOW-LABEL: test_v2f64_zero:
1354 ; AVX1-SLOW: # %bb.0:
1355 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1356 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1357 ; AVX1-SLOW-NEXT: retq
1359 ; AVX1-FAST-LABEL: test_v2f64_zero:
1360 ; AVX1-FAST: # %bb.0:
1361 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1362 ; AVX1-FAST-NEXT: retq
1364 ; AVX2-LABEL: test_v2f64_zero:
1366 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1367 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1370 ; AVX512-LABEL: test_v2f64_zero:
1372 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1373 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1375 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0)
1376 ret double %1
1377 }
1379 define double @test_v4f64_zero(<4 x double> %a0) {
1380 ; SSE-LABEL: test_v4f64_zero:
1382 ; SSE-NEXT: movapd %xmm0, %xmm2
1383 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
1384 ; SSE-NEXT: addsd %xmm0, %xmm2
1385 ; SSE-NEXT: addsd %xmm1, %xmm2
1386 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1387 ; SSE-NEXT: addsd %xmm1, %xmm2
1388 ; SSE-NEXT: movapd %xmm2, %xmm0
1391 ; AVX1-SLOW-LABEL: test_v4f64_zero:
1392 ; AVX1-SLOW: # %bb.0:
1393 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1394 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm1
1395 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1396 ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1
1397 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1398 ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1399 ; AVX1-SLOW-NEXT: vzeroupper
1400 ; AVX1-SLOW-NEXT: retq
1402 ; AVX1-FAST-LABEL: test_v4f64_zero:
1403 ; AVX1-FAST: # %bb.0:
1404 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm1
1405 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1406 ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1
1407 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1408 ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1409 ; AVX1-FAST-NEXT: vzeroupper
1410 ; AVX1-FAST-NEXT: retq
1412 ; AVX2-LABEL: test_v4f64_zero:
1414 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1415 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm1
1416 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
1417 ; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1
1418 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1419 ; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1420 ; AVX2-NEXT: vzeroupper
1423 ; AVX512-LABEL: test_v4f64_zero:
1425 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1426 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
1427 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1428 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
1429 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1430 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1431 ; AVX512-NEXT: vzeroupper
1433 %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0)
1434 ret double %1
1435 }
define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm4
; SSE-NEXT: addsd %xmm1, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm4
; SSE-NEXT: addsd %xmm2, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm4
; SSE-NEXT: addsd %xmm3, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm4
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
ret double %1
}

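;
; vXf64 (undef)
;

; The *_undef reductions pass undef as the accumulator. The first add against
; the undef start value gets constant-folded, so the chain instead begins by
; adding a constant-pool value (the LCPI rip-relative operand in the checks)
; to the surviving lane before the remaining elements are accumulated.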
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}

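; All of the reductions above call the ordered llvm.vector.reduce.fadd
; intrinsics declared below. Without reassociation fast-math flags the backend
; must preserve the strict left-to-right evaluation order, which is why the
; checks show per-element add chains rather than tree reductions.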
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)