; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR37890 - subvector reduction followed by shuffle reduction
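; Each test halves the vector with a shufflevector, adds the halves, and
; repeats down to a final two-lane shuffle+fadd whose element 0 is returned.

; v4f32: two shuffle+add steps; with fast-hops both fold into haddps.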
define float @PR37890_v4f32(<4 x float> %a) {
; SSE2-LABEL: PR37890_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %hi0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = fadd fast <2 x float> %lo0, %hi0
  %hi1 = shufflevector <2 x float> %sum0, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = fadd fast <2 x float> %sum0, %hi1
  %e = extractelement <2 x float> %sum1, i32 0
  ret float %e
}
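
; v4f64: one halving add then a scalar finish; AVX splits the ymm input
; with vextractf128, SSE receives the halves in separate xmm registers.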
define double @PR37890_v4f64(<4 x double> %a) {
; SSE2-LABEL: PR37890_v4f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v4f64:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v4f64:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v4f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v4f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = fadd fast <2 x double> %lo0, %hi0
  %hi1 = shufflevector <2 x double> %sum0, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = fadd fast <2 x double> %sum0, %hi1
  %e = extractelement <2 x double> %sum1, i32 0
  ret double %e
}
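
; v8f32: halve to xmm, then the v4f32 steps; AVX1 fast-hops emits three
; vhaddps after the vextractf128.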
define float @PR37890_v8f32(<8 x float> %a) {
; SSE2-LABEL: PR37890_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = fadd fast <4 x float> %lo0, %hi0
  %hi1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = fadd fast <2 x float> %lo1, %hi1
  %hi2 = shufflevector <2 x float> %sum1, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = fadd fast <2 x float> %sum1, %hi2
  %e = extractelement <2 x float> %sum2, i32 0
  ret float %e
}
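
; v8f64: the 256-bit halves are added first (a single ymm add on AVX,
; paired xmm adds on SSE), then the v4f64 finish.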
define double @PR37890_v8f64(<8 x double> %a) {
; SSE2-LABEL: PR37890_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8f64:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addpd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8f64:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addpd %xmm3, %xmm1
; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm0
; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = fadd fast <4 x double> %lo0, %hi0
  %hi1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = fadd fast <2 x double> %lo1, %hi1
  %hi2 = shufflevector <2 x double> %sum1, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = fadd fast <2 x double> %sum1, %hi2
  %e = extractelement <2 x double> %sum2, i32 0
  ret double %e
}
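
; v16f32: a ymm halving add on AVX (three xmm adds on SSE), then the
; v8f32 finish.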
define float @PR37890_v16f32(<16 x float> %a) {
; SSE2-LABEL: PR37890_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v16f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v16f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v16f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v16f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = fadd fast <8 x float> %lo0, %hi0
  %hi1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = fadd fast <4 x float> %lo1, %hi1
  %hi2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = fadd fast <2 x float> %lo2, %hi2
  %hi3 = shufflevector <2 x float> %sum2, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = fadd fast <2 x float> %sum2, %hi3
  %e = extractelement <2 x float> %sum3, i32 0
  ret float %e
}