; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR37890 - subvector reduction followed by shuffle reduction

; Scalar add-reduction of all four i32 lanes of %a.
; The first step narrows to <2 x i32> by adding the upper half (lanes 2,3) to
; the lower half (lanes 0,1); the second step adds lane 1 to lane 0 using a
; same-width shuffle, and lane 0 of that result is returned.
define i32 @PR37890_v4i32(<4 x i32> %a) {
; SSE2-LABEL: PR37890_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movd %xmm0, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR37890_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR37890_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
  ; Subvector step: fold the high half onto the low half (4 -> 2 lanes).
  %hi0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = add <2 x i32> %lo0, %hi0
  ; Shuffle step: move lane 1 into lane 0; the other mask lane is undef.
  %hi1 = shufflevector <2 x i32> %sum0, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = add <2 x i32> %sum0, %hi1
  %e = extractelement <2 x i32> %sum1, i32 0
  ret i32 %e
}
; Scalar add-reduction of all eight i16 lanes of %a.
; Two subvector steps halve the vector (8 -> 4 -> 2 lanes) by adding the upper
; half to the lower half; a final same-width shuffle adds lane 1 to lane 0,
; and lane 0 of that result is returned.
define i16 @PR37890_v8i16(<8 x i16> %a) {
; SSE2-LABEL: PR37890_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v8i16:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: psrld $16, %xmm1
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v8i16:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v8i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR37890_v8i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR37890_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
  ; Subvector step 1: fold the high half onto the low half (8 -> 4 lanes).
  %hi0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = add <4 x i16> %lo0, %hi0
  ; Subvector step 2: fold again (4 -> 2 lanes).
  %hi1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = add <2 x i16> %lo1, %hi1
  ; Shuffle step: move lane 1 into lane 0; the other mask lane is undef.
  %hi2 = shufflevector <2 x i16> %sum1, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = add <2 x i16> %sum1, %hi2
  %e = extractelement <2 x i16> %sum2, i32 0
  ret i16 %e
}
; Scalar add-reduction of all eight i32 lanes of %a.
; Two subvector steps halve the vector (8 -> 4 -> 2 lanes) by adding the upper
; half to the lower half; a final same-width shuffle adds lane 1 to lane 0,
; and lane 0 of that result is returned.
define i32 @PR37890_v8i32(<8 x i32> %a) {
; SSE2-LABEL: PR37890_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v8i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movd %xmm0, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v8i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v8i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR37890_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR37890_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  ; Subvector step 1: fold the high half onto the low half (8 -> 4 lanes).
  %hi0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = add <4 x i32> %lo0, %hi0
  ; Subvector step 2: fold again (4 -> 2 lanes).
  %hi1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = add <2 x i32> %lo1, %hi1
  ; Shuffle step: move lane 1 into lane 0; the other mask lane is undef.
  %hi2 = shufflevector <2 x i32> %sum1, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = add <2 x i32> %sum1, %hi2
  %e = extractelement <2 x i32> %sum2, i32 0
  ret i32 %e
}
; Scalar add-reduction of all sixteen i16 lanes of %a.
; Three subvector steps halve the vector (16 -> 8 -> 4 -> 2 lanes) by adding
; the upper half to the lower half; a final same-width shuffle adds lane 1 to
; lane 0, and lane 0 of that result is returned.
define i16 @PR37890_v16i16(<16 x i16> %a) {
; SSE2-LABEL: PR37890_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v16i16:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: psrld $16, %xmm1
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v16i16:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v16i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR37890_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR37890_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  ; Subvector step 1: fold the high half onto the low half (16 -> 8 lanes).
  %hi0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = add <8 x i16> %lo0, %hi0
  ; Subvector step 2: fold again (8 -> 4 lanes).
  %hi1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = add <4 x i16> %lo1, %hi1
  ; Subvector step 3: fold again (4 -> 2 lanes).
  %hi2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = add <2 x i16> %lo2, %hi2
  ; Shuffle step: move lane 1 into lane 0; the other mask lane is undef.
  %hi3 = shufflevector <2 x i16> %sum2, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = add <2 x i16> %sum2, %hi3
  %e = extractelement <2 x i16> %sum3, i32 0
  ret i16 %e
}
; Scalar add-reduction of all sixteen i32 lanes of %a.
; Three subvector steps halve the vector (16 -> 8 -> 4 -> 2 lanes) by adding
; the upper half to the lower half; a final same-width shuffle adds lane 1 to
; lane 0, and lane 0 of that result is returned.
define i32 @PR37890_v16i32(<16 x i32> %a) {
; SSE2-LABEL: PR37890_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v16i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v16i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v16i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR37890_v16i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR37890_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  ; Subvector step 1: fold the high half onto the low half (16 -> 8 lanes).
  %hi0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = add <8 x i32> %lo0, %hi0
  ; Subvector step 2: fold again (8 -> 4 lanes).
  %hi1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = add <4 x i32> %lo1, %hi1
  ; Subvector step 3: fold again (4 -> 2 lanes).
  %hi2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = add <2 x i32> %lo2, %hi2
  ; Shuffle step: move lane 1 into lane 0; the other mask lane is undef.
  %hi3 = shufflevector <2 x i32> %sum2, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = add <2 x i32> %sum2, %hi3
  %e = extractelement <2 x i32> %sum3, i32 0
  ret i32 %e
}