; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
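
; Scalar "add" reductions of llvm.experimental.vector.reduce.add.* with
; experimental vector widening legalization enabled. The expected lowering
; first combines wide inputs down to a single 128-bit register, then
; repeatedly adds the high half onto the low half (shuffle + add) until a
; single element remains.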

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm5, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm2
; SSE-NEXT:    paddq %xmm3, %xmm2
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
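
; For i32 and narrower elements the 128-bit tail takes two shuffle+add
; steps (pshufd [2,3,0,1], then [1,1,2,3]); with +fast-hops the AVX1
; lowering is expected to use vphaddd for the final step instead.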

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    paddd %xmm5, %xmm3
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    paddd %xmm4, %xmm2
; SSE-NEXT:    paddd %xmm3, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
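
; For i16 the tail adds a final psrld $16 step to move the last odd element
; down before the last paddw; only the low 16 bits of eax carry the result,
; hence the "# kill: def $ax" notes after the movd.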

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm6, %xmm2
; SSE-NEXT:    paddw %xmm7, %xmm3
; SSE-NEXT:    paddw %xmm5, %xmm3
; SSE-NEXT:    paddw %xmm1, %xmm3
; SSE-NEXT:    paddw %xmm4, %xmm2
; SSE-NEXT:    paddw %xmm3, %xmm2
; SSE-NEXT:    paddw %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v64i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v64i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
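
; For i8 the tail ends with psrld $16 and psrlw $8 steps for the last four
; and two lanes; SSE4.1 and AVX targets extract the result with pextrb,
; while SSE2 uses movd and keeps only the low byte (al).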

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm6, %xmm2
; SSE2-NEXT:    paddb %xmm7, %xmm3
; SSE2-NEXT:    paddb %xmm5, %xmm3
; SSE2-NEXT:    paddb %xmm1, %xmm3
; SSE2-NEXT:    paddb %xmm4, %xmm2
; SSE2-NEXT:    paddb %xmm3, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v128i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm6, %xmm2
; SSE41-NEXT:    paddb %xmm7, %xmm3
; SSE41-NEXT:    paddb %xmm5, %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm2
; SSE41-NEXT:    paddb %xmm3, %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT:    paddb %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v128i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v128i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)