1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; Add-reduces a <2 x i64> vector to a scalar i64 through the
; llvm.experimental.vector.reduce.add.v2i64 intrinsic. All three checked
; prefix groups expect one pshufd that swaps the two 64-bit halves, a single
; 64-bit vector add, and a movq of the low element into rax.
14 define i64 @test_v2i64(<2 x i64> %a0) {
15 ; SSE-LABEL: test_v2i64:
17 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
18 ; SSE-NEXT: paddq %xmm0, %xmm1
19 ; SSE-NEXT: movq %xmm1, %rax
22 ; AVX-LABEL: test_v2i64:
24 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
25 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
26 ; AVX-NEXT: vmovq %xmm0, %rax
29 ; AVX512-LABEL: test_v2i64:
31 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
32 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
33 ; AVX512-NEXT: vmovq %xmm0, %rax
35 %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0)
; Add-reduces a <4 x i64> vector to a scalar i64. The SSE checks add the two
; xmm argument registers directly. The AVX1, AVX2 and AVX512 checks first
; extract the upper 128 bits of ymm0 (vextractf128 on AVX1, vextracti128 on
; the others), finish the reduction in xmm registers, and end with vzeroupper.
39 define i64 @test_v4i64(<4 x i64> %a0) {
40 ; SSE-LABEL: test_v4i64:
42 ; SSE-NEXT: paddq %xmm1, %xmm0
43 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
44 ; SSE-NEXT: paddq %xmm0, %xmm1
45 ; SSE-NEXT: movq %xmm1, %rax
48 ; AVX1-LABEL: test_v4i64:
50 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
51 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
52 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
53 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
54 ; AVX1-NEXT: vmovq %xmm0, %rax
55 ; AVX1-NEXT: vzeroupper
58 ; AVX2-LABEL: test_v4i64:
60 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
61 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
62 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
63 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
64 ; AVX2-NEXT: vmovq %xmm0, %rax
65 ; AVX2-NEXT: vzeroupper
68 ; AVX512-LABEL: test_v4i64:
70 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
71 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
72 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
73 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
74 ; AVX512-NEXT: vmovq %xmm0, %rax
75 ; AVX512-NEXT: vzeroupper
77 %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0)
; Add-reduces an <8 x i64> vector to a scalar i64. The SSE checks fold four
; xmm registers with paddq. The AVX1 checks extract the upper halves of ymm0
; and ymm1 and add only at 128-bit width. The AVX2 checks start with one
; 256-bit vpaddq. The AVX512 checks extract the upper 256 bits of zmm0 with
; vextracti64x4 before narrowing to 128 bits.
81 define i64 @test_v8i64(<8 x i64> %a0) {
82 ; SSE-LABEL: test_v8i64:
84 ; SSE-NEXT: paddq %xmm3, %xmm1
85 ; SSE-NEXT: paddq %xmm2, %xmm1
86 ; SSE-NEXT: paddq %xmm0, %xmm1
87 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
88 ; SSE-NEXT: paddq %xmm1, %xmm0
89 ; SSE-NEXT: movq %xmm0, %rax
92 ; AVX1-LABEL: test_v8i64:
94 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
95 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
96 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
97 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
98 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
99 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
100 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
101 ; AVX1-NEXT: vmovq %xmm0, %rax
102 ; AVX1-NEXT: vzeroupper
105 ; AVX2-LABEL: test_v8i64:
107 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
108 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
109 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
110 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
111 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
112 ; AVX2-NEXT: vmovq %xmm0, %rax
113 ; AVX2-NEXT: vzeroupper
116 ; AVX512-LABEL: test_v8i64:
118 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
119 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
120 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
121 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
122 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
123 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
124 ; AVX512-NEXT: vmovq %xmm0, %rax
125 ; AVX512-NEXT: vzeroupper
127 %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0)
; Add-reduces a <16 x i64> vector to a scalar i64. The SSE checks fold xmm0
; through xmm7 with paddq. The AVX1 checks take four ymm inputs and add their
; low and extracted high halves at 128-bit width. The AVX2 checks use three
; 256-bit vpaddq before narrowing. The AVX512 checks begin with a 512-bit
; vpaddq of zmm1 into zmm0 and then narrow step by step.
131 define i64 @test_v16i64(<16 x i64> %a0) {
132 ; SSE-LABEL: test_v16i64:
134 ; SSE-NEXT: paddq %xmm6, %xmm2
135 ; SSE-NEXT: paddq %xmm7, %xmm3
136 ; SSE-NEXT: paddq %xmm5, %xmm3
137 ; SSE-NEXT: paddq %xmm1, %xmm3
138 ; SSE-NEXT: paddq %xmm4, %xmm2
139 ; SSE-NEXT: paddq %xmm3, %xmm2
140 ; SSE-NEXT: paddq %xmm0, %xmm2
141 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
142 ; SSE-NEXT: paddq %xmm2, %xmm0
143 ; SSE-NEXT: movq %xmm0, %rax
146 ; AVX1-LABEL: test_v16i64:
148 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
149 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
150 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
151 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
152 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
153 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
154 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
155 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
156 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
157 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
158 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
159 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
160 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
161 ; AVX1-NEXT: vmovq %xmm0, %rax
162 ; AVX1-NEXT: vzeroupper
165 ; AVX2-LABEL: test_v16i64:
167 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
168 ; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
169 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
170 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
171 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
172 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
173 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
174 ; AVX2-NEXT: vmovq %xmm0, %rax
175 ; AVX2-NEXT: vzeroupper
178 ; AVX512-LABEL: test_v16i64:
180 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
181 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
182 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
183 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
184 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
185 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
186 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
187 ; AVX512-NEXT: vmovq %xmm0, %rax
188 ; AVX512-NEXT: vzeroupper
190 %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0)
; Add-reduces a <2 x i32> vector to a scalar i32. Most prefixes expect a
; pshufd that moves element 1 into lane 0, one 32-bit vector add, and a movd
; into eax. The AVX1-FAST prefix (the run with +fast-hops) instead expects a
; single vphaddd horizontal add.
198 define i32 @test_v2i32(<2 x i32> %a0) {
199 ; SSE-LABEL: test_v2i32:
201 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
202 ; SSE-NEXT: paddd %xmm0, %xmm1
203 ; SSE-NEXT: movd %xmm1, %eax
206 ; AVX1-SLOW-LABEL: test_v2i32:
207 ; AVX1-SLOW: # %bb.0:
208 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
209 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
210 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
211 ; AVX1-SLOW-NEXT: retq
213 ; AVX1-FAST-LABEL: test_v2i32:
214 ; AVX1-FAST: # %bb.0:
215 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
216 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
217 ; AVX1-FAST-NEXT: retq
219 ; AVX2-LABEL: test_v2i32:
221 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
222 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
223 ; AVX2-NEXT: vmovd %xmm0, %eax
226 ; AVX512-LABEL: test_v2i32:
228 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
229 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
230 ; AVX512-NEXT: vmovd %xmm0, %eax
232 %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
; Add-reduces a <4 x i32> vector to a scalar i32. Most prefixes expect two
; shuffle-and-add steps (swap the 64-bit halves, then bring element 1 down)
; before the movd into eax. The AVX1-FAST prefix expects two back-to-back
; vphaddd instructions instead.
236 define i32 @test_v4i32(<4 x i32> %a0) {
237 ; SSE-LABEL: test_v4i32:
239 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
240 ; SSE-NEXT: paddd %xmm0, %xmm1
241 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
242 ; SSE-NEXT: paddd %xmm1, %xmm0
243 ; SSE-NEXT: movd %xmm0, %eax
246 ; AVX1-SLOW-LABEL: test_v4i32:
247 ; AVX1-SLOW: # %bb.0:
248 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
249 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
250 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
251 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
252 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
253 ; AVX1-SLOW-NEXT: retq
255 ; AVX1-FAST-LABEL: test_v4i32:
256 ; AVX1-FAST: # %bb.0:
257 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
258 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
259 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
260 ; AVX1-FAST-NEXT: retq
262 ; AVX2-LABEL: test_v4i32:
264 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
265 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
266 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
267 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
268 ; AVX2-NEXT: vmovd %xmm0, %eax
271 ; AVX512-LABEL: test_v4i32:
273 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
274 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
275 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
276 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
277 ; AVX512-NEXT: vmovd %xmm0, %eax
279 %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0)
; Add-reduces an <8 x i32> vector to a scalar i32. The SSE checks add xmm1
; into xmm0 first. The ymm-based prefixes extract the upper 128 bits of ymm0
; and then reduce at 128-bit width with shuffle-and-add pairs. The AVX1-FAST
; prefix expects the whole reduction after the extract to be done with three
; vphaddd instructions.
283 define i32 @test_v8i32(<8 x i32> %a0) {
284 ; SSE-LABEL: test_v8i32:
286 ; SSE-NEXT: paddd %xmm1, %xmm0
287 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
288 ; SSE-NEXT: paddd %xmm0, %xmm1
289 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
290 ; SSE-NEXT: paddd %xmm1, %xmm0
291 ; SSE-NEXT: movd %xmm0, %eax
294 ; AVX1-SLOW-LABEL: test_v8i32:
295 ; AVX1-SLOW: # %bb.0:
296 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
297 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
298 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
299 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
300 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
301 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
302 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
303 ; AVX1-SLOW-NEXT: vzeroupper
304 ; AVX1-SLOW-NEXT: retq
306 ; AVX1-FAST-LABEL: test_v8i32:
307 ; AVX1-FAST: # %bb.0:
308 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
309 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
310 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
311 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
312 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
313 ; AVX1-FAST-NEXT: vzeroupper
314 ; AVX1-FAST-NEXT: retq
316 ; AVX2-LABEL: test_v8i32:
318 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
319 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
320 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
321 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
322 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
323 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
324 ; AVX2-NEXT: vmovd %xmm0, %eax
325 ; AVX2-NEXT: vzeroupper
328 ; AVX512-LABEL: test_v8i32:
330 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
331 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
332 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
333 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
334 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
335 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
336 ; AVX512-NEXT: vmovd %xmm0, %eax
337 ; AVX512-NEXT: vzeroupper
339 %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0)
; Add-reduces a <16 x i32> vector to a scalar i32. The SSE checks fold four
; xmm registers with paddd. The AVX1 prefixes add the extracted halves of
; ymm0 and ymm1 at 128-bit width. The AVX2 checks start with a 256-bit vpaddd
; and the AVX512 checks start from a vextracti64x4 of zmm0. For this width the
; AVX1-FAST prefix differs from AVX1-SLOW only in the last step, where vphaddd
; replaces the final vpshufd and vpaddd pair.
343 define i32 @test_v16i32(<16 x i32> %a0) {
344 ; SSE-LABEL: test_v16i32:
346 ; SSE-NEXT: paddd %xmm3, %xmm1
347 ; SSE-NEXT: paddd %xmm2, %xmm1
348 ; SSE-NEXT: paddd %xmm0, %xmm1
349 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
350 ; SSE-NEXT: paddd %xmm1, %xmm0
351 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
352 ; SSE-NEXT: paddd %xmm0, %xmm1
353 ; SSE-NEXT: movd %xmm1, %eax
356 ; AVX1-SLOW-LABEL: test_v16i32:
357 ; AVX1-SLOW: # %bb.0:
358 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
359 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
360 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
361 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
362 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
363 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
364 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
365 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
366 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
367 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
368 ; AVX1-SLOW-NEXT: vzeroupper
369 ; AVX1-SLOW-NEXT: retq
371 ; AVX1-FAST-LABEL: test_v16i32:
372 ; AVX1-FAST: # %bb.0:
373 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
374 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
375 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
376 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
377 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
378 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
379 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
380 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
381 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
382 ; AVX1-FAST-NEXT: vzeroupper
383 ; AVX1-FAST-NEXT: retq
385 ; AVX2-LABEL: test_v16i32:
387 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
388 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
389 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
390 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
391 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
392 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
393 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
394 ; AVX2-NEXT: vmovd %xmm0, %eax
395 ; AVX2-NEXT: vzeroupper
398 ; AVX512-LABEL: test_v16i32:
400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
408 ; AVX512-NEXT: vmovd %xmm0, %eax
409 ; AVX512-NEXT: vzeroupper
411 %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0)
; Add-reduces a <32 x i32> vector to a scalar i32. The SSE checks fold xmm0
; through xmm7. The AVX1 prefixes take four ymm inputs and add their low and
; extracted high halves at 128-bit width. The AVX2 checks use three 256-bit
; vpaddd and the AVX512 checks begin with a 512-bit vpaddd. The AVX1-FAST
; prefix matches AVX1-SLOW except that the final shuffle-and-add is a vphaddd.
415 define i32 @test_v32i32(<32 x i32> %a0) {
416 ; SSE-LABEL: test_v32i32:
418 ; SSE-NEXT: paddd %xmm6, %xmm2
419 ; SSE-NEXT: paddd %xmm7, %xmm3
420 ; SSE-NEXT: paddd %xmm5, %xmm3
421 ; SSE-NEXT: paddd %xmm1, %xmm3
422 ; SSE-NEXT: paddd %xmm4, %xmm2
423 ; SSE-NEXT: paddd %xmm3, %xmm2
424 ; SSE-NEXT: paddd %xmm0, %xmm2
425 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
426 ; SSE-NEXT: paddd %xmm2, %xmm0
427 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
428 ; SSE-NEXT: paddd %xmm0, %xmm1
429 ; SSE-NEXT: movd %xmm1, %eax
432 ; AVX1-SLOW-LABEL: test_v32i32:
433 ; AVX1-SLOW: # %bb.0:
434 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
435 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
436 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
437 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
438 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
439 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
440 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
441 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
442 ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
443 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
444 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
445 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
446 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
447 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
448 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
449 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
450 ; AVX1-SLOW-NEXT: vzeroupper
451 ; AVX1-SLOW-NEXT: retq
453 ; AVX1-FAST-LABEL: test_v32i32:
454 ; AVX1-FAST: # %bb.0:
455 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
456 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
457 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
458 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
459 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
460 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
461 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
462 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
463 ; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
464 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
465 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
466 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
467 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
468 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
469 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
470 ; AVX1-FAST-NEXT: vzeroupper
471 ; AVX1-FAST-NEXT: retq
473 ; AVX2-LABEL: test_v32i32:
475 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
476 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
477 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
478 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
479 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
480 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
481 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
482 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
483 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
484 ; AVX2-NEXT: vmovd %xmm0, %eax
485 ; AVX2-NEXT: vzeroupper
488 ; AVX512-LABEL: test_v32i32:
490 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
491 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
492 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
493 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
494 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
495 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
496 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
497 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
498 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
499 ; AVX512-NEXT: vmovd %xmm0, %eax
500 ; AVX512-NEXT: vzeroupper
502 %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0)
; Add-reduces a <2 x i16> vector to a scalar i16. Most prefixes expect a
; 16-bit right shift of each dword (psrld) to line element 1 up with element
; 0, then a paddw and a movd into eax, of which only ax is the result (see the
; checked kill comment). The AVX1-FAST prefix expects a single vphaddw in
; place of the shift and add.
510 define i16 @test_v2i16(<2 x i16> %a0) {
511 ; SSE-LABEL: test_v2i16:
513 ; SSE-NEXT: movdqa %xmm0, %xmm1
514 ; SSE-NEXT: psrld $16, %xmm1
515 ; SSE-NEXT: paddw %xmm0, %xmm1
516 ; SSE-NEXT: movd %xmm1, %eax
517 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
520 ; AVX1-SLOW-LABEL: test_v2i16:
521 ; AVX1-SLOW: # %bb.0:
522 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
523 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
524 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
525 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
526 ; AVX1-SLOW-NEXT: retq
528 ; AVX1-FAST-LABEL: test_v2i16:
529 ; AVX1-FAST: # %bb.0:
530 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
531 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
532 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
533 ; AVX1-FAST-NEXT: retq
535 ; AVX2-LABEL: test_v2i16:
537 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
538 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
539 ; AVX2-NEXT: vmovd %xmm0, %eax
540 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
543 ; AVX512-LABEL: test_v2i16:
545 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
546 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
547 ; AVX512-NEXT: vmovd %xmm0, %eax
548 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
550 %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0)
; Add-reduces a <4 x i16> vector to a scalar i16. Most prefixes expect a
; pshufd and paddw step followed by a psrld $16 and paddw step, then a movd
; into eax. The AVX1-FAST prefix keeps the first shuffle-and-add and replaces
; only the final shift-and-add with vphaddw.
554 define i16 @test_v4i16(<4 x i16> %a0) {
555 ; SSE-LABEL: test_v4i16:
557 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
558 ; SSE-NEXT: paddw %xmm0, %xmm1
559 ; SSE-NEXT: movdqa %xmm1, %xmm0
560 ; SSE-NEXT: psrld $16, %xmm0
561 ; SSE-NEXT: paddw %xmm1, %xmm0
562 ; SSE-NEXT: movd %xmm0, %eax
563 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
566 ; AVX1-SLOW-LABEL: test_v4i16:
567 ; AVX1-SLOW: # %bb.0:
568 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
569 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
570 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
571 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
572 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
573 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
574 ; AVX1-SLOW-NEXT: retq
576 ; AVX1-FAST-LABEL: test_v4i16:
577 ; AVX1-FAST: # %bb.0:
578 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
579 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
580 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
581 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
582 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
583 ; AVX1-FAST-NEXT: retq
585 ; AVX2-LABEL: test_v4i16:
587 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
588 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
589 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
590 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
591 ; AVX2-NEXT: vmovd %xmm0, %eax
592 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
595 ; AVX512-LABEL: test_v4i16:
597 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
598 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
599 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
600 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
601 ; AVX512-NEXT: vmovd %xmm0, %eax
602 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
604 %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0)
; Add-reduces an <8 x i16> vector to a scalar i16. Most prefixes expect three
; narrowing steps (two pshufd and paddw pairs, then psrld $16 and paddw)
; before the movd into eax. The AVX1-FAST prefix expects three consecutive
; vphaddw instructions instead.
608 define i16 @test_v8i16(<8 x i16> %a0) {
609 ; SSE-LABEL: test_v8i16:
611 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
612 ; SSE-NEXT: paddw %xmm0, %xmm1
613 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
614 ; SSE-NEXT: paddw %xmm1, %xmm0
615 ; SSE-NEXT: movdqa %xmm0, %xmm1
616 ; SSE-NEXT: psrld $16, %xmm1
617 ; SSE-NEXT: paddw %xmm0, %xmm1
618 ; SSE-NEXT: movd %xmm1, %eax
619 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
622 ; AVX1-SLOW-LABEL: test_v8i16:
623 ; AVX1-SLOW: # %bb.0:
624 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
625 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
626 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
627 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
628 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
629 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
630 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
631 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
632 ; AVX1-SLOW-NEXT: retq
634 ; AVX1-FAST-LABEL: test_v8i16:
635 ; AVX1-FAST: # %bb.0:
636 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
637 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
638 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
639 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
640 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
641 ; AVX1-FAST-NEXT: retq
643 ; AVX2-LABEL: test_v8i16:
645 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
646 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
647 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
648 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
649 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
650 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
651 ; AVX2-NEXT: vmovd %xmm0, %eax
652 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
655 ; AVX512-LABEL: test_v8i16:
657 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
658 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
659 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
660 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
661 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
662 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
663 ; AVX512-NEXT: vmovd %xmm0, %eax
664 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
666 %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0)
; Add-reduces a <16 x i16> vector to a scalar i16. The SSE checks add xmm1
; into xmm0 first. The ymm-based prefixes extract the upper 128 bits of ymm0
; and then narrow with two pshufd and paddw pairs plus a psrld $16 and paddw.
; The AVX1-FAST prefix expects four vphaddw instructions after the extract.
670 define i16 @test_v16i16(<16 x i16> %a0) {
671 ; SSE-LABEL: test_v16i16:
673 ; SSE-NEXT: paddw %xmm1, %xmm0
674 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
675 ; SSE-NEXT: paddw %xmm0, %xmm1
676 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
677 ; SSE-NEXT: paddw %xmm1, %xmm0
678 ; SSE-NEXT: movdqa %xmm0, %xmm1
679 ; SSE-NEXT: psrld $16, %xmm1
680 ; SSE-NEXT: paddw %xmm0, %xmm1
681 ; SSE-NEXT: movd %xmm1, %eax
682 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
685 ; AVX1-SLOW-LABEL: test_v16i16:
686 ; AVX1-SLOW: # %bb.0:
687 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
688 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
689 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
690 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
691 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
692 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
693 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
694 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
695 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
696 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
697 ; AVX1-SLOW-NEXT: vzeroupper
698 ; AVX1-SLOW-NEXT: retq
700 ; AVX1-FAST-LABEL: test_v16i16:
701 ; AVX1-FAST: # %bb.0:
702 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
703 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
704 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
705 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
706 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
707 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
708 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
709 ; AVX1-FAST-NEXT: vzeroupper
710 ; AVX1-FAST-NEXT: retq
712 ; AVX2-LABEL: test_v16i16:
714 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
715 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
716 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
717 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
718 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
719 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
720 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
721 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
722 ; AVX2-NEXT: vmovd %xmm0, %eax
723 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
724 ; AVX2-NEXT: vzeroupper
727 ; AVX512-LABEL: test_v16i16:
729 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
730 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
731 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
732 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
733 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
734 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
735 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
736 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
737 ; AVX512-NEXT: vmovd %xmm0, %eax
738 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
739 ; AVX512-NEXT: vzeroupper
741 %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0)
; Add-reduces a <32 x i16> vector to a scalar i16. The SSE checks fold four
; xmm registers with paddw. The AVX1 prefixes add the extracted halves of
; ymm0 and ymm1 at 128-bit width. The AVX2 checks start with a 256-bit vpaddw
; and the AVX512 checks start from a vextracti64x4 of zmm0. The AVX1-FAST
; prefix matches AVX1-SLOW except that the final psrld and vpaddw pair is a
; single vphaddw.
745 define i16 @test_v32i16(<32 x i16> %a0) {
746 ; SSE-LABEL: test_v32i16:
748 ; SSE-NEXT: paddw %xmm3, %xmm1
749 ; SSE-NEXT: paddw %xmm2, %xmm1
750 ; SSE-NEXT: paddw %xmm0, %xmm1
751 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
752 ; SSE-NEXT: paddw %xmm1, %xmm0
753 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
754 ; SSE-NEXT: paddw %xmm0, %xmm1
755 ; SSE-NEXT: movdqa %xmm1, %xmm0
756 ; SSE-NEXT: psrld $16, %xmm0
757 ; SSE-NEXT: paddw %xmm1, %xmm0
758 ; SSE-NEXT: movd %xmm0, %eax
759 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
762 ; AVX1-SLOW-LABEL: test_v32i16:
763 ; AVX1-SLOW: # %bb.0:
764 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
765 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
766 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
767 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
768 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
769 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
770 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
771 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
772 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
773 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
774 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
775 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
776 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
777 ; AVX1-SLOW-NEXT: vzeroupper
778 ; AVX1-SLOW-NEXT: retq
780 ; AVX1-FAST-LABEL: test_v32i16:
781 ; AVX1-FAST: # %bb.0:
782 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
783 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
784 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
785 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
786 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
787 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
788 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
789 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
790 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
791 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
792 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
793 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
794 ; AVX1-FAST-NEXT: vzeroupper
795 ; AVX1-FAST-NEXT: retq
797 ; AVX2-LABEL: test_v32i16:
799 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
800 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
801 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
802 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
803 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
804 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
805 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
806 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
807 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
808 ; AVX2-NEXT: vmovd %xmm0, %eax
809 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
810 ; AVX2-NEXT: vzeroupper
813 ; AVX512-LABEL: test_v32i16:
815 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
816 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
817 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
818 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
819 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
820 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
821 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
822 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
823 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
824 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
825 ; AVX512-NEXT: vmovd %xmm0, %eax
826 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
827 ; AVX512-NEXT: vzeroupper
829 %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0)
; Add-reduces a <64 x i16> vector to a scalar i16. The SSE checks fold xmm0
; through xmm7 with paddw. The AVX1 prefixes take four ymm inputs and add
; their low and extracted high halves at 128-bit width. The AVX2 checks use
; three 256-bit vpaddw and the AVX512 checks begin with a 512-bit vpaddw. The
; AVX1-FAST prefix matches AVX1-SLOW except that the final psrld and vpaddw
; pair is a single vphaddw.
833 define i16 @test_v64i16(<64 x i16> %a0) {
834 ; SSE-LABEL: test_v64i16:
836 ; SSE-NEXT: paddw %xmm6, %xmm2
837 ; SSE-NEXT: paddw %xmm7, %xmm3
838 ; SSE-NEXT: paddw %xmm5, %xmm3
839 ; SSE-NEXT: paddw %xmm1, %xmm3
840 ; SSE-NEXT: paddw %xmm4, %xmm2
841 ; SSE-NEXT: paddw %xmm3, %xmm2
842 ; SSE-NEXT: paddw %xmm0, %xmm2
843 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
844 ; SSE-NEXT: paddw %xmm2, %xmm0
845 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
846 ; SSE-NEXT: paddw %xmm0, %xmm1
847 ; SSE-NEXT: movdqa %xmm1, %xmm0
848 ; SSE-NEXT: psrld $16, %xmm0
849 ; SSE-NEXT: paddw %xmm1, %xmm0
850 ; SSE-NEXT: movd %xmm0, %eax
851 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
854 ; AVX1-SLOW-LABEL: test_v64i16:
855 ; AVX1-SLOW: # %bb.0:
856 ; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4
857 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
858 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
859 ; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1
860 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
861 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
862 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
863 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
864 ; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
865 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1
866 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
867 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
868 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
869 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
870 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
871 ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
872 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
873 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
874 ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
875 ; AVX1-SLOW-NEXT: vzeroupper
876 ; AVX1-SLOW-NEXT: retq
878 ; AVX1-FAST-LABEL: test_v64i16:
879 ; AVX1-FAST: # %bb.0:
880 ; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4
881 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
882 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
883 ; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1
884 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
885 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
886 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
887 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
888 ; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
889 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1
890 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
891 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
892 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
893 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
894 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
895 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
896 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
897 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
898 ; AVX1-FAST-NEXT: vzeroupper
899 ; AVX1-FAST-NEXT: retq
901 ; AVX2-LABEL: test_v64i16:
903 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
904 ; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
905 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
906 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
907 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
908 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
909 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
910 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
911 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
912 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
913 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
914 ; AVX2-NEXT: vmovd %xmm0, %eax
915 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
916 ; AVX2-NEXT: vzeroupper
919 ; AVX512-LABEL: test_v64i16:
921 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
922 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
923 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
924 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
925 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
926 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
927 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
928 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
929 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
930 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
931 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
932 ; AVX512-NEXT: vmovd %xmm0, %eax
933 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
934 ; AVX512-NEXT: vzeroupper
936 %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0)
; Add-reduction of a <2 x i8> argument via llvm.experimental.vector.reduce.add.v2i8.
; The assertions below pin the expected x86 code for the SSE2, SSE41, AVX and
; AVX512 check prefixes: shift the second byte down by 8 bits, add, extract byte 0.
944 define i8 @test_v2i8(<2 x i8> %a0) {
945 ; SSE2-LABEL: test_v2i8:
947 ; SSE2-NEXT: movdqa %xmm0, %xmm1
948 ; SSE2-NEXT: psrlw $8, %xmm1
949 ; SSE2-NEXT: paddb %xmm0, %xmm1
950 ; SSE2-NEXT: movd %xmm1, %eax
951 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
954 ; SSE41-LABEL: test_v2i8:
956 ; SSE41-NEXT: movdqa %xmm0, %xmm1
957 ; SSE41-NEXT: psrlw $8, %xmm1
958 ; SSE41-NEXT: paddb %xmm0, %xmm1
959 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
960 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
963 ; AVX-LABEL: test_v2i8:
965 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
966 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
967 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
968 ; AVX-NEXT: # kill: def $al killed $al killed $eax
971 ; AVX512-LABEL: test_v2i8:
973 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
974 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
975 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
976 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
978 %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
; Same reduction as the <2 x i8> register case, but the vector is loaded from %p
; first. The assertions expect the two bytes to be fetched with one 16-bit
; zero-extending load (movzwl) and moved into an xmm register before the
; shift/add sequence.
982 define i8 @test_v2i8_load(<2 x i8>* %p) {
983 ; SSE2-LABEL: test_v2i8_load:
985 ; SSE2-NEXT: movzwl (%rdi), %eax
986 ; SSE2-NEXT: movd %eax, %xmm0
987 ; SSE2-NEXT: movdqa %xmm0, %xmm1
988 ; SSE2-NEXT: psrlw $8, %xmm1
989 ; SSE2-NEXT: paddb %xmm0, %xmm1
990 ; SSE2-NEXT: movd %xmm1, %eax
991 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
994 ; SSE41-LABEL: test_v2i8_load:
996 ; SSE41-NEXT: movzwl (%rdi), %eax
997 ; SSE41-NEXT: movd %eax, %xmm0
998 ; SSE41-NEXT: movdqa %xmm0, %xmm1
999 ; SSE41-NEXT: psrlw $8, %xmm1
1000 ; SSE41-NEXT: paddb %xmm0, %xmm1
1001 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1002 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1005 ; AVX-LABEL: test_v2i8_load:
1007 ; AVX-NEXT: movzwl (%rdi), %eax
1008 ; AVX-NEXT: vmovd %eax, %xmm0
1009 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1010 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1011 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1012 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1015 ; AVX512-LABEL: test_v2i8_load:
1017 ; AVX512-NEXT: movzwl (%rdi), %eax
1018 ; AVX512-NEXT: vmovd %eax, %xmm0
1019 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1020 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1021 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1022 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1024 %a0 = load <2 x i8>, <2 x i8>* %p
1025 %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
; Add-reduction of a <4 x i8> argument via llvm.experimental.vector.reduce.add.v4i8.
; The assertions expect two shift+add steps (by 16 bits, then by 8 bits) followed
; by extraction of byte 0, for the SSE2, SSE41, AVX and AVX512 check prefixes.
1029 define i8 @test_v4i8(<4 x i8> %a0) {
1030 ; SSE2-LABEL: test_v4i8:
1032 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1033 ; SSE2-NEXT: psrld $16, %xmm1
1034 ; SSE2-NEXT: paddb %xmm0, %xmm1
1035 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1036 ; SSE2-NEXT: psrlw $8, %xmm0
1037 ; SSE2-NEXT: paddb %xmm1, %xmm0
1038 ; SSE2-NEXT: movd %xmm0, %eax
1039 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1042 ; SSE41-LABEL: test_v4i8:
1044 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1045 ; SSE41-NEXT: psrld $16, %xmm1
1046 ; SSE41-NEXT: paddb %xmm0, %xmm1
1047 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1048 ; SSE41-NEXT: psrlw $8, %xmm0
1049 ; SSE41-NEXT: paddb %xmm1, %xmm0
1050 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1051 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1054 ; AVX-LABEL: test_v4i8:
1056 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1057 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1058 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1059 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1060 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1061 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1064 ; AVX512-LABEL: test_v4i8:
1066 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1067 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1068 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1069 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1070 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1071 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1073 %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
; Same reduction as the <4 x i8> register case, but the vector is loaded from %p
; first. The assertions expect a single 32-bit (v)movd load from memory feeding
; the same two shift+add steps.
1077 define i8 @test_v4i8_load(<4 x i8>* %p) {
1078 ; SSE2-LABEL: test_v4i8_load:
1080 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1081 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1082 ; SSE2-NEXT: psrld $16, %xmm1
1083 ; SSE2-NEXT: paddb %xmm0, %xmm1
1084 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1085 ; SSE2-NEXT: psrlw $8, %xmm0
1086 ; SSE2-NEXT: paddb %xmm1, %xmm0
1087 ; SSE2-NEXT: movd %xmm0, %eax
1088 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1091 ; SSE41-LABEL: test_v4i8_load:
1093 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1094 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1095 ; SSE41-NEXT: psrld $16, %xmm1
1096 ; SSE41-NEXT: paddb %xmm0, %xmm1
1097 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1098 ; SSE41-NEXT: psrlw $8, %xmm0
1099 ; SSE41-NEXT: paddb %xmm1, %xmm0
1100 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1101 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1104 ; AVX-LABEL: test_v4i8_load:
1106 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1107 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1108 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1109 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1110 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1111 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1112 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1115 ; AVX512-LABEL: test_v4i8_load:
1117 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1118 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1119 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1120 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1121 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1122 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1123 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1125 %a0 = load <4 x i8>, <4 x i8>* %p
1126 %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
; Add-reduction of an <8 x i8> argument via llvm.experimental.vector.reduce.add.v8i8.
; The assertions expect the byte sum to be formed with a single (v)psadbw against
; a zeroed register, then the low byte of the result to be extracted.
1130 define i8 @test_v8i8(<8 x i8> %a0) {
1131 ; SSE2-LABEL: test_v8i8:
1133 ; SSE2-NEXT: pxor %xmm1, %xmm1
1134 ; SSE2-NEXT: psadbw %xmm0, %xmm1
1135 ; SSE2-NEXT: movd %xmm1, %eax
1136 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1139 ; SSE41-LABEL: test_v8i8:
1141 ; SSE41-NEXT: pxor %xmm1, %xmm1
1142 ; SSE41-NEXT: psadbw %xmm0, %xmm1
1143 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1144 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1147 ; AVX-LABEL: test_v8i8:
1149 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1150 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1151 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1152 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1155 ; AVX512-LABEL: test_v8i8:
1157 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1158 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1159 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1160 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1162 %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
; Same reduction as the <8 x i8> register case, but the vector is loaded from %p
; first. The assertions expect a single 64-bit (v)movq load from memory feeding
; the zero + (v)psadbw sequence.
1166 define i8 @test_v8i8_load(<8 x i8>* %p) {
1167 ; SSE2-LABEL: test_v8i8_load:
1169 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1170 ; SSE2-NEXT: pxor %xmm1, %xmm1
1171 ; SSE2-NEXT: psadbw %xmm0, %xmm1
1172 ; SSE2-NEXT: movd %xmm1, %eax
1173 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1176 ; SSE41-LABEL: test_v8i8_load:
1178 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1179 ; SSE41-NEXT: pxor %xmm1, %xmm1
1180 ; SSE41-NEXT: psadbw %xmm0, %xmm1
1181 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1182 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1185 ; AVX-LABEL: test_v8i8_load:
1187 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1188 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1189 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1190 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1191 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1194 ; AVX512-LABEL: test_v8i8_load:
1196 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1197 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1198 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1199 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1200 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1202 %a0 = load <8 x i8>, <8 x i8>* %p
1203 %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
; Add-reduction of a <16 x i8> argument via llvm.experimental.vector.reduce.add.v16i8.
; The assertions expect the upper and lower 64-bit halves to be added with
; (v)pshufd + (v)paddb, and the remaining eight bytes summed with (v)psadbw
; against a zeroed register.
1207 define i8 @test_v16i8(<16 x i8> %a0) {
1208 ; SSE2-LABEL: test_v16i8:
1210 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1211 ; SSE2-NEXT: paddb %xmm0, %xmm1
1212 ; SSE2-NEXT: pxor %xmm0, %xmm0
1213 ; SSE2-NEXT: psadbw %xmm1, %xmm0
1214 ; SSE2-NEXT: movd %xmm0, %eax
1215 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1218 ; SSE41-LABEL: test_v16i8:
1220 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1221 ; SSE41-NEXT: paddb %xmm0, %xmm1
1222 ; SSE41-NEXT: pxor %xmm0, %xmm0
1223 ; SSE41-NEXT: psadbw %xmm1, %xmm0
1224 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1225 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1228 ; AVX-LABEL: test_v16i8:
1230 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1231 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1232 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1233 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1234 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1235 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1238 ; AVX512-LABEL: test_v16i8:
1240 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1241 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1242 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1243 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1244 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1245 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1247 %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0)
; Add-reduction of a <32 x i8> argument via llvm.experimental.vector.reduce.add.v32i8.
; The assertions expect the two 128-bit halves to be added first (two xmm
; registers for the SSE prefixes, a 128-bit extract from ymm0 for AVX1, AVX2 and
; AVX512), followed by the same pshufd/paddb/psadbw tail as the 16-byte case.
1251 define i8 @test_v32i8(<32 x i8> %a0) {
1252 ; SSE2-LABEL: test_v32i8:
1254 ; SSE2-NEXT: paddb %xmm1, %xmm0
1255 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1256 ; SSE2-NEXT: paddb %xmm0, %xmm1
1257 ; SSE2-NEXT: pxor %xmm0, %xmm0
1258 ; SSE2-NEXT: psadbw %xmm1, %xmm0
1259 ; SSE2-NEXT: movd %xmm0, %eax
1260 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1263 ; SSE41-LABEL: test_v32i8:
1265 ; SSE41-NEXT: paddb %xmm1, %xmm0
1266 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1267 ; SSE41-NEXT: paddb %xmm0, %xmm1
1268 ; SSE41-NEXT: pxor %xmm0, %xmm0
1269 ; SSE41-NEXT: psadbw %xmm1, %xmm0
1270 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1271 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1274 ; AVX1-LABEL: test_v32i8:
1276 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1277 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1278 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1279 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1280 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1281 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1282 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1283 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1284 ; AVX1-NEXT: vzeroupper
1287 ; AVX2-LABEL: test_v32i8:
1289 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1290 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1291 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1292 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1293 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1294 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1295 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1296 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1297 ; AVX2-NEXT: vzeroupper
1300 ; AVX512-LABEL: test_v32i8:
1302 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1303 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1304 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1305 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1306 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1307 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1308 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1309 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1310 ; AVX512-NEXT: vzeroupper
1312 %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0)
; Add-reduction of a <64 x i8> argument via llvm.experimental.vector.reduce.add.v64i8.
; The assertions expect the incoming registers (four xmm for the SSE prefixes, two
; ymm for AVX1 and AVX2, one zmm for AVX512) to be added down to a single xmm
; value, followed by the pshufd/paddb/psadbw tail.
1316 define i8 @test_v64i8(<64 x i8> %a0) {
1317 ; SSE2-LABEL: test_v64i8:
1319 ; SSE2-NEXT: paddb %xmm3, %xmm1
1320 ; SSE2-NEXT: paddb %xmm2, %xmm1
1321 ; SSE2-NEXT: paddb %xmm0, %xmm1
1322 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1323 ; SSE2-NEXT: paddb %xmm1, %xmm0
1324 ; SSE2-NEXT: pxor %xmm1, %xmm1
1325 ; SSE2-NEXT: psadbw %xmm0, %xmm1
1326 ; SSE2-NEXT: movd %xmm1, %eax
1327 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1330 ; SSE41-LABEL: test_v64i8:
1332 ; SSE41-NEXT: paddb %xmm3, %xmm1
1333 ; SSE41-NEXT: paddb %xmm2, %xmm1
1334 ; SSE41-NEXT: paddb %xmm0, %xmm1
1335 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1336 ; SSE41-NEXT: paddb %xmm1, %xmm0
1337 ; SSE41-NEXT: pxor %xmm1, %xmm1
1338 ; SSE41-NEXT: psadbw %xmm0, %xmm1
1339 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1340 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1343 ; AVX1-LABEL: test_v64i8:
1345 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1346 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1347 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
1348 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
1349 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1350 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1351 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1352 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1353 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1354 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1355 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1356 ; AVX1-NEXT: vzeroupper
1359 ; AVX2-LABEL: test_v64i8:
1361 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1362 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1363 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1364 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1365 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1366 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1367 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1368 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1369 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1370 ; AVX2-NEXT: vzeroupper
1373 ; AVX512-LABEL: test_v64i8:
1375 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1376 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1377 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1378 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1379 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1380 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1381 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1382 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1383 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1384 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1385 ; AVX512-NEXT: vzeroupper
1387 %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0)
; Add-reduction of a <128 x i8> argument via llvm.experimental.vector.reduce.add.v128i8.
; The assertions expect the incoming registers (eight xmm for the SSE prefixes,
; four ymm for AVX1 and AVX2, two zmm for AVX512) to be added down to a single xmm
; value, followed by the pshufd/paddb/psadbw tail.
1391 define i8 @test_v128i8(<128 x i8> %a0) {
1392 ; SSE2-LABEL: test_v128i8:
1394 ; SSE2-NEXT: paddb %xmm7, %xmm3
1395 ; SSE2-NEXT: paddb %xmm5, %xmm3
1396 ; SSE2-NEXT: paddb %xmm1, %xmm3
1397 ; SSE2-NEXT: paddb %xmm6, %xmm2
1398 ; SSE2-NEXT: paddb %xmm4, %xmm2
1399 ; SSE2-NEXT: paddb %xmm3, %xmm2
1400 ; SSE2-NEXT: paddb %xmm0, %xmm2
1401 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1402 ; SSE2-NEXT: paddb %xmm2, %xmm0
1403 ; SSE2-NEXT: pxor %xmm1, %xmm1
1404 ; SSE2-NEXT: psadbw %xmm0, %xmm1
1405 ; SSE2-NEXT: movd %xmm1, %eax
1406 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1409 ; SSE41-LABEL: test_v128i8:
1411 ; SSE41-NEXT: paddb %xmm7, %xmm3
1412 ; SSE41-NEXT: paddb %xmm5, %xmm3
1413 ; SSE41-NEXT: paddb %xmm1, %xmm3
1414 ; SSE41-NEXT: paddb %xmm6, %xmm2
1415 ; SSE41-NEXT: paddb %xmm4, %xmm2
1416 ; SSE41-NEXT: paddb %xmm3, %xmm2
1417 ; SSE41-NEXT: paddb %xmm0, %xmm2
1418 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1419 ; SSE41-NEXT: paddb %xmm2, %xmm0
1420 ; SSE41-NEXT: pxor %xmm1, %xmm1
1421 ; SSE41-NEXT: psadbw %xmm0, %xmm1
1422 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1423 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1426 ; AVX1-LABEL: test_v128i8:
1428 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1429 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1430 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
1431 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1432 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
1433 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1434 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
1435 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
1436 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
1437 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
1438 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1439 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1440 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1441 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1442 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1443 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1444 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1445 ; AVX1-NEXT: vzeroupper
1448 ; AVX2-LABEL: test_v128i8:
1450 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
1451 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
1452 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1453 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1454 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1455 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1456 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1457 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1458 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1459 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1460 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1461 ; AVX2-NEXT: vzeroupper
1464 ; AVX512-LABEL: test_v128i8:
1466 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1467 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1468 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1469 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1470 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1471 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1472 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1473 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1474 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1475 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1476 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1477 ; AVX512-NEXT: vzeroupper
1479 %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0)
; Declarations of the llvm.experimental.vector.reduce.add intrinsic overloads,
; listed by element type (i64, i32, i16, i8) and then by increasing vector length.
; Each overload takes one vector operand and yields a scalar of the element type.
1483 declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
1484 declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
1485 declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
1486 declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
1488 declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
1489 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
1490 declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
1491 declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
1492 declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
1494 declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
1495 declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
1496 declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
1497 declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
1498 declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
1499 declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
1501 declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
1502 declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
1503 declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
1504 declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
1505 declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
1506 declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
1507 declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)