; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
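
; These tests check the x86 lowering of the experimental add-reduction
; intrinsics. The backend expands each reduction by repeatedly folding the
; vector in half (shuffle the high half down onto the low half, add, repeat)
; and finally moving lane 0 into a GPR. A minimal IR sketch of one v4i32
; folding step (illustrative only; the real expansion happens during
; SelectionDAG legalization, and the value names here are hypothetical):
;   %high = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;   %sum  = add <4 x i32> %v, %high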

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}
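
; Semantically, the reduction above is just lane-wise scalar addition. An
; equivalent scalar expansion (hypothetical names, for illustration only):
;   %lo  = extractelement <2 x i64> %a0, i32 0
;   %hi  = extractelement <2 x i64> %a0, i32 1
;   %sum = add i64 %lo, %hi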

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm5, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm2
; SSE-NEXT:    paddq %xmm3, %xmm2
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    paddd %xmm5, %xmm3
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    paddd %xmm4, %xmm2
; SSE-NEXT:    paddd %xmm3, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
  ret i16 %1
}
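
; Note: the "# kill: def $ax killed $ax killed $eax" lines above are
; register-allocator annotations, not instructions. The i16 result is
; returned in %ax, the low sub-register of the %eax that movd/vmovd wrote,
; so the remaining bits of %eax are marked dead.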

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm6, %xmm2
; SSE-NEXT:    paddw %xmm7, %xmm3
; SSE-NEXT:    paddw %xmm5, %xmm3
; SSE-NEXT:    paddw %xmm1, %xmm3
; SSE-NEXT:    paddw %xmm4, %xmm2
; SSE-NEXT:    paddw %xmm3, %xmm2
; SSE-NEXT:    paddw %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddb %xmm6, %xmm2
; SSE2-NEXT:    paddb %xmm7, %xmm3
; SSE2-NEXT:    paddb %xmm5, %xmm3
; SSE2-NEXT:    paddb %xmm1, %xmm3
; SSE2-NEXT:    paddb %xmm4, %xmm2
; SSE2-NEXT:    paddb %xmm3, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v128i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddb %xmm6, %xmm2
; SSE41-NEXT:    paddb %xmm7, %xmm3
; SSE41-NEXT:    paddb %xmm5, %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm2
; SSE41-NEXT:    paddb %xmm3, %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT:    paddb %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v128i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v128i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
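
; A front end would materialize one of these reductions by declaring and
; calling the intrinsic directly. A minimal sketch, kept in comments so it is
; not part of the checked tests (function name is hypothetical):
;   define i32 @sum_v4i32(<4 x i32> %v) {
;     %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v)
;     ret i32 %r
;   }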