; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
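
; Each test zero extends a narrow integer vector and sums the result with
; llvm.vector.reduce.add, so the checks below cover how the combined
; zext + add-reduction is lowered on SSE2, SSE4.1, AVX, AVX2 and AVX512.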

;
; vXi64
;

define i64 @test_v2i64_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    psrlq $32, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <2 x i32> %a0 to <2 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <4 x i16> %a0 to <4 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}
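
; For i8 sources the zext + add-reduction is matched to PSADBW against a
; zero vector, which sums each group of eight bytes into a 64-bit lane in a
; single instruction.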

define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i64_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i64_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i64_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
;

define i32 @test_v2i32_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i16> %a0 to <2 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}
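
; Only the low four bytes of the <4 x i8> argument matter here, so the upper
; bytes are cleared first (punpckldq on SSE2, pblendw with zero elsewhere)
; and a single PSADBW produces the sum.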

define i32 @test_v4i32(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i32_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i32_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i32_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i32_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
;

define i16 @test_v2i16_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i8> %a0 to <2 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2
}
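
; Once at least eight source bytes are reduced, the i16 reductions are also
; matched to PSADBW; the narrower v2i8/v4i8 cases above keep the plain zero
; extend followed by element shuffles and adds.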

define i16 @test_v8i16_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i16_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i16_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}
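
; For the 512-bit source the input is reduced one vector register at a time
; (four XMMs on SSE, two YMMs on AVX2, a single ZMM with AVX512BW) and the
; partial PSADBW sums are accumulated with PADDQ before the final horizontal
; step.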

define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
; SSE-LABEL: test_v64i16_v64i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    psadbw %xmm4, %xmm3
; SSE-NEXT:    psadbw %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    psadbw %xmm4, %xmm2
; SSE-NEXT:    psadbw %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsadbw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsadbw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <64 x i8> %a0 to <64 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)