; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
; ctpop of <2 x i64>: SSE uses the parallel bit-count (Hacker's Delight) or
; pshufb nibble-LUT sequence folded with psadbw; AVX512VPOPCNTDQ/BITALG use
; native popcount instructions.
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: psadbw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psadbw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}
; ctpop of <4 x i32>: byte popcount as in testv2i64, then the per-qword psadbw
; sums are split back into 32-bit lanes via punpck{l,h}dq/pmovzxdq + packuswb.
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: psadbw %xmm0, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: packuswb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: psadbw %xmm0, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: psadbw %xmm0, %xmm1
; SSSE3-NEXT: packuswb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT: psadbw %xmm1, %xmm3
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}
; ctpop of <8 x i16>: byte popcount, then the two byte counts of each word are
; summed via psllw $8 + paddb and shifted back down with psrlw $8.
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i16:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv8i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}
; ctpop of <16 x i8>: the byte popcount itself is the final answer, so no
; widening/summing step follows the bit-count or pshufb-LUT sequence.
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv16i8:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv16i8:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}
; Constant folding: ctpop(<256, -1>) = <1, 64>, materialized as a load.
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}
; Constant folding: ctpop(<256, -1, 0, 255>) = <1, 32, 0, 8>.
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv4i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}
; Constant folding of a <8 x i16> ctpop (i16 constants wrap, e.g. -65536 -> 0).
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv8i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}
; Constant folding of a <16 x i8> ctpop (i8 constants wrap, e.g. 256 -> 0).
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv16i8:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}
; ctpop intrinsic declarations, one per tested vector element width.
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)