1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQVL
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
15 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
; testv2i64 - cttz on a <2 x i64> argument with the i1 flag operand set to 0
; (see the call at the end of the body). Per the LLVM Language Reference that
; flag selects whether a zero input gives a poison result; 0 means it does not.
;
; The expected code below is autogenerated (see the NOTE on the first line of
; this file), one group per FileCheck prefix; prefer regenerating it with the
; script over hand edits. Every group begins with the same three steps -
; all-ones via pcmpeqd, add to the input (x - 1), then pandn (~x & (x - 1)) -
; which leaves a mask covering the trailing zero bits. After that:
;   - the SSE2 and SSE3 groups popcount the mask with shift/mask/add steps
;     followed by psadbw;
;   - the SSSE3, SSE41, AVX1, AVX2 and X32-SSE groups use a pshufb nibble
;     lookup table ([0,1,1,2,...]) followed by psadbw;
;   - the AVX512CD groups use vplzcntq and subtract the result from [64,64];
;   - the VPOPCNTDQ groups use vpopcntq directly;
;   - the BITALG groups use vpopcntb followed by vpsadbw.
18 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
19 ; SSE2-LABEL: testv2i64:
21 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
22 ; SSE2-NEXT: paddq %xmm0, %xmm1
23 ; SSE2-NEXT: pandn %xmm1, %xmm0
24 ; SSE2-NEXT: movdqa %xmm0, %xmm1
25 ; SSE2-NEXT: psrlw $1, %xmm1
26 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
27 ; SSE2-NEXT: psubb %xmm1, %xmm0
28 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
29 ; SSE2-NEXT: movdqa %xmm0, %xmm2
30 ; SSE2-NEXT: pand %xmm1, %xmm2
31 ; SSE2-NEXT: psrlw $2, %xmm0
32 ; SSE2-NEXT: pand %xmm1, %xmm0
33 ; SSE2-NEXT: paddb %xmm2, %xmm0
34 ; SSE2-NEXT: movdqa %xmm0, %xmm1
35 ; SSE2-NEXT: psrlw $4, %xmm1
36 ; SSE2-NEXT: paddb %xmm0, %xmm1
37 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
38 ; SSE2-NEXT: pxor %xmm0, %xmm0
39 ; SSE2-NEXT: psadbw %xmm0, %xmm1
40 ; SSE2-NEXT: movdqa %xmm1, %xmm0
43 ; SSE3-LABEL: testv2i64:
45 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
46 ; SSE3-NEXT: paddq %xmm0, %xmm1
47 ; SSE3-NEXT: pandn %xmm1, %xmm0
48 ; SSE3-NEXT: movdqa %xmm0, %xmm1
49 ; SSE3-NEXT: psrlw $1, %xmm1
50 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
51 ; SSE3-NEXT: psubb %xmm1, %xmm0
52 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
53 ; SSE3-NEXT: movdqa %xmm0, %xmm2
54 ; SSE3-NEXT: pand %xmm1, %xmm2
55 ; SSE3-NEXT: psrlw $2, %xmm0
56 ; SSE3-NEXT: pand %xmm1, %xmm0
57 ; SSE3-NEXT: paddb %xmm2, %xmm0
58 ; SSE3-NEXT: movdqa %xmm0, %xmm1
59 ; SSE3-NEXT: psrlw $4, %xmm1
60 ; SSE3-NEXT: paddb %xmm0, %xmm1
61 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
62 ; SSE3-NEXT: pxor %xmm0, %xmm0
63 ; SSE3-NEXT: psadbw %xmm0, %xmm1
64 ; SSE3-NEXT: movdqa %xmm1, %xmm0
67 ; SSSE3-LABEL: testv2i64:
69 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
70 ; SSSE3-NEXT: paddq %xmm0, %xmm1
71 ; SSSE3-NEXT: pandn %xmm1, %xmm0
72 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
73 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
74 ; SSSE3-NEXT: pand %xmm1, %xmm2
75 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
76 ; SSSE3-NEXT: movdqa %xmm3, %xmm4
77 ; SSSE3-NEXT: pshufb %xmm2, %xmm4
78 ; SSSE3-NEXT: psrlw $4, %xmm0
79 ; SSSE3-NEXT: pand %xmm1, %xmm0
80 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
81 ; SSSE3-NEXT: paddb %xmm4, %xmm3
82 ; SSSE3-NEXT: pxor %xmm0, %xmm0
83 ; SSSE3-NEXT: psadbw %xmm3, %xmm0
86 ; SSE41-LABEL: testv2i64:
88 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
89 ; SSE41-NEXT: paddq %xmm0, %xmm1
90 ; SSE41-NEXT: pandn %xmm1, %xmm0
91 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
92 ; SSE41-NEXT: movdqa %xmm0, %xmm2
93 ; SSE41-NEXT: pand %xmm1, %xmm2
94 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
95 ; SSE41-NEXT: movdqa %xmm3, %xmm4
96 ; SSE41-NEXT: pshufb %xmm2, %xmm4
97 ; SSE41-NEXT: psrlw $4, %xmm0
98 ; SSE41-NEXT: pand %xmm1, %xmm0
99 ; SSE41-NEXT: pshufb %xmm0, %xmm3
100 ; SSE41-NEXT: paddb %xmm4, %xmm3
101 ; SSE41-NEXT: pxor %xmm0, %xmm0
102 ; SSE41-NEXT: psadbw %xmm3, %xmm0
105 ; AVX1-LABEL: testv2i64:
107 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
108 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
109 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
110 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
111 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
112 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
113 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
114 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
115 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
116 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
117 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
118 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
119 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
122 ; AVX2-LABEL: testv2i64:
124 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
125 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
126 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
127 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
128 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
129 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
130 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
131 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
132 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
133 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
134 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
135 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
136 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
139 ; AVX512CDVL-LABEL: testv2i64:
140 ; AVX512CDVL: # %bb.0:
141 ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
142 ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1
143 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
144 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0
145 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
146 ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0
147 ; AVX512CDVL-NEXT: retq
149 ; AVX512CD-LABEL: testv2i64:
151 ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
152 ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1
153 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
154 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
155 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
156 ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
157 ; AVX512CD-NEXT: vzeroupper
158 ; AVX512CD-NEXT: retq
160 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
161 ; AVX512VPOPCNTDQ: # %bb.0:
162 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
163 ; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm1
164 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
165 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
166 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
167 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
168 ; AVX512VPOPCNTDQ-NEXT: retq
170 ; AVX512VPOPCNTDQVL-LABEL: testv2i64:
171 ; AVX512VPOPCNTDQVL: # %bb.0:
172 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
173 ; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1
174 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
175 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
176 ; AVX512VPOPCNTDQVL-NEXT: retq
178 ; BITALG_NOVLX-LABEL: testv2i64:
179 ; BITALG_NOVLX: # %bb.0:
180 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
181 ; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1
182 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
183 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
184 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
185 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
186 ; BITALG_NOVLX-NEXT: vzeroupper
187 ; BITALG_NOVLX-NEXT: retq
189 ; BITALG-LABEL: testv2i64:
191 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
192 ; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
193 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
194 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
195 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
196 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
199 ; X32-SSE-LABEL: testv2i64:
201 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
202 ; X32-SSE-NEXT: paddq %xmm0, %xmm1
203 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
204 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
205 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
206 ; X32-SSE-NEXT: pand %xmm1, %xmm2
207 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
208 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
209 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
210 ; X32-SSE-NEXT: psrlw $4, %xmm0
211 ; X32-SSE-NEXT: pand %xmm1, %xmm0
212 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
213 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
214 ; X32-SSE-NEXT: pxor %xmm0, %xmm0
215 ; X32-SSE-NEXT: psadbw %xmm3, %xmm0
217 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
; testv2i64u - cttz on a <2 x i64> argument with the i1 flag operand set to -1
; (i.e. true; see the call at the end of the body). Per the LLVM Language
; Reference a true flag means a zero input gives a poison result, so the
; backend is free to pick any lowering for that case.
;
; The expected code below is autogenerated (see the NOTE on the first line of
; this file), one group per FileCheck prefix; prefer regenerating it with the
; script over hand edits. In this chunk the expected sequences appear to match
; those of testv2i64 instruction for instruction: the ~x & (x - 1) mask is
; built with pcmpeqd / paddq / pandn, then either popcounted (bit tricks,
; pshufb lookup table, vpopcntq or vpopcntb, depending on the feature set) or,
; in the AVX512CD groups, fed to vplzcntq and subtracted from [64,64].
221 define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
222 ; SSE2-LABEL: testv2i64u:
224 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
225 ; SSE2-NEXT: paddq %xmm0, %xmm1
226 ; SSE2-NEXT: pandn %xmm1, %xmm0
227 ; SSE2-NEXT: movdqa %xmm0, %xmm1
228 ; SSE2-NEXT: psrlw $1, %xmm1
229 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
230 ; SSE2-NEXT: psubb %xmm1, %xmm0
231 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
232 ; SSE2-NEXT: movdqa %xmm0, %xmm2
233 ; SSE2-NEXT: pand %xmm1, %xmm2
234 ; SSE2-NEXT: psrlw $2, %xmm0
235 ; SSE2-NEXT: pand %xmm1, %xmm0
236 ; SSE2-NEXT: paddb %xmm2, %xmm0
237 ; SSE2-NEXT: movdqa %xmm0, %xmm1
238 ; SSE2-NEXT: psrlw $4, %xmm1
239 ; SSE2-NEXT: paddb %xmm0, %xmm1
240 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
241 ; SSE2-NEXT: pxor %xmm0, %xmm0
242 ; SSE2-NEXT: psadbw %xmm0, %xmm1
243 ; SSE2-NEXT: movdqa %xmm1, %xmm0
246 ; SSE3-LABEL: testv2i64u:
248 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
249 ; SSE3-NEXT: paddq %xmm0, %xmm1
250 ; SSE3-NEXT: pandn %xmm1, %xmm0
251 ; SSE3-NEXT: movdqa %xmm0, %xmm1
252 ; SSE3-NEXT: psrlw $1, %xmm1
253 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
254 ; SSE3-NEXT: psubb %xmm1, %xmm0
255 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
256 ; SSE3-NEXT: movdqa %xmm0, %xmm2
257 ; SSE3-NEXT: pand %xmm1, %xmm2
258 ; SSE3-NEXT: psrlw $2, %xmm0
259 ; SSE3-NEXT: pand %xmm1, %xmm0
260 ; SSE3-NEXT: paddb %xmm2, %xmm0
261 ; SSE3-NEXT: movdqa %xmm0, %xmm1
262 ; SSE3-NEXT: psrlw $4, %xmm1
263 ; SSE3-NEXT: paddb %xmm0, %xmm1
264 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
265 ; SSE3-NEXT: pxor %xmm0, %xmm0
266 ; SSE3-NEXT: psadbw %xmm0, %xmm1
267 ; SSE3-NEXT: movdqa %xmm1, %xmm0
270 ; SSSE3-LABEL: testv2i64u:
272 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
273 ; SSSE3-NEXT: paddq %xmm0, %xmm1
274 ; SSSE3-NEXT: pandn %xmm1, %xmm0
275 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
276 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
277 ; SSSE3-NEXT: pand %xmm1, %xmm2
278 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
279 ; SSSE3-NEXT: movdqa %xmm3, %xmm4
280 ; SSSE3-NEXT: pshufb %xmm2, %xmm4
281 ; SSSE3-NEXT: psrlw $4, %xmm0
282 ; SSSE3-NEXT: pand %xmm1, %xmm0
283 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
284 ; SSSE3-NEXT: paddb %xmm4, %xmm3
285 ; SSSE3-NEXT: pxor %xmm0, %xmm0
286 ; SSSE3-NEXT: psadbw %xmm3, %xmm0
289 ; SSE41-LABEL: testv2i64u:
291 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
292 ; SSE41-NEXT: paddq %xmm0, %xmm1
293 ; SSE41-NEXT: pandn %xmm1, %xmm0
294 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
295 ; SSE41-NEXT: movdqa %xmm0, %xmm2
296 ; SSE41-NEXT: pand %xmm1, %xmm2
297 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
298 ; SSE41-NEXT: movdqa %xmm3, %xmm4
299 ; SSE41-NEXT: pshufb %xmm2, %xmm4
300 ; SSE41-NEXT: psrlw $4, %xmm0
301 ; SSE41-NEXT: pand %xmm1, %xmm0
302 ; SSE41-NEXT: pshufb %xmm0, %xmm3
303 ; SSE41-NEXT: paddb %xmm4, %xmm3
304 ; SSE41-NEXT: pxor %xmm0, %xmm0
305 ; SSE41-NEXT: psadbw %xmm3, %xmm0
308 ; AVX1-LABEL: testv2i64u:
310 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
311 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
312 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
313 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
314 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
315 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
316 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
317 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
318 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
319 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
320 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
321 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
322 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
325 ; AVX2-LABEL: testv2i64u:
327 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
328 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
329 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
330 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
331 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
332 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
333 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
334 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
335 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
336 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
337 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
338 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
339 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
342 ; AVX512CDVL-LABEL: testv2i64u:
343 ; AVX512CDVL: # %bb.0:
344 ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
345 ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1
346 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
347 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0
348 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
349 ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0
350 ; AVX512CDVL-NEXT: retq
352 ; AVX512CD-LABEL: testv2i64u:
354 ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
355 ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1
356 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
357 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
358 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
359 ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
360 ; AVX512CD-NEXT: vzeroupper
361 ; AVX512CD-NEXT: retq
363 ; AVX512VPOPCNTDQ-LABEL: testv2i64u:
364 ; AVX512VPOPCNTDQ: # %bb.0:
365 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
366 ; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm1
367 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
368 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
369 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
370 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
371 ; AVX512VPOPCNTDQ-NEXT: retq
373 ; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
374 ; AVX512VPOPCNTDQVL: # %bb.0:
375 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
376 ; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1
377 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
378 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
379 ; AVX512VPOPCNTDQVL-NEXT: retq
381 ; BITALG_NOVLX-LABEL: testv2i64u:
382 ; BITALG_NOVLX: # %bb.0:
383 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
384 ; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1
385 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
386 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
387 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
388 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
389 ; BITALG_NOVLX-NEXT: vzeroupper
390 ; BITALG_NOVLX-NEXT: retq
392 ; BITALG-LABEL: testv2i64u:
394 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
395 ; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
396 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
397 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
398 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
399 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
402 ; X32-SSE-LABEL: testv2i64u:
404 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
405 ; X32-SSE-NEXT: paddq %xmm0, %xmm1
406 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
407 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
408 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
409 ; X32-SSE-NEXT: pand %xmm1, %xmm2
410 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
411 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
412 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
413 ; X32-SSE-NEXT: psrlw $4, %xmm0
414 ; X32-SSE-NEXT: pand %xmm1, %xmm0
415 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
416 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
417 ; X32-SSE-NEXT: pxor %xmm0, %xmm0
418 ; X32-SSE-NEXT: psadbw %xmm3, %xmm0
420 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
; testv4i32 - cttz on a <4 x i32> argument with the i1 flag operand set to 0
; (see the call at the end of the body). Per the LLVM Language Reference that
; flag selects whether a zero input gives a poison result; 0 means it does not.
;
; The expected code below is autogenerated (see the NOTE on the first line of
; this file), one group per FileCheck prefix; prefer regenerating it with the
; script over hand edits. Every group begins by building ~x & (x - 1) with
; pcmpeqd / paddd / pandn, which leaves a mask covering the trailing zero
; bits. After that:
;   - the SSE2 and SSE3 groups popcount bytes with shift/mask/add steps; the
;     SSSE3, SSE41, AVX1, AVX2 and X32-SSE groups use a pshufb nibble lookup
;     table ([0,1,1,2,...]) instead. Both then interleave the low and high
;     dword pairs with zero (punpckldq or pmovzxdq, and punpckhdq), run
;     psadbw on each half and recombine the halves with packuswb;
;   - the AVX512CD groups use vplzcntd and subtract the result from
;     [32,32,32,32];
;   - the VPOPCNTDQ groups use vpopcntd directly;
;   - the BITALG groups use vpopcntb followed by the same unpack / vpsadbw /
;     vpackuswb tail as the AVX1 and AVX2 groups.
424 define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
425 ; SSE2-LABEL: testv4i32:
427 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
428 ; SSE2-NEXT: paddd %xmm0, %xmm1
429 ; SSE2-NEXT: pandn %xmm1, %xmm0
430 ; SSE2-NEXT: movdqa %xmm0, %xmm1
431 ; SSE2-NEXT: psrlw $1, %xmm1
432 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
433 ; SSE2-NEXT: psubb %xmm1, %xmm0
434 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
435 ; SSE2-NEXT: movdqa %xmm0, %xmm2
436 ; SSE2-NEXT: pand %xmm1, %xmm2
437 ; SSE2-NEXT: psrlw $2, %xmm0
438 ; SSE2-NEXT: pand %xmm1, %xmm0
439 ; SSE2-NEXT: paddb %xmm2, %xmm0
440 ; SSE2-NEXT: movdqa %xmm0, %xmm1
441 ; SSE2-NEXT: psrlw $4, %xmm1
442 ; SSE2-NEXT: paddb %xmm0, %xmm1
443 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
444 ; SSE2-NEXT: pxor %xmm0, %xmm0
445 ; SSE2-NEXT: movdqa %xmm1, %xmm2
446 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
447 ; SSE2-NEXT: psadbw %xmm0, %xmm2
448 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
449 ; SSE2-NEXT: psadbw %xmm0, %xmm1
450 ; SSE2-NEXT: packuswb %xmm2, %xmm1
451 ; SSE2-NEXT: movdqa %xmm1, %xmm0
454 ; SSE3-LABEL: testv4i32:
456 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
457 ; SSE3-NEXT: paddd %xmm0, %xmm1
458 ; SSE3-NEXT: pandn %xmm1, %xmm0
459 ; SSE3-NEXT: movdqa %xmm0, %xmm1
460 ; SSE3-NEXT: psrlw $1, %xmm1
461 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
462 ; SSE3-NEXT: psubb %xmm1, %xmm0
463 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
464 ; SSE3-NEXT: movdqa %xmm0, %xmm2
465 ; SSE3-NEXT: pand %xmm1, %xmm2
466 ; SSE3-NEXT: psrlw $2, %xmm0
467 ; SSE3-NEXT: pand %xmm1, %xmm0
468 ; SSE3-NEXT: paddb %xmm2, %xmm0
469 ; SSE3-NEXT: movdqa %xmm0, %xmm1
470 ; SSE3-NEXT: psrlw $4, %xmm1
471 ; SSE3-NEXT: paddb %xmm0, %xmm1
472 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
473 ; SSE3-NEXT: pxor %xmm0, %xmm0
474 ; SSE3-NEXT: movdqa %xmm1, %xmm2
475 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
476 ; SSE3-NEXT: psadbw %xmm0, %xmm2
477 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
478 ; SSE3-NEXT: psadbw %xmm0, %xmm1
479 ; SSE3-NEXT: packuswb %xmm2, %xmm1
480 ; SSE3-NEXT: movdqa %xmm1, %xmm0
483 ; SSSE3-LABEL: testv4i32:
485 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
486 ; SSSE3-NEXT: paddd %xmm0, %xmm1
487 ; SSSE3-NEXT: pandn %xmm1, %xmm0
488 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
489 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
490 ; SSSE3-NEXT: pand %xmm2, %xmm3
491 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
492 ; SSSE3-NEXT: movdqa %xmm1, %xmm4
493 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
494 ; SSSE3-NEXT: psrlw $4, %xmm0
495 ; SSSE3-NEXT: pand %xmm2, %xmm0
496 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
497 ; SSSE3-NEXT: paddb %xmm4, %xmm1
498 ; SSSE3-NEXT: pxor %xmm0, %xmm0
499 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
500 ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
501 ; SSSE3-NEXT: psadbw %xmm0, %xmm2
502 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
503 ; SSSE3-NEXT: psadbw %xmm0, %xmm1
504 ; SSSE3-NEXT: packuswb %xmm2, %xmm1
505 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
508 ; SSE41-LABEL: testv4i32:
510 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
511 ; SSE41-NEXT: paddd %xmm0, %xmm1
512 ; SSE41-NEXT: pandn %xmm1, %xmm0
513 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
514 ; SSE41-NEXT: movdqa %xmm0, %xmm2
515 ; SSE41-NEXT: pand %xmm1, %xmm2
516 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
517 ; SSE41-NEXT: movdqa %xmm3, %xmm4
518 ; SSE41-NEXT: pshufb %xmm2, %xmm4
519 ; SSE41-NEXT: psrlw $4, %xmm0
520 ; SSE41-NEXT: pand %xmm1, %xmm0
521 ; SSE41-NEXT: pshufb %xmm0, %xmm3
522 ; SSE41-NEXT: paddb %xmm4, %xmm3
523 ; SSE41-NEXT: pxor %xmm1, %xmm1
524 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
525 ; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
526 ; SSE41-NEXT: psadbw %xmm1, %xmm3
527 ; SSE41-NEXT: psadbw %xmm1, %xmm0
528 ; SSE41-NEXT: packuswb %xmm3, %xmm0
531 ; AVX1-LABEL: testv4i32:
533 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
534 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
535 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
536 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
537 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
538 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
539 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
540 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
541 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
542 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
543 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
544 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
545 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
546 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
547 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
548 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
549 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
552 ; AVX2-LABEL: testv4i32:
554 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
555 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
556 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
557 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
558 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
559 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
560 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
561 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
562 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
563 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
564 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
565 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
566 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
567 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
568 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
569 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
570 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
573 ; AVX512CDVL-LABEL: testv4i32:
574 ; AVX512CDVL: # %bb.0:
575 ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
576 ; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
577 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
578 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0
579 ; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
580 ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
581 ; AVX512CDVL-NEXT: retq
583 ; AVX512CD-LABEL: testv4i32:
585 ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
586 ; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1
587 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
588 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
589 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
590 ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
591 ; AVX512CD-NEXT: vzeroupper
592 ; AVX512CD-NEXT: retq
594 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
595 ; AVX512VPOPCNTDQ: # %bb.0:
596 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
597 ; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm1
598 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
599 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
600 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
601 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
602 ; AVX512VPOPCNTDQ-NEXT: retq
604 ; AVX512VPOPCNTDQVL-LABEL: testv4i32:
605 ; AVX512VPOPCNTDQVL: # %bb.0:
606 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
607 ; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
608 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
609 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
610 ; AVX512VPOPCNTDQVL-NEXT: retq
612 ; BITALG_NOVLX-LABEL: testv4i32:
613 ; BITALG_NOVLX: # %bb.0:
614 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
615 ; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
616 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
617 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
618 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
619 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
620 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
621 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
622 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
623 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
624 ; BITALG_NOVLX-NEXT: vzeroupper
625 ; BITALG_NOVLX-NEXT: retq
627 ; BITALG-LABEL: testv4i32:
629 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
630 ; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1
631 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
632 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
633 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
634 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
635 ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
636 ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
637 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
638 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
641 ; X32-SSE-LABEL: testv4i32:
643 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
644 ; X32-SSE-NEXT: paddd %xmm0, %xmm1
645 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
646 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
647 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
648 ; X32-SSE-NEXT: pand %xmm1, %xmm2
649 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
650 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
651 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
652 ; X32-SSE-NEXT: psrlw $4, %xmm0
653 ; X32-SSE-NEXT: pand %xmm1, %xmm0
654 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
655 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
656 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
657 ; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
658 ; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
659 ; X32-SSE-NEXT: psadbw %xmm1, %xmm3
660 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0
661 ; X32-SSE-NEXT: packuswb %xmm3, %xmm0
663 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
667 define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
668 ; SSE2-LABEL: testv4i32u:
670 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
671 ; SSE2-NEXT: paddd %xmm0, %xmm1
672 ; SSE2-NEXT: pandn %xmm1, %xmm0
673 ; SSE2-NEXT: movdqa %xmm0, %xmm1
674 ; SSE2-NEXT: psrlw $1, %xmm1
675 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
676 ; SSE2-NEXT: psubb %xmm1, %xmm0
677 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
678 ; SSE2-NEXT: movdqa %xmm0, %xmm2
679 ; SSE2-NEXT: pand %xmm1, %xmm2
680 ; SSE2-NEXT: psrlw $2, %xmm0
681 ; SSE2-NEXT: pand %xmm1, %xmm0
682 ; SSE2-NEXT: paddb %xmm2, %xmm0
683 ; SSE2-NEXT: movdqa %xmm0, %xmm1
684 ; SSE2-NEXT: psrlw $4, %xmm1
685 ; SSE2-NEXT: paddb %xmm0, %xmm1
686 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
687 ; SSE2-NEXT: pxor %xmm0, %xmm0
688 ; SSE2-NEXT: movdqa %xmm1, %xmm2
689 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
690 ; SSE2-NEXT: psadbw %xmm0, %xmm2
691 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
692 ; SSE2-NEXT: psadbw %xmm0, %xmm1
693 ; SSE2-NEXT: packuswb %xmm2, %xmm1
694 ; SSE2-NEXT: movdqa %xmm1, %xmm0
697 ; SSE3-LABEL: testv4i32u:
699 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
700 ; SSE3-NEXT: paddd %xmm0, %xmm1
701 ; SSE3-NEXT: pandn %xmm1, %xmm0
702 ; SSE3-NEXT: movdqa %xmm0, %xmm1
703 ; SSE3-NEXT: psrlw $1, %xmm1
704 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
705 ; SSE3-NEXT: psubb %xmm1, %xmm0
706 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
707 ; SSE3-NEXT: movdqa %xmm0, %xmm2
708 ; SSE3-NEXT: pand %xmm1, %xmm2
709 ; SSE3-NEXT: psrlw $2, %xmm0
710 ; SSE3-NEXT: pand %xmm1, %xmm0
711 ; SSE3-NEXT: paddb %xmm2, %xmm0
712 ; SSE3-NEXT: movdqa %xmm0, %xmm1
713 ; SSE3-NEXT: psrlw $4, %xmm1
714 ; SSE3-NEXT: paddb %xmm0, %xmm1
715 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
716 ; SSE3-NEXT: pxor %xmm0, %xmm0
717 ; SSE3-NEXT: movdqa %xmm1, %xmm2
718 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
719 ; SSE3-NEXT: psadbw %xmm0, %xmm2
720 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
721 ; SSE3-NEXT: psadbw %xmm0, %xmm1
722 ; SSE3-NEXT: packuswb %xmm2, %xmm1
723 ; SSE3-NEXT: movdqa %xmm1, %xmm0
726 ; SSSE3-LABEL: testv4i32u:
728 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
729 ; SSSE3-NEXT: paddd %xmm0, %xmm1
730 ; SSSE3-NEXT: pandn %xmm1, %xmm0
731 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
732 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
733 ; SSSE3-NEXT: pand %xmm2, %xmm3
734 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
735 ; SSSE3-NEXT: movdqa %xmm1, %xmm4
736 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
737 ; SSSE3-NEXT: psrlw $4, %xmm0
738 ; SSSE3-NEXT: pand %xmm2, %xmm0
739 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
740 ; SSSE3-NEXT: paddb %xmm4, %xmm1
741 ; SSSE3-NEXT: pxor %xmm0, %xmm0
742 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
743 ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
744 ; SSSE3-NEXT: psadbw %xmm0, %xmm2
745 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
746 ; SSSE3-NEXT: psadbw %xmm0, %xmm1
747 ; SSSE3-NEXT: packuswb %xmm2, %xmm1
748 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
751 ; SSE41-LABEL: testv4i32u:
753 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
754 ; SSE41-NEXT: paddd %xmm0, %xmm1
755 ; SSE41-NEXT: pandn %xmm1, %xmm0
756 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
757 ; SSE41-NEXT: movdqa %xmm0, %xmm2
758 ; SSE41-NEXT: pand %xmm1, %xmm2
759 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
760 ; SSE41-NEXT: movdqa %xmm3, %xmm4
761 ; SSE41-NEXT: pshufb %xmm2, %xmm4
762 ; SSE41-NEXT: psrlw $4, %xmm0
763 ; SSE41-NEXT: pand %xmm1, %xmm0
764 ; SSE41-NEXT: pshufb %xmm0, %xmm3
765 ; SSE41-NEXT: paddb %xmm4, %xmm3
766 ; SSE41-NEXT: pxor %xmm1, %xmm1
767 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
768 ; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
769 ; SSE41-NEXT: psadbw %xmm1, %xmm3
770 ; SSE41-NEXT: psadbw %xmm1, %xmm0
771 ; SSE41-NEXT: packuswb %xmm3, %xmm0
774 ; AVX1-LABEL: testv4i32u:
776 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
777 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
778 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
779 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
780 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
781 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
782 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
783 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
784 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
785 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
786 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
787 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
788 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
789 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
790 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
791 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
792 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
795 ; AVX2-LABEL: testv4i32u:
797 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
798 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
799 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
800 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
801 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
802 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
803 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
804 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
805 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
806 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
807 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
808 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
809 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
810 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
811 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
812 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
813 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
816 ; AVX512CDVL-LABEL: testv4i32u:
817 ; AVX512CDVL: # %bb.0:
818 ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
819 ; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
820 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
821 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0
822 ; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
823 ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
824 ; AVX512CDVL-NEXT: retq
826 ; AVX512CD-LABEL: testv4i32u:
828 ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
829 ; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1
830 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
831 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
832 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
833 ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
834 ; AVX512CD-NEXT: vzeroupper
835 ; AVX512CD-NEXT: retq
837 ; AVX512VPOPCNTDQ-LABEL: testv4i32u:
838 ; AVX512VPOPCNTDQ: # %bb.0:
839 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
840 ; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm1
841 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
842 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
843 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
844 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
845 ; AVX512VPOPCNTDQ-NEXT: retq
847 ; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
848 ; AVX512VPOPCNTDQVL: # %bb.0:
849 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
850 ; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
851 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
852 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
853 ; AVX512VPOPCNTDQVL-NEXT: retq
855 ; BITALG_NOVLX-LABEL: testv4i32u:
856 ; BITALG_NOVLX: # %bb.0:
857 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
858 ; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
859 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
860 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
861 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
862 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
863 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
864 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
865 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
866 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
867 ; BITALG_NOVLX-NEXT: vzeroupper
868 ; BITALG_NOVLX-NEXT: retq
870 ; BITALG-LABEL: testv4i32u:
872 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
873 ; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1
874 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
875 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
876 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
877 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
878 ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
879 ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
880 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
881 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
884 ; X32-SSE-LABEL: testv4i32u:
886 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
887 ; X32-SSE-NEXT: paddd %xmm0, %xmm1
888 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
889 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
890 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
891 ; X32-SSE-NEXT: pand %xmm1, %xmm2
892 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
893 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
894 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
895 ; X32-SSE-NEXT: psrlw $4, %xmm0
896 ; X32-SSE-NEXT: pand %xmm1, %xmm0
897 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
898 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
899 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
900 ; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
901 ; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
902 ; X32-SSE-NEXT: psadbw %xmm1, %xmm3
903 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0
904 ; X32-SSE-NEXT: packuswb %xmm3, %xmm0
906 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
; testv8i16 - count trailing zeros in every i16 lane of %in through
; @llvm.cttz.v8i16, with `i1 0` as the second operand (per the LLVM LangRef
; that flag states whether a zero input gives an undefined result; 0 means a
; zero lane must still produce a defined count).
; Each expected sequence below first forms ~%in & (%in - 1): all-ones from
; pcmpeqd, a word add of that -1, then pandn. It then takes a population
; count of the result:
;   * SSE2 and SSE3 use a shift/mask bit trick (the 51 = 0x33 byte mask) and
;     fold the two byte counts of each word with psllw/paddb/psrlw $8.
;   * SSSE3, SSE4.1, AVX and the 32-bit run use a pshufb lookup into the
;     16-entry nibble popcount table [0,1,1,2,...,4], then the same fold.
;   * The VPOPCNTDQ runs widen to dwords (vpmovzxwd), use vpopcntd and
;     truncate back with vpmovdw.
;   * The BITALG runs use vpopcntw directly.
910 define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
911 ; SSE2-LABEL: testv8i16:
913 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
914 ; SSE2-NEXT: paddw %xmm0, %xmm1
915 ; SSE2-NEXT: pandn %xmm1, %xmm0
916 ; SSE2-NEXT: movdqa %xmm0, %xmm1
917 ; SSE2-NEXT: psrlw $1, %xmm1
918 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
919 ; SSE2-NEXT: psubb %xmm1, %xmm0
920 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
921 ; SSE2-NEXT: movdqa %xmm0, %xmm2
922 ; SSE2-NEXT: pand %xmm1, %xmm2
923 ; SSE2-NEXT: psrlw $2, %xmm0
924 ; SSE2-NEXT: pand %xmm1, %xmm0
925 ; SSE2-NEXT: paddb %xmm2, %xmm0
926 ; SSE2-NEXT: movdqa %xmm0, %xmm1
927 ; SSE2-NEXT: psrlw $4, %xmm1
928 ; SSE2-NEXT: paddb %xmm0, %xmm1
929 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
930 ; SSE2-NEXT: movdqa %xmm1, %xmm0
931 ; SSE2-NEXT: psllw $8, %xmm0
932 ; SSE2-NEXT: paddb %xmm1, %xmm0
933 ; SSE2-NEXT: psrlw $8, %xmm0
936 ; SSE3-LABEL: testv8i16:
938 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
939 ; SSE3-NEXT: paddw %xmm0, %xmm1
940 ; SSE3-NEXT: pandn %xmm1, %xmm0
941 ; SSE3-NEXT: movdqa %xmm0, %xmm1
942 ; SSE3-NEXT: psrlw $1, %xmm1
943 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
944 ; SSE3-NEXT: psubb %xmm1, %xmm0
945 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
946 ; SSE3-NEXT: movdqa %xmm0, %xmm2
947 ; SSE3-NEXT: pand %xmm1, %xmm2
948 ; SSE3-NEXT: psrlw $2, %xmm0
949 ; SSE3-NEXT: pand %xmm1, %xmm0
950 ; SSE3-NEXT: paddb %xmm2, %xmm0
951 ; SSE3-NEXT: movdqa %xmm0, %xmm1
952 ; SSE3-NEXT: psrlw $4, %xmm1
953 ; SSE3-NEXT: paddb %xmm0, %xmm1
954 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
955 ; SSE3-NEXT: movdqa %xmm1, %xmm0
956 ; SSE3-NEXT: psllw $8, %xmm0
957 ; SSE3-NEXT: paddb %xmm1, %xmm0
958 ; SSE3-NEXT: psrlw $8, %xmm0
961 ; SSSE3-LABEL: testv8i16:
963 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
964 ; SSSE3-NEXT: paddw %xmm0, %xmm1
965 ; SSSE3-NEXT: pandn %xmm1, %xmm0
966 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
967 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
968 ; SSSE3-NEXT: pand %xmm1, %xmm2
969 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
970 ; SSSE3-NEXT: movdqa %xmm3, %xmm4
971 ; SSSE3-NEXT: pshufb %xmm2, %xmm4
972 ; SSSE3-NEXT: psrlw $4, %xmm0
973 ; SSSE3-NEXT: pand %xmm1, %xmm0
974 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
975 ; SSSE3-NEXT: paddb %xmm4, %xmm3
976 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
977 ; SSSE3-NEXT: psllw $8, %xmm0
978 ; SSSE3-NEXT: paddb %xmm3, %xmm0
979 ; SSSE3-NEXT: psrlw $8, %xmm0
982 ; SSE41-LABEL: testv8i16:
984 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
985 ; SSE41-NEXT: paddw %xmm0, %xmm1
986 ; SSE41-NEXT: pandn %xmm1, %xmm0
987 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
988 ; SSE41-NEXT: movdqa %xmm0, %xmm2
989 ; SSE41-NEXT: pand %xmm1, %xmm2
990 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
991 ; SSE41-NEXT: movdqa %xmm3, %xmm4
992 ; SSE41-NEXT: pshufb %xmm2, %xmm4
993 ; SSE41-NEXT: psrlw $4, %xmm0
994 ; SSE41-NEXT: pand %xmm1, %xmm0
995 ; SSE41-NEXT: pshufb %xmm0, %xmm3
996 ; SSE41-NEXT: paddb %xmm4, %xmm3
997 ; SSE41-NEXT: movdqa %xmm3, %xmm0
998 ; SSE41-NEXT: psllw $8, %xmm0
999 ; SSE41-NEXT: paddb %xmm3, %xmm0
1000 ; SSE41-NEXT: psrlw $8, %xmm0
1003 ; AVX-LABEL: testv8i16:
1005 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1006 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1007 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1008 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1009 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1010 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1011 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1012 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1013 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1014 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1015 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1016 ; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
1017 ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
1018 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
1021 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
1022 ; AVX512VPOPCNTDQ: # %bb.0:
1023 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1024 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1025 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
1026 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1027 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1028 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
1029 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1030 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
1031 ; AVX512VPOPCNTDQ-NEXT: retq
1033 ; AVX512VPOPCNTDQVL-LABEL: testv8i16:
1034 ; AVX512VPOPCNTDQVL: # %bb.0:
1035 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1036 ; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1037 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
1038 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1039 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
1040 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
1041 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper
1042 ; AVX512VPOPCNTDQVL-NEXT: retq
1044 ; BITALG_NOVLX-LABEL: testv8i16:
1045 ; BITALG_NOVLX: # %bb.0:
1046 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1047 ; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1048 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1049 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
1050 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1051 ; BITALG_NOVLX-NEXT: vzeroupper
1052 ; BITALG_NOVLX-NEXT: retq
1054 ; BITALG-LABEL: testv8i16:
1056 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1057 ; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1058 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
1059 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0
1062 ; X32-SSE-LABEL: testv8i16:
1064 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
1065 ; X32-SSE-NEXT: paddw %xmm0, %xmm1
1066 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
1067 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1068 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1069 ; X32-SSE-NEXT: pand %xmm1, %xmm2
1070 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1071 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
1072 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
1073 ; X32-SSE-NEXT: psrlw $4, %xmm0
1074 ; X32-SSE-NEXT: pand %xmm1, %xmm0
1075 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
1076 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
1077 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0
1078 ; X32-SSE-NEXT: psllw $8, %xmm0
1079 ; X32-SSE-NEXT: paddb %xmm3, %xmm0
1080 ; X32-SSE-NEXT: psrlw $8, %xmm0
1081 ; X32-SSE-NEXT: retl
1082 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
; testv8i16u - same per-lane i16 trailing-zero count as a plain
; @llvm.cttz.v8i16 call, but the second operand is `i1 -1` (the all-ones i1,
; i.e. true; per the LLVM LangRef this permits an undefined result for a zero
; lane). NOTE(review): the "u" suffix presumably stands for "undef" - confirm.
; The expected code still forms ~%in & (%in - 1) with pcmpeqd / paddw / pandn
; and then takes a population count of it: the 0x33-mask bit trick on SSE2 and
; SSE3, the pshufb nibble lookup table on SSSE3 through AVX and on the 32-bit
; run, widen + vpopcntd + vpmovdw on the VPOPCNTDQ runs, and vpopcntw on the
; BITALG runs.
1086 define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
1087 ; SSE2-LABEL: testv8i16u:
1089 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1090 ; SSE2-NEXT: paddw %xmm0, %xmm1
1091 ; SSE2-NEXT: pandn %xmm1, %xmm0
1092 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1093 ; SSE2-NEXT: psrlw $1, %xmm1
1094 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1095 ; SSE2-NEXT: psubb %xmm1, %xmm0
1096 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1097 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1098 ; SSE2-NEXT: pand %xmm1, %xmm2
1099 ; SSE2-NEXT: psrlw $2, %xmm0
1100 ; SSE2-NEXT: pand %xmm1, %xmm0
1101 ; SSE2-NEXT: paddb %xmm2, %xmm0
1102 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1103 ; SSE2-NEXT: psrlw $4, %xmm1
1104 ; SSE2-NEXT: paddb %xmm0, %xmm1
1105 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1106 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1107 ; SSE2-NEXT: psllw $8, %xmm0
1108 ; SSE2-NEXT: paddb %xmm1, %xmm0
1109 ; SSE2-NEXT: psrlw $8, %xmm0
1112 ; SSE3-LABEL: testv8i16u:
1114 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
1115 ; SSE3-NEXT: paddw %xmm0, %xmm1
1116 ; SSE3-NEXT: pandn %xmm1, %xmm0
1117 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1118 ; SSE3-NEXT: psrlw $1, %xmm1
1119 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1120 ; SSE3-NEXT: psubb %xmm1, %xmm0
1121 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1122 ; SSE3-NEXT: movdqa %xmm0, %xmm2
1123 ; SSE3-NEXT: pand %xmm1, %xmm2
1124 ; SSE3-NEXT: psrlw $2, %xmm0
1125 ; SSE3-NEXT: pand %xmm1, %xmm0
1126 ; SSE3-NEXT: paddb %xmm2, %xmm0
1127 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1128 ; SSE3-NEXT: psrlw $4, %xmm1
1129 ; SSE3-NEXT: paddb %xmm0, %xmm1
1130 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1131 ; SSE3-NEXT: movdqa %xmm1, %xmm0
1132 ; SSE3-NEXT: psllw $8, %xmm0
1133 ; SSE3-NEXT: paddb %xmm1, %xmm0
1134 ; SSE3-NEXT: psrlw $8, %xmm0
1137 ; SSSE3-LABEL: testv8i16u:
1139 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
1140 ; SSSE3-NEXT: paddw %xmm0, %xmm1
1141 ; SSSE3-NEXT: pandn %xmm1, %xmm0
1142 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1143 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1144 ; SSSE3-NEXT: pand %xmm1, %xmm2
1145 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1146 ; SSSE3-NEXT: movdqa %xmm3, %xmm4
1147 ; SSSE3-NEXT: pshufb %xmm2, %xmm4
1148 ; SSSE3-NEXT: psrlw $4, %xmm0
1149 ; SSSE3-NEXT: pand %xmm1, %xmm0
1150 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1151 ; SSSE3-NEXT: paddb %xmm4, %xmm3
1152 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1153 ; SSSE3-NEXT: psllw $8, %xmm0
1154 ; SSSE3-NEXT: paddb %xmm3, %xmm0
1155 ; SSSE3-NEXT: psrlw $8, %xmm0
1158 ; SSE41-LABEL: testv8i16u:
1160 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
1161 ; SSE41-NEXT: paddw %xmm0, %xmm1
1162 ; SSE41-NEXT: pandn %xmm1, %xmm0
1163 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1164 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1165 ; SSE41-NEXT: pand %xmm1, %xmm2
1166 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1167 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1168 ; SSE41-NEXT: pshufb %xmm2, %xmm4
1169 ; SSE41-NEXT: psrlw $4, %xmm0
1170 ; SSE41-NEXT: pand %xmm1, %xmm0
1171 ; SSE41-NEXT: pshufb %xmm0, %xmm3
1172 ; SSE41-NEXT: paddb %xmm4, %xmm3
1173 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1174 ; SSE41-NEXT: psllw $8, %xmm0
1175 ; SSE41-NEXT: paddb %xmm3, %xmm0
1176 ; SSE41-NEXT: psrlw $8, %xmm0
1179 ; AVX-LABEL: testv8i16u:
1181 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1182 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1183 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1184 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1185 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1186 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1187 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1188 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1189 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1190 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1191 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1192 ; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
1193 ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
1194 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
1197 ; AVX512VPOPCNTDQ-LABEL: testv8i16u:
1198 ; AVX512VPOPCNTDQ: # %bb.0:
1199 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1200 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1201 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
1202 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1203 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1204 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
1205 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1206 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
1207 ; AVX512VPOPCNTDQ-NEXT: retq
1209 ; AVX512VPOPCNTDQVL-LABEL: testv8i16u:
1210 ; AVX512VPOPCNTDQVL: # %bb.0:
1211 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1212 ; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1213 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
1214 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1215 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
1216 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
1217 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper
1218 ; AVX512VPOPCNTDQVL-NEXT: retq
1220 ; BITALG_NOVLX-LABEL: testv8i16u:
1221 ; BITALG_NOVLX: # %bb.0:
1222 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1223 ; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1224 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1225 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
1226 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1227 ; BITALG_NOVLX-NEXT: vzeroupper
1228 ; BITALG_NOVLX-NEXT: retq
1230 ; BITALG-LABEL: testv8i16u:
1232 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1233 ; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1
1234 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
1235 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0
1238 ; X32-SSE-LABEL: testv8i16u:
1240 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
1241 ; X32-SSE-NEXT: paddw %xmm0, %xmm1
1242 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
1243 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1244 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1245 ; X32-SSE-NEXT: pand %xmm1, %xmm2
1246 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1247 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
1248 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4
1249 ; X32-SSE-NEXT: psrlw $4, %xmm0
1250 ; X32-SSE-NEXT: pand %xmm1, %xmm0
1251 ; X32-SSE-NEXT: pshufb %xmm0, %xmm3
1252 ; X32-SSE-NEXT: paddb %xmm4, %xmm3
1253 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0
1254 ; X32-SSE-NEXT: psllw $8, %xmm0
1255 ; X32-SSE-NEXT: paddb %xmm3, %xmm0
1256 ; X32-SSE-NEXT: psrlw $8, %xmm0
1257 ; X32-SSE-NEXT: retl
1258 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
; testv16i8 - count trailing zeros in every i8 lane of %in through
; @llvm.cttz.v16i8, with `i1 0` as the second operand (per the LLVM LangRef
; that flag states whether a zero input gives an undefined result; 0 means a
; zero lane must still produce a defined count).
; Each expected sequence first forms ~%in & (%in - 1) (pcmpeqd all-ones, a
; byte add of that -1, then pandn) and then takes a per-byte population count
; of the result:
;   * SSE2 and SSE3 use the shift/mask bit trick (51 = 0x33 byte mask); no
;     cross-byte fold follows because the lanes are already bytes.
;   * SSSE3, SSE4.1, AVX and the 32-bit run use a pshufb lookup into the
;     16-entry nibble popcount table and add the low and high nibble counts.
;   * Both VPOPCNTDQ runs zero-extend the 16 bytes to 16 dwords in zmm0
;     (vpmovzxbd), use vpopcntd and narrow back with vpmovdb.
;   * The BITALG runs use vpopcntb directly.
1262 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
1263 ; SSE2-LABEL: testv16i8:
1265 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1266 ; SSE2-NEXT: paddb %xmm0, %xmm1
1267 ; SSE2-NEXT: pandn %xmm1, %xmm0
1268 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1269 ; SSE2-NEXT: psrlw $1, %xmm1
1270 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1271 ; SSE2-NEXT: psubb %xmm1, %xmm0
1272 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1273 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1274 ; SSE2-NEXT: pand %xmm1, %xmm2
1275 ; SSE2-NEXT: psrlw $2, %xmm0
1276 ; SSE2-NEXT: pand %xmm1, %xmm0
1277 ; SSE2-NEXT: paddb %xmm2, %xmm0
1278 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1279 ; SSE2-NEXT: psrlw $4, %xmm1
1280 ; SSE2-NEXT: paddb %xmm0, %xmm1
1281 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1282 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1285 ; SSE3-LABEL: testv16i8:
1287 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
1288 ; SSE3-NEXT: paddb %xmm0, %xmm1
1289 ; SSE3-NEXT: pandn %xmm1, %xmm0
1290 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1291 ; SSE3-NEXT: psrlw $1, %xmm1
1292 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1293 ; SSE3-NEXT: psubb %xmm1, %xmm0
1294 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1295 ; SSE3-NEXT: movdqa %xmm0, %xmm2
1296 ; SSE3-NEXT: pand %xmm1, %xmm2
1297 ; SSE3-NEXT: psrlw $2, %xmm0
1298 ; SSE3-NEXT: pand %xmm1, %xmm0
1299 ; SSE3-NEXT: paddb %xmm2, %xmm0
1300 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1301 ; SSE3-NEXT: psrlw $4, %xmm1
1302 ; SSE3-NEXT: paddb %xmm0, %xmm1
1303 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1304 ; SSE3-NEXT: movdqa %xmm1, %xmm0
1307 ; SSSE3-LABEL: testv16i8:
1309 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
1310 ; SSSE3-NEXT: paddb %xmm0, %xmm1
1311 ; SSSE3-NEXT: pandn %xmm1, %xmm0
1312 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1313 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
1314 ; SSSE3-NEXT: pand %xmm2, %xmm3
1315 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1316 ; SSSE3-NEXT: movdqa %xmm1, %xmm4
1317 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1318 ; SSSE3-NEXT: psrlw $4, %xmm0
1319 ; SSSE3-NEXT: pand %xmm2, %xmm0
1320 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1321 ; SSSE3-NEXT: paddb %xmm4, %xmm1
1322 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1325 ; SSE41-LABEL: testv16i8:
1327 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
1328 ; SSE41-NEXT: paddb %xmm0, %xmm1
1329 ; SSE41-NEXT: pandn %xmm1, %xmm0
1330 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1331 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1332 ; SSE41-NEXT: pand %xmm2, %xmm3
1333 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1334 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1335 ; SSE41-NEXT: pshufb %xmm3, %xmm4
1336 ; SSE41-NEXT: psrlw $4, %xmm0
1337 ; SSE41-NEXT: pand %xmm2, %xmm0
1338 ; SSE41-NEXT: pshufb %xmm0, %xmm1
1339 ; SSE41-NEXT: paddb %xmm4, %xmm1
1340 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1343 ; AVX-LABEL: testv16i8:
1345 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1346 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1347 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1348 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1349 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1350 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1351 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1352 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1353 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1354 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1355 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1358 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
1359 ; AVX512VPOPCNTDQ: # %bb.0:
1360 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1361 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1362 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
1363 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1364 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1365 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
1366 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
1367 ; AVX512VPOPCNTDQ-NEXT: retq
1369 ; AVX512VPOPCNTDQVL-LABEL: testv16i8:
1370 ; AVX512VPOPCNTDQVL: # %bb.0:
1371 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1372 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1373 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
1374 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1375 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
1376 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
1377 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper
1378 ; AVX512VPOPCNTDQVL-NEXT: retq
1380 ; BITALG_NOVLX-LABEL: testv16i8:
1381 ; BITALG_NOVLX: # %bb.0:
1382 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1383 ; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1384 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1385 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
1386 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1387 ; BITALG_NOVLX-NEXT: vzeroupper
1388 ; BITALG_NOVLX-NEXT: retq
1390 ; BITALG-LABEL: testv16i8:
1392 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1393 ; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1394 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
1395 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
1398 ; X32-SSE-LABEL: testv16i8:
1400 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
1401 ; X32-SSE-NEXT: paddb %xmm0, %xmm1
1402 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
1403 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1404 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
1405 ; X32-SSE-NEXT: pand %xmm2, %xmm3
1406 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1407 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4
1408 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4
1409 ; X32-SSE-NEXT: psrlw $4, %xmm0
1410 ; X32-SSE-NEXT: pand %xmm2, %xmm0
1411 ; X32-SSE-NEXT: pshufb %xmm0, %xmm1
1412 ; X32-SSE-NEXT: paddb %xmm4, %xmm1
1413 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
1414 ; X32-SSE-NEXT: retl
1415 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
; testv16i8u - per-lane i8 trailing-zero count through @llvm.cttz.v16i8 with
; `i1 -1` as the second operand (the all-ones i1, i.e. true; per the LLVM
; LangRef this permits an undefined result for a zero lane).
; NOTE(review): the "u" suffix presumably stands for "undef" - confirm.
; The expected code forms ~%in & (%in - 1) with pcmpeqd / paddb / pandn and
; then takes a per-byte population count of it: the 0x33-mask bit trick on
; SSE2 and SSE3, the pshufb nibble lookup table on SSSE3 through AVX and on the
; 32-bit run, vpmovzxbd + vpopcntd + vpmovdb on the VPOPCNTDQ runs, and
; vpopcntb on the BITALG runs.
1419 define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
1420 ; SSE2-LABEL: testv16i8u:
1422 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1423 ; SSE2-NEXT: paddb %xmm0, %xmm1
1424 ; SSE2-NEXT: pandn %xmm1, %xmm0
1425 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1426 ; SSE2-NEXT: psrlw $1, %xmm1
1427 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1428 ; SSE2-NEXT: psubb %xmm1, %xmm0
1429 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1430 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1431 ; SSE2-NEXT: pand %xmm1, %xmm2
1432 ; SSE2-NEXT: psrlw $2, %xmm0
1433 ; SSE2-NEXT: pand %xmm1, %xmm0
1434 ; SSE2-NEXT: paddb %xmm2, %xmm0
1435 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1436 ; SSE2-NEXT: psrlw $4, %xmm1
1437 ; SSE2-NEXT: paddb %xmm0, %xmm1
1438 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1439 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1442 ; SSE3-LABEL: testv16i8u:
1444 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
1445 ; SSE3-NEXT: paddb %xmm0, %xmm1
1446 ; SSE3-NEXT: pandn %xmm1, %xmm0
1447 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1448 ; SSE3-NEXT: psrlw $1, %xmm1
1449 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1450 ; SSE3-NEXT: psubb %xmm1, %xmm0
1451 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1452 ; SSE3-NEXT: movdqa %xmm0, %xmm2
1453 ; SSE3-NEXT: pand %xmm1, %xmm2
1454 ; SSE3-NEXT: psrlw $2, %xmm0
1455 ; SSE3-NEXT: pand %xmm1, %xmm0
1456 ; SSE3-NEXT: paddb %xmm2, %xmm0
1457 ; SSE3-NEXT: movdqa %xmm0, %xmm1
1458 ; SSE3-NEXT: psrlw $4, %xmm1
1459 ; SSE3-NEXT: paddb %xmm0, %xmm1
1460 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
1461 ; SSE3-NEXT: movdqa %xmm1, %xmm0
1464 ; SSSE3-LABEL: testv16i8u:
1466 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
1467 ; SSSE3-NEXT: paddb %xmm0, %xmm1
1468 ; SSSE3-NEXT: pandn %xmm1, %xmm0
1469 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1470 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
1471 ; SSSE3-NEXT: pand %xmm2, %xmm3
1472 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1473 ; SSSE3-NEXT: movdqa %xmm1, %xmm4
1474 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1475 ; SSSE3-NEXT: psrlw $4, %xmm0
1476 ; SSSE3-NEXT: pand %xmm2, %xmm0
1477 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1478 ; SSSE3-NEXT: paddb %xmm4, %xmm1
1479 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1482 ; SSE41-LABEL: testv16i8u:
1484 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
1485 ; SSE41-NEXT: paddb %xmm0, %xmm1
1486 ; SSE41-NEXT: pandn %xmm1, %xmm0
1487 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1488 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1489 ; SSE41-NEXT: pand %xmm2, %xmm3
1490 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1491 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1492 ; SSE41-NEXT: pshufb %xmm3, %xmm4
1493 ; SSE41-NEXT: psrlw $4, %xmm0
1494 ; SSE41-NEXT: pand %xmm2, %xmm0
1495 ; SSE41-NEXT: pshufb %xmm0, %xmm1
1496 ; SSE41-NEXT: paddb %xmm4, %xmm1
1497 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1500 ; AVX-LABEL: testv16i8u:
1502 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1503 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1504 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1505 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1506 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1507 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1508 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1509 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1510 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1511 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1512 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1515 ; AVX512VPOPCNTDQ-LABEL: testv16i8u:
1516 ; AVX512VPOPCNTDQ: # %bb.0:
1517 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1518 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1519 ; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0
1520 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1521 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1522 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
1523 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
1524 ; AVX512VPOPCNTDQ-NEXT: retq
1526 ; AVX512VPOPCNTDQVL-LABEL: testv16i8u:
1527 ; AVX512VPOPCNTDQVL: # %bb.0:
1528 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1529 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1530 ; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
1531 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1532 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
1533 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
1534 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper
1535 ; AVX512VPOPCNTDQVL-NEXT: retq
1537 ; BITALG_NOVLX-LABEL: testv16i8u:
1538 ; BITALG_NOVLX: # %bb.0:
1539 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1540 ; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1541 ; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1542 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
1543 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1544 ; BITALG_NOVLX-NEXT: vzeroupper
1545 ; BITALG_NOVLX-NEXT: retq
1547 ; BITALG-LABEL: testv16i8u:
1549 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1550 ; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1
1551 ; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0
1552 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
1555 ; X32-SSE-LABEL: testv16i8u:
1557 ; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
1558 ; X32-SSE-NEXT: paddb %xmm0, %xmm1
1559 ; X32-SSE-NEXT: pandn %xmm1, %xmm0
1560 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1561 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
1562 ; X32-SSE-NEXT: pand %xmm2, %xmm3
1563 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1564 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4
1565 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4
1566 ; X32-SSE-NEXT: psrlw $4, %xmm0
1567 ; X32-SSE-NEXT: pand %xmm2, %xmm0
1568 ; X32-SSE-NEXT: pshufb %xmm0, %xmm1
1569 ; X32-SSE-NEXT: paddb %xmm4, %xmm1
1570 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
1571 ; X32-SSE-NEXT: retl
1572 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
; foldv2i64 - constant-folding check: @llvm.cttz.v2i64 is applied to the
; constant vector <i64 256, i64 -1> with `i1 0` as the second operand.
; The expected output contains no counting instructions. Every configuration
; only materializes the value 8 in %eax (256 = 1 << 8, so it has eight
; trailing zero bits) and moves it into xmm0; the second element, cttz(-1),
; is 0. The 64-bit runs move %rax with movq/vmovq, while the i686 run moves
; only %eax with movd.
1576 define <2 x i64> @foldv2i64() nounwind {
1577 ; SSE-LABEL: foldv2i64:
1579 ; SSE-NEXT: movl $8, %eax
1580 ; SSE-NEXT: movq %rax, %xmm0
1583 ; AVX-LABEL: foldv2i64:
1585 ; AVX-NEXT: movl $8, %eax
1586 ; AVX-NEXT: vmovq %rax, %xmm0
1589 ; AVX512VPOPCNTDQ-LABEL: foldv2i64:
1590 ; AVX512VPOPCNTDQ: # %bb.0:
1591 ; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
1592 ; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
1593 ; AVX512VPOPCNTDQ-NEXT: retq
1595 ; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
1596 ; AVX512VPOPCNTDQVL: # %bb.0:
1597 ; AVX512VPOPCNTDQVL-NEXT: movl $8, %eax
1598 ; AVX512VPOPCNTDQVL-NEXT: vmovq %rax, %xmm0
1599 ; AVX512VPOPCNTDQVL-NEXT: retq
1601 ; BITALG_NOVLX-LABEL: foldv2i64:
1602 ; BITALG_NOVLX: # %bb.0:
1603 ; BITALG_NOVLX-NEXT: movl $8, %eax
1604 ; BITALG_NOVLX-NEXT: vmovq %rax, %xmm0
1605 ; BITALG_NOVLX-NEXT: retq
1607 ; BITALG-LABEL: foldv2i64:
1609 ; BITALG-NEXT: movl $8, %eax
1610 ; BITALG-NEXT: vmovq %rax, %xmm0
1613 ; X32-SSE-LABEL: foldv2i64:
1615 ; X32-SSE-NEXT: movl $8, %eax
1616 ; X32-SSE-NEXT: movd %eax, %xmm0
1617 ; X32-SSE-NEXT: retl
1618 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
; Constant-folding test: cttz over the constant vector <2 x i64> <256, -1>
; with the i1 flag operand set to -1 (true). Neither input element is zero,
; and the assertions below expect the same folded code in every
; configuration: the immediate 8 moved from a general-purpose register into
; xmm0, with no counting instructions.
1622 define <2 x i64> @foldv2i64u() nounwind {
1623 ; SSE-LABEL: foldv2i64u:
1625 ; SSE-NEXT: movl $8, %eax
1626 ; SSE-NEXT: movq %rax, %xmm0
1629 ; AVX-LABEL: foldv2i64u:
1631 ; AVX-NEXT: movl $8, %eax
1632 ; AVX-NEXT: vmovq %rax, %xmm0
1635 ; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
1636 ; AVX512VPOPCNTDQ: # %bb.0:
1637 ; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
1638 ; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
1639 ; AVX512VPOPCNTDQ-NEXT: retq
1641 ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
1642 ; AVX512VPOPCNTDQVL: # %bb.0:
1643 ; AVX512VPOPCNTDQVL-NEXT: movl $8, %eax
1644 ; AVX512VPOPCNTDQVL-NEXT: vmovq %rax, %xmm0
1645 ; AVX512VPOPCNTDQVL-NEXT: retq
1647 ; BITALG_NOVLX-LABEL: foldv2i64u:
1648 ; BITALG_NOVLX: # %bb.0:
1649 ; BITALG_NOVLX-NEXT: movl $8, %eax
1650 ; BITALG_NOVLX-NEXT: vmovq %rax, %xmm0
1651 ; BITALG_NOVLX-NEXT: retq
1653 ; BITALG-LABEL: foldv2i64u:
1655 ; BITALG-NEXT: movl $8, %eax
1656 ; BITALG-NEXT: vmovq %rax, %xmm0
1659 ; X32-SSE-LABEL: foldv2i64u:
1661 ; X32-SSE-NEXT: movl $8, %eax
1662 ; X32-SSE-NEXT: movd %eax, %xmm0
1663 ; X32-SSE-NEXT: retl
1664 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
; Constant-folding test: cttz over the constant vector
; <4 x i32> <256, -1, 0, 255> with the i1 flag operand set to 0. The
; assertions below expect a single load of the precomputed constant
; [8,0,32,0] into xmm0 in every configuration; the zero input element maps
; to 32, the element bit width.
1668 define <4 x i32> @foldv4i32() nounwind {
1669 ; SSE-LABEL: foldv4i32:
1671 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
1674 ; AVX-LABEL: foldv4i32:
1676 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1679 ; AVX512VPOPCNTDQ-LABEL: foldv4i32:
1680 ; AVX512VPOPCNTDQ: # %bb.0:
1681 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1682 ; AVX512VPOPCNTDQ-NEXT: retq
1684 ; AVX512VPOPCNTDQVL-LABEL: foldv4i32:
1685 ; AVX512VPOPCNTDQVL: # %bb.0:
1686 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1687 ; AVX512VPOPCNTDQVL-NEXT: retq
1689 ; BITALG_NOVLX-LABEL: foldv4i32:
1690 ; BITALG_NOVLX: # %bb.0:
1691 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1692 ; BITALG_NOVLX-NEXT: retq
1694 ; BITALG-LABEL: foldv4i32:
1696 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1699 ; X32-SSE-LABEL: foldv4i32:
1701 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
1702 ; X32-SSE-NEXT: retl
1703 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
; Constant-folding test: cttz over the constant vector
; <4 x i32> <256, -1, 0, 255> with the i1 flag operand set to -1 (true). The
; assertions below expect a single load of the precomputed constant
; [8,0,32,0] into xmm0 in every configuration.
; NOTE(review): with the flag set, the intrinsic does not define a result for
; the zero input element; the 32 pinned for that lane is just what the
; generated assertions recorded - confirm against the LangRef before relying
; on it.
1707 define <4 x i32> @foldv4i32u() nounwind {
1708 ; SSE-LABEL: foldv4i32u:
1710 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
1713 ; AVX-LABEL: foldv4i32u:
1715 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1718 ; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
1719 ; AVX512VPOPCNTDQ: # %bb.0:
1720 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1721 ; AVX512VPOPCNTDQ-NEXT: retq
1723 ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u:
1724 ; AVX512VPOPCNTDQVL: # %bb.0:
1725 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1726 ; AVX512VPOPCNTDQVL-NEXT: retq
1728 ; BITALG_NOVLX-LABEL: foldv4i32u:
1729 ; BITALG_NOVLX: # %bb.0:
1730 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1731 ; BITALG_NOVLX-NEXT: retq
1733 ; BITALG-LABEL: foldv4i32u:
1735 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1738 ; X32-SSE-LABEL: foldv4i32u:
1740 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
1741 ; X32-SSE-NEXT: retl
1742 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
; Constant-folding test: cttz over a constant <8 x i16> vector with the i1
; flag operand set to 0. The assertions below expect a single load of the
; precomputed constant [8,0,16,0,16,0,3,3] into xmm0 in every configuration;
; the explicit zero element maps to 16, the element bit width.
; NOTE(review): the literal i16 -65536 does not fit in 16 bits. The expected
; value 16 for that lane is consistent with it being read as 0 - confirm how
; the IR parser handles out-of-range integer literals.
1746 define <8 x i16> @foldv8i16() nounwind {
1747 ; SSE-LABEL: foldv8i16:
1749 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1752 ; AVX-LABEL: foldv8i16:
1754 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1757 ; AVX512VPOPCNTDQ-LABEL: foldv8i16:
1758 ; AVX512VPOPCNTDQ: # %bb.0:
1759 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1760 ; AVX512VPOPCNTDQ-NEXT: retq
1762 ; AVX512VPOPCNTDQVL-LABEL: foldv8i16:
1763 ; AVX512VPOPCNTDQVL: # %bb.0:
1764 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1765 ; AVX512VPOPCNTDQVL-NEXT: retq
1767 ; BITALG_NOVLX-LABEL: foldv8i16:
1768 ; BITALG_NOVLX: # %bb.0:
1769 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1770 ; BITALG_NOVLX-NEXT: retq
1772 ; BITALG-LABEL: foldv8i16:
1774 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1777 ; X32-SSE-LABEL: foldv8i16:
1779 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1780 ; X32-SSE-NEXT: retl
1781 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
; Constant-folding test: cttz over a constant <8 x i16> vector with the i1
; flag operand set to -1 (true). The assertions below expect a single load of
; the precomputed constant [8,0,16,0,16,0,3,3] into xmm0 in every
; configuration.
; NOTE(review): with the flag set, the intrinsic does not define a result for
; zero input elements; the 16 pinned for those lanes is just what the
; generated assertions recorded. The literal i16 -65536 also does not fit in
; 16 bits; its expected value 16 is consistent with it being read as 0 -
; confirm how the IR parser handles out-of-range integer literals.
1785 define <8 x i16> @foldv8i16u() nounwind {
1786 ; SSE-LABEL: foldv8i16u:
1788 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1791 ; AVX-LABEL: foldv8i16u:
1793 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1796 ; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
1797 ; AVX512VPOPCNTDQ: # %bb.0:
1798 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1799 ; AVX512VPOPCNTDQ-NEXT: retq
1801 ; AVX512VPOPCNTDQVL-LABEL: foldv8i16u:
1802 ; AVX512VPOPCNTDQVL: # %bb.0:
1803 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1804 ; AVX512VPOPCNTDQVL-NEXT: retq
1806 ; BITALG_NOVLX-LABEL: foldv8i16u:
1807 ; BITALG_NOVLX: # %bb.0:
1808 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1809 ; BITALG_NOVLX-NEXT: retq
1811 ; BITALG-LABEL: foldv8i16u:
1813 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1816 ; X32-SSE-LABEL: foldv8i16u:
1818 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1819 ; X32-SSE-NEXT: retl
1820 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
; Constant-folding test: cttz over a constant <16 x i8> vector with the i1
; flag operand set to 0. The assertions below expect a single load of the
; precomputed constant [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] into xmm0 in every
; configuration; the explicit zero element maps to 8, the element bit width.
; NOTE(review): the literals i8 256 and i8 -65536 do not fit in 8 bits. The
; expected value 8 for those lanes is consistent with them being read as 0 -
; confirm how the IR parser handles out-of-range integer literals.
1824 define <16 x i8> @foldv16i8() nounwind {
1825 ; SSE-LABEL: foldv16i8:
1827 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1830 ; AVX-LABEL: foldv16i8:
1832 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1835 ; AVX512VPOPCNTDQ-LABEL: foldv16i8:
1836 ; AVX512VPOPCNTDQ: # %bb.0:
1837 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1838 ; AVX512VPOPCNTDQ-NEXT: retq
1840 ; AVX512VPOPCNTDQVL-LABEL: foldv16i8:
1841 ; AVX512VPOPCNTDQVL: # %bb.0:
1842 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1843 ; AVX512VPOPCNTDQVL-NEXT: retq
1845 ; BITALG_NOVLX-LABEL: foldv16i8:
1846 ; BITALG_NOVLX: # %bb.0:
1847 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1848 ; BITALG_NOVLX-NEXT: retq
1850 ; BITALG-LABEL: foldv16i8:
1852 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1855 ; X32-SSE-LABEL: foldv16i8:
1857 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1858 ; X32-SSE-NEXT: retl
1859 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
; Constant-folding test: cttz over a constant <16 x i8> vector with the i1
; flag operand set to -1 (true). The assertions below expect a single load of
; the precomputed constant [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] into xmm0 in
; every configuration.
; NOTE(review): with the flag set, the intrinsic does not define a result for
; zero input elements; the 8 pinned for those lanes is just what the
; generated assertions recorded. The literals i8 256 and i8 -65536 also do
; not fit in 8 bits; their expected value 8 is consistent with them being
; read as 0 - confirm how the IR parser handles out-of-range integer
; literals.
1863 define <16 x i8> @foldv16i8u() nounwind {
1864 ; SSE-LABEL: foldv16i8u:
1866 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1869 ; AVX-LABEL: foldv16i8u:
1871 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1874 ; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
1875 ; AVX512VPOPCNTDQ: # %bb.0:
1876 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1877 ; AVX512VPOPCNTDQ-NEXT: retq
1879 ; AVX512VPOPCNTDQVL-LABEL: foldv16i8u:
1880 ; AVX512VPOPCNTDQVL: # %bb.0:
1881 ; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1882 ; AVX512VPOPCNTDQVL-NEXT: retq
1884 ; BITALG_NOVLX-LABEL: foldv16i8u:
1885 ; BITALG_NOVLX: # %bb.0:
1886 ; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1887 ; BITALG_NOVLX-NEXT: retq
1889 ; BITALG-LABEL: foldv16i8u:
1891 ; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1894 ; X32-SSE-LABEL: foldv16i8u:
1896 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1897 ; X32-SSE-NEXT: retl
1898 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
; Declarations of the four 128-bit vector overloads of the cttz intrinsic
; called by the tests in this file. Each takes the vector operand plus an i1
; flag and returns a vector of the same type as its operand.
1902 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
1903 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
1904 declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
1905 declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)