1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512VPOPCNTDQ,AVX512VPOPCNTDQ-NOBW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512bw | FileCheck %s --check-prefixes=AVX512VPOPCNTDQ,AVX512VPOPCNTDQ-BW
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=BITALG
; testv8i64 - population count of each 64-bit lane of an 8 x i64 vector via
; the llvm.ctpop.v8i64 intrinsic. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects a single vpopcntq.
;   * BITALG prefix expects vpopcntb, then vpsadbw against a zeroed register.
;   * AVX512BW prefix expects a nibble lookup (mask 15, vpshufb on the
;     [0,1,1,2,...] table), vpaddb, then vpsadbw, all at zmm width.
;   * AVX512F prefix expects the same sequence on two ymm halves, rejoined
;     with vinserti64x4.
8 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
9 ; AVX512F-LABEL: testv8i64:
11 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
13 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
14 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
15 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
16 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
17 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
18 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
19 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
20 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
21 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
22 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
23 ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
24 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
25 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
26 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
27 ; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
28 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
29 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
32 ; AVX512BW-LABEL: testv8i64:
34 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
35 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
36 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
37 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
38 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
39 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
40 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
41 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
42 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
43 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
46 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
47 ; AVX512VPOPCNTDQ: # %bb.0:
48 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
49 ; AVX512VPOPCNTDQ-NEXT: retq
51 ; BITALG-LABEL: testv8i64:
53 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0
54 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
55 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
57 %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
; testv16i32 - population count of each 32-bit lane of a 16 x i32 vector via
; the llvm.ctpop.v16i32 intrinsic. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects a single vpopcntd.
;   * BITALG prefix expects vpopcntb for the per-byte counts, then
;     vpunpckhdq/vpunpckldq with zero, vpsadbw and vpackuswb to form the
;     per-dword sums.
;   * AVX512BW prefix expects the nibble lookup (vpshufb) for the per-byte
;     counts followed by the same unpack/vpsadbw/vpackuswb widening.
;   * AVX512F prefix expects that sequence on two ymm halves, rejoined with
;     vinserti64x4.
61 define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
62 ; AVX512F-LABEL: testv16i32:
64 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
65 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
66 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
67 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
68 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
69 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
70 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
71 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
72 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
73 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
74 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
75 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
76 ; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
77 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
78 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
79 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
80 ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
81 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
82 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
83 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
84 ; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
85 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
86 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2
87 ; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
88 ; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
89 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
90 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
93 ; AVX512BW-LABEL: testv16i32:
95 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
96 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
97 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
98 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
99 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
100 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
101 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
102 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
103 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
104 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
105 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
106 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
107 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
108 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
109 ; AVX512BW-NEXT: retq
111 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
112 ; AVX512VPOPCNTDQ: # %bb.0:
113 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
114 ; AVX512VPOPCNTDQ-NEXT: retq
116 ; BITALG-LABEL: testv16i32:
118 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0
119 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
120 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
121 ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
122 ; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
123 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
124 ; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
126 %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
; testv32i16 - population count of each 16-bit lane of a 32 x i16 vector via
; the llvm.ctpop.v32i16 intrinsic. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * BITALG prefix expects a single vpopcntw.
;   * AVX512VPOPCNTDQ-NOBW prefix expects each ymm half to be zero-extended
;     to dwords (vpmovzxwd), counted with vpopcntd, and truncated back with
;     vpmovdw before the halves are rejoined.
;   * AVX512BW and AVX512VPOPCNTDQ-BW prefixes expect the nibble lookup
;     (vpshufb) at zmm width, then vpsllw $8 / vpaddb / vpsrlw $8 to add the
;     two byte counts of every word.
;   * AVX512F prefix expects that same sequence on two ymm halves.
130 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
131 ; AVX512F-LABEL: testv32i16:
133 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
134 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
135 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
136 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
137 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
138 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
139 ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
140 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
141 ; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4
142 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
143 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
144 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
145 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4
146 ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
147 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
148 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
149 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
150 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
151 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
152 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
153 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
154 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
157 ; AVX512BW-LABEL: testv32i16:
159 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
160 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
161 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
162 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
163 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
164 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
165 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
166 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
167 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
168 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
169 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
170 ; AVX512BW-NEXT: retq
172 ; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
173 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
174 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
175 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
176 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
177 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
178 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
179 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
180 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
181 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
182 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
184 ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
185 ; AVX512VPOPCNTDQ-BW: # %bb.0:
186 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
187 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
188 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
189 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
190 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
191 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
192 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
193 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
194 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
195 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
196 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
197 ; AVX512VPOPCNTDQ-BW-NEXT: retq
199 ; BITALG-LABEL: testv32i16:
201 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
203 %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
; testv64i8 - population count of each byte of a 64 x i8 vector via the
; llvm.ctpop.v64i8 intrinsic. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * BITALG prefix expects a single vpopcntb.
;   * AVX512BW and AVX512VPOPCNTDQ-BW prefixes expect the nibble lookup
;     (mask 15, vpshufb on the [0,1,1,2,...] table, vpaddb) at zmm width.
;   * AVX512F and AVX512VPOPCNTDQ-NOBW prefixes expect the same lookup on two
;     ymm halves, rejoined with vinserti64x4.
207 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
208 ; AVX512F-LABEL: testv64i8:
210 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
211 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
212 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
213 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
214 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
215 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
216 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
217 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
218 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
219 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
220 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
221 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
222 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
223 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
224 ; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
225 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
228 ; AVX512BW-LABEL: testv64i8:
230 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
231 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
232 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
233 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
234 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
235 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
236 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
237 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
238 ; AVX512BW-NEXT: retq
240 ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
241 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
242 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
243 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
244 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3
245 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
246 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
247 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1
248 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
249 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1
250 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1
251 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
252 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
253 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0
254 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
255 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
256 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0
257 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
258 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
260 ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
261 ; AVX512VPOPCNTDQ-BW: # %bb.0:
262 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
263 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
264 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
265 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
266 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
267 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
268 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
269 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
270 ; AVX512VPOPCNTDQ-BW-NEXT: retq
272 ; BITALG-LABEL: testv64i8:
274 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0
276 %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
; eq_1_v8i64 - per-lane test "ctpop(x) == 1" on 8 x i64, with the i1 result
; sign-extended back to i64 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects vpopcntq followed by vpcmpeqq against a
;     broadcast constant-pool value.
;   * AVX512F, AVX512BW and BITALG prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestnmq to select lanes where
;     x & (x - 1) is zero, and mask that with vptestmq to require x != 0.
;   * Every variant materialises the lanes with a zero-masked vpternlogq $255.
280 define <8 x i64> @eq_1_v8i64(<8 x i64> %0) {
281 ; AVX512F-LABEL: eq_1_v8i64:
283 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
284 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
285 ; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1
286 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
287 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
290 ; AVX512BW-LABEL: eq_1_v8i64:
292 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
293 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
294 ; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1
295 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
296 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
297 ; AVX512BW-NEXT: retq
299 ; AVX512VPOPCNTDQ-LABEL: eq_1_v8i64:
300 ; AVX512VPOPCNTDQ: # %bb.0:
301 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
302 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
303 ; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
304 ; AVX512VPOPCNTDQ-NEXT: retq
306 ; BITALG-LABEL: eq_1_v8i64:
308 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
309 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1
310 ; BITALG-NEXT: vptestnmq %zmm1, %zmm0, %k1
311 ; BITALG-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
312 ; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
314 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
315 %3 = icmp eq <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
316 %4 = sext <8 x i1> %3 to <8 x i64>
; ne_1_v8i64 - per-lane test "ctpop(x) != 1" on 8 x i64, with the i1 result
; sign-extended back to i64 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects vpopcntq followed by vpcmpneqq against a
;     broadcast constant-pool value.
;   * AVX512F, AVX512BW and BITALG prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestmq to select lanes where
;     x & (x - 1) is non-zero, use vptestnmq to select lanes where x is zero,
;     and OR the two masks with korw.
;   * Every variant materialises the lanes with a zero-masked vpternlogq $255.
320 define <8 x i64> @ne_1_v8i64(<8 x i64> %0) {
321 ; AVX512F-LABEL: ne_1_v8i64:
323 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
324 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
325 ; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k0
326 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k1
327 ; AVX512F-NEXT: korw %k0, %k1, %k1
328 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
331 ; AVX512BW-LABEL: ne_1_v8i64:
333 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
334 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
335 ; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k0
336 ; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k1
337 ; AVX512BW-NEXT: korw %k0, %k1, %k1
338 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
339 ; AVX512BW-NEXT: retq
341 ; AVX512VPOPCNTDQ-LABEL: ne_1_v8i64:
342 ; AVX512VPOPCNTDQ: # %bb.0:
343 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
344 ; AVX512VPOPCNTDQ-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
345 ; AVX512VPOPCNTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
346 ; AVX512VPOPCNTDQ-NEXT: retq
348 ; BITALG-LABEL: ne_1_v8i64:
350 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
351 ; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1
352 ; BITALG-NEXT: vptestmq %zmm1, %zmm0, %k0
353 ; BITALG-NEXT: vptestnmq %zmm0, %zmm0, %k1
354 ; BITALG-NEXT: korw %k0, %k1, %k1
355 ; BITALG-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
357 %2 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %0)
358 %3 = icmp ne <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
359 %4 = sext <8 x i1> %3 to <8 x i64>
; eq_1_v16i32 - per-lane test "ctpop(x) == 1" on 16 x i32, with the i1 result
; sign-extended back to i32 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects vpopcntd followed by vpcmpeqd against a
;     broadcast constant-pool value.
;   * AVX512F, AVX512BW and BITALG prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestnmd to select lanes where
;     x & (x - 1) is zero, and mask that with vptestmd to require x != 0.
;   * Every variant materialises the lanes with a zero-masked vpternlogd $255.
363 define <16 x i32> @eq_1_v16i32(<16 x i32> %0) {
364 ; AVX512F-LABEL: eq_1_v16i32:
366 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
367 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
368 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1
369 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
370 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
373 ; AVX512BW-LABEL: eq_1_v16i32:
375 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
376 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
377 ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1
378 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
379 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
380 ; AVX512BW-NEXT: retq
382 ; AVX512VPOPCNTDQ-LABEL: eq_1_v16i32:
383 ; AVX512VPOPCNTDQ: # %bb.0:
384 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
385 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
386 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
387 ; AVX512VPOPCNTDQ-NEXT: retq
389 ; BITALG-LABEL: eq_1_v16i32:
391 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
392 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1
393 ; BITALG-NEXT: vptestnmd %zmm1, %zmm0, %k1
394 ; BITALG-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
395 ; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
397 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
398 %3 = icmp eq <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
399 %4 = sext <16 x i1> %3 to <16 x i32>
; ne_1_v16i32 - per-lane test "ctpop(x) != 1" on 16 x i32, with the i1 result
; sign-extended back to i32 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * AVX512VPOPCNTDQ prefix expects vpopcntd followed by vpcmpneqd against a
;     broadcast constant-pool value.
;   * AVX512F, AVX512BW and BITALG prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestmd to select lanes where
;     x & (x - 1) is non-zero, use vptestnmd to select lanes where x is zero,
;     and OR the two masks with korw.
;   * Every variant materialises the lanes with a zero-masked vpternlogd $255.
403 define <16 x i32> @ne_1_v16i32(<16 x i32> %0) {
404 ; AVX512F-LABEL: ne_1_v16i32:
406 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
407 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
408 ; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k0
409 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
410 ; AVX512F-NEXT: korw %k0, %k1, %k1
411 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
414 ; AVX512BW-LABEL: ne_1_v16i32:
416 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
417 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
418 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k0
419 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1
420 ; AVX512BW-NEXT: korw %k0, %k1, %k1
421 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
422 ; AVX512BW-NEXT: retq
424 ; AVX512VPOPCNTDQ-LABEL: ne_1_v16i32:
425 ; AVX512VPOPCNTDQ: # %bb.0:
426 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
427 ; AVX512VPOPCNTDQ-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
428 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
429 ; AVX512VPOPCNTDQ-NEXT: retq
431 ; BITALG-LABEL: ne_1_v16i32:
433 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
434 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1
435 ; BITALG-NEXT: vptestmd %zmm1, %zmm0, %k0
436 ; BITALG-NEXT: vptestnmd %zmm0, %zmm0, %k1
437 ; BITALG-NEXT: korw %k0, %k1, %k1
438 ; BITALG-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
440 %2 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %0)
441 %3 = icmp ne <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
442 %4 = sext <16 x i1> %3 to <16 x i32>
; eq_1_v32i16 - per-lane test "ctpop(x) == 1" on 32 x i16, with the i1 result
; sign-extended back to i16 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * BITALG prefix expects vpopcntw, vpcmpeqw against a constant-pool vector,
;     then vpmovm2w to expand the mask.
;   * AVX512BW and AVX512VPOPCNTDQ-BW prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestnmw for x & (x - 1) == 0
;     masked with vptestmw for x != 0, then vpmovm2w.
;   * AVX512F and AVX512VPOPCNTDQ-NOBW prefixes expect the work on two ymm
;     halves with vector compares against zero (vpcmpeqw) instead of mask
;     registers. The "x == 0" result is removed from the "x & (x - 1) == 0"
;     result with vpandnq.
446 define <32 x i16> @eq_1_v32i16(<32 x i16> %0) {
447 ; AVX512F-LABEL: eq_1_v32i16:
449 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
450 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
451 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm3
452 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4
453 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
454 ; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
455 ; AVX512F-NEXT: vpaddw %ymm4, %ymm1, %ymm5
456 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
457 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
458 ; AVX512F-NEXT: vpaddw %ymm4, %ymm0, %ymm4
459 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
460 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
461 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
462 ; AVX512F-NEXT: vpandnq %zmm0, %zmm3, %zmm0
465 ; AVX512BW-LABEL: eq_1_v32i16:
467 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
468 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
469 ; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k1
470 ; AVX512BW-NEXT: vptestmw %zmm0, %zmm0, %k0 {%k1}
471 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
472 ; AVX512BW-NEXT: retq
474 ; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v32i16:
475 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
476 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
477 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
478 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm3
479 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4
480 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
481 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
482 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm4, %ymm1, %ymm5
483 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm5, %ymm1, %ymm1
484 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
485 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm4, %ymm0, %ymm4
486 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm4, %ymm0, %ymm0
487 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
488 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
489 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpandnq %zmm0, %zmm3, %zmm0
490 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
492 ; AVX512VPOPCNTDQ-BW-LABEL: eq_1_v32i16:
493 ; AVX512VPOPCNTDQ-BW: # %bb.0:
494 ; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
495 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
496 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmw %zmm1, %zmm0, %k1
497 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmw %zmm0, %zmm0, %k0 {%k1}
498 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
499 ; AVX512VPOPCNTDQ-BW-NEXT: retq
501 ; BITALG-LABEL: eq_1_v32i16:
503 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
504 ; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
505 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
507 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
508 %3 = icmp eq <32 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
509 %4 = sext <32 x i1> %3 to <32 x i16>
; ne_1_v32i16 - per-lane test "ctpop(x) != 1" on 32 x i16, with the i1 result
; sign-extended back to i16 lanes. Expected lowering per check prefix
; (assertions below are autogenerated; regenerate them, do not hand-edit)
;   * BITALG prefix expects vpopcntw, vpcmpneqw against a constant-pool
;     vector, then vpmovm2w to expand the mask.
;   * AVX512BW and AVX512VPOPCNTDQ-BW prefixes expect no popcount instruction.
;     They add an all-ones vector (x - 1), use vptestmw for x & (x - 1) != 0
;     and vptestnmw for x == 0, combine the masks with kord, then vpmovm2w.
;   * AVX512F and AVX512VPOPCNTDQ-NOBW prefixes expect the work on two ymm
;     halves with vector compares against zero (vpcmpeqw). The two compare
;     results are merged by a single vpternlogq $222 on an all-ones register.
513 define <32 x i16> @ne_1_v32i16(<32 x i16> %0) {
514 ; AVX512F-LABEL: ne_1_v32i16:
516 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
517 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
518 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm3
519 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4
520 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
521 ; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
522 ; AVX512F-NEXT: vpaddw %ymm4, %ymm1, %ymm5
523 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
524 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
525 ; AVX512F-NEXT: vpaddw %ymm4, %ymm0, %ymm4
526 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
527 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
528 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
529 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
530 ; AVX512F-NEXT: vpternlogq $222, %zmm1, %zmm3, %zmm0
533 ; AVX512BW-LABEL: ne_1_v32i16:
535 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
536 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
537 ; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
538 ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k1
539 ; AVX512BW-NEXT: kord %k0, %k1, %k0
540 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
541 ; AVX512BW-NEXT: retq
543 ; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v32i16:
544 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
545 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
546 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
547 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm3
548 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4
549 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
550 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
551 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm4, %ymm1, %ymm5
552 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm5, %ymm1, %ymm1
553 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
554 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm4, %ymm0, %ymm4
555 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm4, %ymm0, %ymm0
556 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
557 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
558 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
559 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $222, %zmm1, %zmm3, %zmm0
560 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
562 ; AVX512VPOPCNTDQ-BW-LABEL: ne_1_v32i16:
563 ; AVX512VPOPCNTDQ-BW: # %bb.0:
564 ; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
565 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
566 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmw %zmm1, %zmm0, %k0
567 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmw %zmm0, %zmm0, %k1
568 ; AVX512VPOPCNTDQ-BW-NEXT: kord %k0, %k1, %k0
569 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
570 ; AVX512VPOPCNTDQ-BW-NEXT: retq
572 ; BITALG-LABEL: ne_1_v32i16:
574 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
575 ; BITALG-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
576 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
578 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
579 %3 = icmp ne <32 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
580 %4 = sext <32 x i1> %3 to <32 x i16>
584 define <64 x i8> @eq_1_v64i8(<64 x i8> %0) {
585 ; AVX512F-LABEL: eq_1_v64i8:
587 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
588 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
589 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm3
590 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm4
591 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
592 ; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
593 ; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm5
594 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
595 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
596 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm4
597 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
598 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
599 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
600 ; AVX512F-NEXT: vpandnq %zmm0, %zmm3, %zmm0
603 ; AVX512BW-LABEL: eq_1_v64i8:
605 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
606 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
607 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k1
608 ; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k0 {%k1}
609 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
610 ; AVX512BW-NEXT: retq
612 ; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v64i8:
613 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
614 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
615 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
616 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm3
617 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm4
618 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
619 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
620 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm1, %ymm5
621 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm5, %ymm1, %ymm1
622 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
623 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm4
624 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm4, %ymm0, %ymm0
625 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
626 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
627 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpandnq %zmm0, %zmm3, %zmm0
628 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
630 ; AVX512VPOPCNTDQ-BW-LABEL: eq_1_v64i8:
631 ; AVX512VPOPCNTDQ-BW: # %bb.0:
632 ; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
633 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
634 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmb %zmm1, %zmm0, %k1
635 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmb %zmm0, %zmm0, %k0 {%k1}
636 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
637 ; AVX512VPOPCNTDQ-BW-NEXT: retq
639 ; BITALG-LABEL: eq_1_v64i8:
641 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0
642 ; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
643 ; BITALG-NEXT: vpmovm2b %k0, %zmm0
645 %2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
646 %3 = icmp eq <64 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
647 %4 = sext <64 x i1> %3 to <64 x i8>
651 define <64 x i8> @ne_1_v64i8(<64 x i8> %0) {
652 ; AVX512F-LABEL: ne_1_v64i8:
654 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
655 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
656 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm3
657 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm4
658 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
659 ; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
660 ; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm5
661 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
662 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
663 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm4
664 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
665 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
666 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
667 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
668 ; AVX512F-NEXT: vpternlogq $222, %zmm1, %zmm3, %zmm0
671 ; AVX512BW-LABEL: ne_1_v64i8:
673 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
674 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
675 ; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
676 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k1
677 ; AVX512BW-NEXT: korq %k0, %k1, %k0
678 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
679 ; AVX512BW-NEXT: retq
681 ; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v64i8:
682 ; AVX512VPOPCNTDQ-NOBW: # %bb.0:
683 ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
684 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
685 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm3
686 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm4
687 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
688 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
689 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm1, %ymm5
690 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm5, %ymm1, %ymm1
691 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
692 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm4
693 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm4, %ymm0, %ymm0
694 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
695 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
696 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
697 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpternlogq $222, %zmm1, %zmm3, %zmm0
698 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq
700 ; AVX512VPOPCNTDQ-BW-LABEL: ne_1_v64i8:
701 ; AVX512VPOPCNTDQ-BW: # %bb.0:
702 ; AVX512VPOPCNTDQ-BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
703 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
704 ; AVX512VPOPCNTDQ-BW-NEXT: vptestmb %zmm1, %zmm0, %k0
705 ; AVX512VPOPCNTDQ-BW-NEXT: vptestnmb %zmm0, %zmm0, %k1
706 ; AVX512VPOPCNTDQ-BW-NEXT: korq %k0, %k1, %k0
707 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
708 ; AVX512VPOPCNTDQ-BW-NEXT: retq
710 ; BITALG-LABEL: ne_1_v64i8:
712 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0
713 ; BITALG-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
714 ; BITALG-NEXT: vpmovm2b %k0, %zmm0
716 %2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
717 %3 = icmp ne <64 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
718 %4 = sext <64 x i1> %3 to <64 x i8>
; Declarations of the llvm.ctpop intrinsic overloads called by the test
; functions in this file, one per 512-bit vector element type
; (8 x i64, 16 x i32, 32 x i16, 64 x i8).
722 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
723 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
724 declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)
725 declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)