; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i16> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogq $202, %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogq $202, %ymm3, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-ALL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
; AVX512VL-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-ALL:       # %bb.0:
; VL_BW_DQ-FAST-ALL-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; VL_BW_DQ-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-ALL-NEXT:    vzeroupper
; VL_BW_DQ-FAST-ALL-NEXT:    retq
;
; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-PERLANE:       # %bb.0:
; VL_BW_DQ-FAST-PERLANE-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpbroadcastq %xmm0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vzeroupper
; VL_BW_DQ-FAST-PERLANE-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,2,10,u,3,u,2,u]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,4,5,6,7]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1>%c to i8
  ret i8 %d
}
; Shuffle with the ZERO vector as the FIRST operand and the live mask second:
; only indices 9 and 10 (lanes 1 and 2 of %b) select non-zero elements. The
; VL targets lower this to a single vpshufb that zeroes every other lane;
; plain AVX512F uses a vpermt2q against a zero vector.
675 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
676 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
678 ; AVX512F-NEXT: kmovw %edi, %k1
679 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
680 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
681 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
682 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
683 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
684 ; AVX512F-NEXT: kmovw %k0, %eax
685 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
686 ; AVX512F-NEXT: vzeroupper
689 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
691 ; AVX512VL-NEXT: kmovw %edi, %k1
692 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
693 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
694 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
695 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
696 ; AVX512VL-NEXT: kmovw %k0, %eax
697 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
698 ; AVX512VL-NEXT: vzeroupper
699 ; AVX512VL-NEXT: retq
701 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
703 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
704 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
705 ; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
706 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
707 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
708 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
709 ; VL_BW_DQ-NEXT: vzeroupper
710 ; VL_BW_DQ-NEXT: retq
711 %b = bitcast i8 %a to <8 x i1>
712 %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
713 %d = bitcast <8 x i1>%c to i8
; Shuffle of a constant <8 x i1> first operand (<1,1,0,0,1,1,0,0>) with the
; live mask second: only index 9 (lane 1 of %b) is variable. On the VL
; targets this lowers to splatting lane 1 (vpshufd) and blending it into a
; constant-pool vector loaded from memory (vpblendd with mem); AVX512F
; permutes into a constant vector with vpermt2q.
717 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
718 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
720 ; AVX512F-NEXT: kmovw %edi, %k1
721 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
722 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
723 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
724 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
725 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
726 ; AVX512F-NEXT: kmovw %k0, %eax
727 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
728 ; AVX512F-NEXT: vzeroupper
731 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
733 ; AVX512VL-NEXT: kmovw %edi, %k1
734 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
735 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
736 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
737 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
738 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
739 ; AVX512VL-NEXT: kmovw %k0, %eax
740 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
741 ; AVX512VL-NEXT: vzeroupper
742 ; AVX512VL-NEXT: retq
744 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
746 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
747 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
748 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
749 ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
750 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
751 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
752 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
753 ; VL_BW_DQ-NEXT: vzeroupper
754 ; VL_BW_DQ-NEXT: retq
755 %b = bitcast i8 %a to <8 x i1>
756 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
757 %c1 = bitcast <8 x i1>%c to i8
; Same shuffle shape but the first operand is all-ones and the second is a
; live <8 x i1> argument (arriving as a vector, not an i8). Expected lowering:
; sign-extend + shift/test (or vpsllw/vpmovw2m on BW) to form the mask,
; materialize an all-ones vector (vpternlogd $255 / vpcmpeqd), and permute the
; single variable lane into it with vpermt2q/vpermi2d/vpermt2d.
761 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
762 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
764 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
765 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
766 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
767 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
768 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
769 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
770 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
771 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
772 ; AVX512F-NEXT: kmovw %k0, %eax
773 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
774 ; AVX512F-NEXT: vzeroupper
777 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
779 ; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0
780 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
781 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
782 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
783 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
784 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
785 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
786 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
787 ; AVX512VL-NEXT: kmovw %k0, %eax
788 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
789 ; AVX512VL-NEXT: vzeroupper
790 ; AVX512VL-NEXT: retq
792 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
794 ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
795 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
796 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
797 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
798 ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
799 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
800 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
801 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
802 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
803 ; VL_BW_DQ-NEXT: vzeroupper
804 ; VL_BW_DQ-NEXT: retq
805 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
806 %c1 = bitcast <8 x i1>%c to i8
; Splat shuffle: broadcast bit 0 of a 16-bit mask to all 16 lanes. Expected
; lowering on every target: widen the mask to a zmm vector, vpbroadcastd lane
; 0, and compress back to a 16-bit mask (vptestmd or vpmovd2m).
810 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
811 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
813 ; AVX512F-NEXT: kmovw %edi, %k1
814 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
815 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
816 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
817 ; AVX512F-NEXT: kmovw %k0, %eax
818 ; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax
819 ; AVX512F-NEXT: vzeroupper
822 ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
824 ; AVX512VL-NEXT: kmovw %edi, %k1
825 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
826 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
827 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
828 ; AVX512VL-NEXT: kmovw %k0, %eax
829 ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
830 ; AVX512VL-NEXT: vzeroupper
831 ; AVX512VL-NEXT: retq
833 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
835 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
836 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
837 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
838 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
839 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
840 ; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax
841 ; VL_BW_DQ-NEXT: vzeroupper
842 ; VL_BW_DQ-NEXT: retq
843 %b = bitcast i16 %a to <16 x i1>
844 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
845 %d = bitcast <16 x i1> %c to i16
; Splat shuffle of a 64-bit mask: broadcast bit 0 to all 64 lanes. Without
; AVX512BW (AVX512F / AVX512VL) there is no 64-bit mask register support, so
; the checks expect a 16-lane broadcast whose 16-bit result is replicated to
; 64 bits with shll/orl and shlq/orq. With BW+DQ a single vpmovm2b /
; vpbroadcastb / vpmovb2m / kmovq sequence suffices.
849 define i64 @shuf64i1_zero(i64 %a) {
850 ; AVX512F-LABEL: shuf64i1_zero:
852 ; AVX512F-NEXT: kmovw %edi, %k1
853 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
854 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
855 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
856 ; AVX512F-NEXT: kmovw %k0, %eax
857 ; AVX512F-NEXT: kmovw %k0, %ecx
858 ; AVX512F-NEXT: shll $16, %ecx
859 ; AVX512F-NEXT: orl %eax, %ecx
860 ; AVX512F-NEXT: movq %rcx, %rax
861 ; AVX512F-NEXT: shlq $32, %rax
862 ; AVX512F-NEXT: orq %rcx, %rax
863 ; AVX512F-NEXT: vzeroupper
866 ; AVX512VL-LABEL: shuf64i1_zero:
868 ; AVX512VL-NEXT: kmovw %edi, %k1
869 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
870 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
871 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
872 ; AVX512VL-NEXT: kmovw %k0, %eax
873 ; AVX512VL-NEXT: kmovw %k0, %ecx
874 ; AVX512VL-NEXT: shll $16, %ecx
875 ; AVX512VL-NEXT: orl %eax, %ecx
876 ; AVX512VL-NEXT: movq %rcx, %rax
877 ; AVX512VL-NEXT: shlq $32, %rax
878 ; AVX512VL-NEXT: orq %rcx, %rax
879 ; AVX512VL-NEXT: vzeroupper
880 ; AVX512VL-NEXT: retq
882 ; VL_BW_DQ-LABEL: shuf64i1_zero:
884 ; VL_BW_DQ-NEXT: kmovq %rdi, %k0
885 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
886 ; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
887 ; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
888 ; VL_BW_DQ-NEXT: kmovq %k0, %rax
889 ; VL_BW_DQ-NEXT: vzeroupper
890 ; VL_BW_DQ-NEXT: retq
891 %b = bitcast i64 %a to <64 x i1>
892 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
893 %d = bitcast <64 x i1> %c to i64
897 define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
898 ; AVX512F-LABEL: PR52500:
900 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
901 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
902 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
903 ; AVX512F-NEXT: vmovd %edi, %xmm0
904 ; AVX512F-NEXT: movl $789, %eax # imm = 0x315
905 ; AVX512F-NEXT: vmovd %eax, %xmm1
906 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
907 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
908 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
909 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
910 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
911 ; AVX512F-NEXT: vzeroupper
914 ; AVX512VL-LABEL: PR52500:
916 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
917 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
918 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
919 ; AVX512VL-NEXT: vmovd %edi, %xmm0
920 ; AVX512VL-NEXT: movl $789, %eax # imm = 0x315
921 ; AVX512VL-NEXT: vmovd %eax, %xmm1
922 ; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
923 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
924 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
925 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
926 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
927 ; AVX512VL-NEXT: vzeroupper
928 ; AVX512VL-NEXT: retq
930 ; VL_BW_DQ-LABEL: PR52500:
932 ; VL_BW_DQ-NEXT: vpsllw $7, %xmm0, %xmm0
933 ; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1
934 ; VL_BW_DQ-NEXT: vmovd %edi, %xmm0
935 ; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315
936 ; VL_BW_DQ-NEXT: vmovd %eax, %xmm1
937 ; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0
938 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
939 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1}
940 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
941 ; VL_BW_DQ-NEXT: vzeroupper
942 ; VL_BW_DQ-NEXT: retq
943 %insrt = insertelement <16 x i32> undef, i32 %in, i32 0
944 %mul = mul <16 x i32> %insrt, <i32 789, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
945 %eq = icmp eq <16 x i32> %mul, zeroinitializer
946 %cmp1 = shufflevector <16 x i1> %eq, <16 x i1> poison, <16 x i32> zeroinitializer
947 %and = and <16 x i1> %cmp1, %msk