1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
; Swaps the two lanes of a <2 x i1> vector: the shuffle mask is <1, 0> and the
; second shuffle operand is undef. The CHECK lines below pin the llc output for
; each FileCheck prefix (AVX512F, AVX512VL, VL_BW_DQ).
6 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
7 ; AVX512F-LABEL: shuf2i1_1_0:
9 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
10 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
11 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
12 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
14 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
15 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
16 ; AVX512F-NEXT: vzeroupper
19 ; AVX512VL-LABEL: shuf2i1_1_0:
21 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
22 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
23 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
24 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
25 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
27 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
30 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
32 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
33 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
34 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
35 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
36 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
37 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
39 %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
; Two-input <2 x i1> shuffle: mask <1, 2> takes lane 1 of %a followed by lane 0
; of the constant vector <i1 1, i1 0> (mask index 2 is the first lane of the
; second operand).
43 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
44 ; AVX512F-LABEL: shuf2i1_1_2:
46 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
47 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
48 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
49 ; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
50 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
51 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
52 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
53 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
54 ; AVX512F-NEXT: vzeroupper
57 ; AVX512VL-LABEL: shuf2i1_1_2:
59 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
60 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
61 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
62 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
63 ; AVX512VL-NEXT: movq $-1, %rax
64 ; AVX512VL-NEXT: vmovq %rax, %xmm2
65 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
66 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
67 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
70 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
72 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
73 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
74 ; VL_BW_DQ-NEXT: movq $-1, %rax
75 ; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
76 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
77 ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
78 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
79 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
81 %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
; Reverses the four lanes of a <4 x i1> vector: the shuffle mask is
; <3, 2, 1, 0> and the second shuffle operand is undef.
86 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
87 ; AVX512F-LABEL: shuf4i1_3_2_10:
89 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
90 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
91 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
92 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
93 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
94 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
95 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
96 ; AVX512F-NEXT: vzeroupper
99 ; AVX512VL-LABEL: shuf4i1_3_2_10:
101 ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
102 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
103 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
104 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
105 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
106 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
107 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
108 ; AVX512VL-NEXT: retq
110 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
112 ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
113 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
114 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
115 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
116 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
117 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
118 ; VL_BW_DQ-NEXT: retq
119 %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Shuffles the <8 x i1> result of an <8 x i64> equality compare with mask
; <3, 6, 1, 0, 3, 7, 7, 0>. Every mask index is below 8, so only lanes of %a2
; (%a == %a1) are selected; %b2 (%b == %b1) is passed as the second shuffle
; operand but no mask index refers to it.
123 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
124 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
126 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
127 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
128 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
129 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
130 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
131 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
132 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
133 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
134 ; AVX512F-NEXT: vzeroupper
137 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
139 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
140 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
141 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
142 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
143 ; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
144 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
145 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
146 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
147 ; AVX512VL-NEXT: vzeroupper
148 ; AVX512VL-NEXT: retq
150 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
152 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
153 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
154 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
155 ; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
156 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
157 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
158 ; VL_BW_DQ-NEXT: vzeroupper
159 ; VL_BW_DQ-NEXT: retq
160 %a2 = icmp eq <8 x i64> %a, %a1
161 %b2 = icmp eq <8 x i64> %b, %b1
162 %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
; Two-input <16 x i1> shuffle of two <16 x i32> equality compares
; (%a2 = %a == %a1, %b2 = %b == %b1). Mask indices 22 and 21 select lanes 6 and
; 5 of %b2; every other index is below 16 and selects a lane of %a2.
166 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
167 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
169 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
170 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
171 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
172 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
173 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
174 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
175 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
176 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
177 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
178 ; AVX512F-NEXT: vzeroupper
181 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
183 ; AVX512VL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
184 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
185 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
186 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
187 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
188 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
189 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
190 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
191 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
192 ; AVX512VL-NEXT: vzeroupper
193 ; AVX512VL-NEXT: retq
195 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
197 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
198 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
199 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
200 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
201 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
202 ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
203 ; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
204 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
205 ; VL_BW_DQ-NEXT: vzeroupper
206 ; VL_BW_DQ-NEXT: retq
207 %a2 = icmp eq <16 x i32> %a, %a1
208 %b2 = icmp eq <16 x i32> %b, %b1
209 %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Single-input <32 x i1> shuffle of the argument itself (second operand undef).
; The 32-entry mask is one 16-entry index pattern written twice, so both halves
; of the result use the same indices; indices 22 and 21 refer to lanes in the
; upper half (lanes 16..31) of %a.
213 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
214 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
216 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
217 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
218 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
219 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
220 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
221 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
222 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
223 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
224 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
225 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
226 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
227 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
228 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
229 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
230 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
233 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
235 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm1
236 ; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
237 ; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k1
238 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
239 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
240 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
241 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
242 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
243 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
244 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
245 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
246 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
247 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
248 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
249 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
250 ; AVX512VL-NEXT: retq
252 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
254 ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
255 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
256 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
257 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
258 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
259 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
260 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
261 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
262 ; VL_BW_DQ-NEXT: retq
263 %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Builds a <32 x i1> mask by comparing the <32 x i16> argument %a against zero,
; permutes that mask with a single-input shuffle (16-entry index pattern
; written twice, second operand undef), and uses the permuted mask to select
; per lane between %c (mask bit set) and %d.
267 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
268 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
270 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
271 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
272 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
273 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
274 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
275 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
276 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
277 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
278 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
279 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
280 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
281 ; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
282 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
283 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
284 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
285 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
286 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
287 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
288 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
289 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
292 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
294 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
295 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
296 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
297 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
298 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
299 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
300 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
301 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
302 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
303 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
304 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
305 ; AVX512VL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
306 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
307 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
308 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
309 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
310 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
311 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
312 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
313 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
314 ; AVX512VL-NEXT: retq
316 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
318 ; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0
319 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
320 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
321 ; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
322 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
323 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
324 ; VL_BW_DQ-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
325 ; VL_BW_DQ-NEXT: retq
326 %cmp = icmp eq <32 x i16> %a, zeroinitializer
327 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
328 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; Builds a <32 x i1> mask by comparing the <32 x i8> argument %a against zero,
; permutes that mask with a single-input shuffle (16-entry index pattern
; written twice, second operand undef), and uses the permuted mask to select
; per lane between %c (mask bit set) and %d.
332 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
333 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
335 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
336 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
337 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
338 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
339 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
340 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
341 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
342 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
343 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
344 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
345 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
346 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
347 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
348 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
349 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
350 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
353 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
355 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
356 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
357 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm3
358 ; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1
359 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
360 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
361 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
362 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
363 ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
364 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
365 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
366 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
367 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
368 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
369 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
370 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
371 ; AVX512VL-NEXT: retq
373 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
375 ; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0
376 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
377 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
378 ; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
379 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
380 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
381 ; VL_BW_DQ-NEXT: vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
382 ; VL_BW_DQ-NEXT: retq
383 %cmp = icmp eq <32 x i8> %a, zeroinitializer
384 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
385 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; The <32 x i1> mask is assembled from two separate <16 x i32> compares against
; zero: %concat is an identity-mask shuffle (indices 0..31) that places %cmp1
; in lanes 0..15 and %cmp2 in lanes 16..31. That mask is then permuted with a
; single-input shuffle (16-entry index pattern written twice) and used to
; select per lane between the <32 x i16> values %c (mask bit set) and %d.
389 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
390 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
392 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
393 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5
394 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
395 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
396 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
397 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
398 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
399 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
400 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
401 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
402 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
403 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
404 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
405 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
408 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
410 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
411 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm5
412 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
413 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
414 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
415 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
416 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
417 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
418 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
419 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
420 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
421 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
422 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
423 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
424 ; AVX512VL-NEXT: retq
426 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
428 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
429 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
430 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
431 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
432 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
433 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
434 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
435 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
436 ; VL_BW_DQ-NEXT: vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
437 ; VL_BW_DQ-NEXT: retq
438 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
439 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
440 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
441 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
442 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; The <32 x i1> mask is assembled from two separate <16 x i32> compares against
; zero: %concat is an identity-mask shuffle (indices 0..31) that places %cmp1
; in lanes 0..15 and %cmp2 in lanes 16..31. That mask is then permuted with a
; single-input shuffle (16-entry index pattern written twice) and used to
; select per lane between the <32 x i8> values %c (mask bit set) and %d.
446 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
447 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
449 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
450 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
451 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
452 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
453 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
454 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
455 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
456 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
457 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
458 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
459 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
462 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
464 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
465 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
466 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
467 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
468 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
469 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
470 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
471 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
472 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
473 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
474 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
475 ; AVX512VL-NEXT: retq
477 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
479 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
480 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
481 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
482 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
483 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
484 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
485 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
486 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
487 ; VL_BW_DQ-NEXT: vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
488 ; VL_BW_DQ-NEXT: retq
489 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
490 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
491 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
492 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
493 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; Bitcasts the i8 argument to <8 x i1> and shuffles it with a mask whose only
; defined entries (result lanes 1, 4 and 6) all select source lane 2; the
; remaining mask entries and the second shuffle operand are undef.
497 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
498 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
500 ; AVX512F-NEXT: kmovw %edi, %k1
501 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
502 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
503 ; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
504 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
505 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
506 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
507 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
508 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
509 ; AVX512F-NEXT: vzeroupper
512 ; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
514 ; AVX512VL-NEXT: kmovw %edi, %k1
515 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
516 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
517 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
518 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
519 ; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
520 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
521 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
522 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
523 ; AVX512VL-NEXT: vzeroupper
524 ; AVX512VL-NEXT: retq
526 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
528 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
529 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
530 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
531 ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
532 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
533 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
534 ; VL_BW_DQ-NEXT: vzeroupper
535 ; VL_BW_DQ-NEXT: retq
536 %b = bitcast i8 %a to <8 x i1>
537 %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
; Bitcasts the i8 argument to <8 x i1> and shuffles it against an all-zero
; second operand. Mask indices 10 and 9 select lanes of the zero vector,
; indices 2, 3 and 2 select lanes of %b, and three entries are undef. The
; shuffled mask is bitcast back to i8 in %d.
541 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
542 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
544 ; AVX512F-NEXT: kmovw %edi, %k1
545 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
546 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
547 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
548 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
549 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
550 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
551 ; AVX512F-NEXT: kmovw %k0, %eax
552 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
553 ; AVX512F-NEXT: vzeroupper
556 ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
558 ; AVX512VL-NEXT: kmovw %edi, %k1
559 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
560 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
561 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
562 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
563 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
564 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
565 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
566 ; AVX512VL-NEXT: kmovw %k0, %eax
567 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
568 ; AVX512VL-NEXT: vzeroupper
569 ; AVX512VL-NEXT: retq
571 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
573 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
574 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
575 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
576 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
577 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
578 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
579 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
580 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
581 ; VL_BW_DQ-NEXT: vzeroupper
582 ; VL_BW_DQ-NEXT: retq
583 %b = bitcast i8 %a to <8 x i1>
584 %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
585 %d = bitcast <8 x i1> %c to i8
; Bitcasts the i8 argument to <8 x i1> and moves source lanes 0, 1, 4 and 5
; into result lanes 0..3; the upper four mask entries and the second shuffle
; operand are undef. The shuffled mask is bitcast back to i8 in %d.
589 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
590 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
592 ; AVX512F-NEXT: kmovw %edi, %k1
593 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
594 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
595 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
596 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
597 ; AVX512F-NEXT: kmovw %k0, %eax
598 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
599 ; AVX512F-NEXT: vzeroupper
602 ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
604 ; AVX512VL-NEXT: kmovw %edi, %k1
605 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
606 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
607 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
608 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
609 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
610 ; AVX512VL-NEXT: kmovw %k0, %eax
611 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
612 ; AVX512VL-NEXT: vzeroupper
613 ; AVX512VL-NEXT: retq
615 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
617 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
618 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
619 ; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
620 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
621 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
622 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
623 ; VL_BW_DQ-NEXT: vzeroupper
624 ; VL_BW_DQ-NEXT: retq
625 %b = bitcast i8 %a to <8 x i1>
626 %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
627 %d = bitcast <8 x i1> %c to i8
; Bitcasts the i8 argument to <8 x i1> and shuffles it against an all-zero
; second operand. Mask index 9 (result lane 0) selects lane 1 of the zero
; vector; every other index is below 8 and selects a lane of %b. The shuffled
; mask is bitcast back to i8 in %d.
631 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
632 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
634 ; AVX512F-NEXT: kmovw %edi, %k1
635 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
636 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
637 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
638 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
639 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
640 ; AVX512F-NEXT: kmovw %k0, %eax
641 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
642 ; AVX512F-NEXT: vzeroupper
645 ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
647 ; AVX512VL-NEXT: kmovw %edi, %k1
648 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
649 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
650 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
651 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
652 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
653 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
654 ; AVX512VL-NEXT: kmovw %k0, %eax
655 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
656 ; AVX512VL-NEXT: vzeroupper
657 ; AVX512VL-NEXT: retq
659 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
661 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
662 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
663 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
664 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
665 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
666 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
667 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
668 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
669 ; VL_BW_DQ-NEXT: vzeroupper
670 ; VL_BW_DQ-NEXT: retq
671 %b = bitcast i8 %a to <8 x i1>
672 %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
673 %d = bitcast <8 x i1>%c to i8
; Operand order is swapped here: the all-zero vector is the FIRST shuffle
; operand and %b (the i8 argument bitcast to <8 x i1>) is the second. Mask
; indices 9 and 10 therefore select lanes 1 and 2 of %b (into result lanes 0
; and 3); every other index is below 8 and selects a zero lane. The shuffled
; mask is bitcast back to i8 in %d.
677 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
678 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
680 ; AVX512F-NEXT: kmovw %edi, %k1
681 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
682 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
683 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
684 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
685 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
686 ; AVX512F-NEXT: kmovw %k0, %eax
687 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
688 ; AVX512F-NEXT: vzeroupper
691 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
693 ; AVX512VL-NEXT: kmovw %edi, %k1
694 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
695 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
696 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
697 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
698 ; AVX512VL-NEXT: kmovw %k0, %eax
699 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
700 ; AVX512VL-NEXT: vzeroupper
701 ; AVX512VL-NEXT: retq
703 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
705 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
706 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
707 ; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
708 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
709 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
710 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
711 ; VL_BW_DQ-NEXT: vzeroupper
712 ; VL_BW_DQ-NEXT: retq
713 %b = bitcast i8 %a to <8 x i1>
714 %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
715 %d = bitcast <8 x i1>%c to i8
; Shuffle of an <8 x i1> mask against a non-uniform constant vector.
; The first shufflevector operand is the constant <1,1,0,0,1,1,0,0>; the second
; is %b, the i8 argument %a bitcast to <8 x i1>. With the mask
; <9,6,1,0,3,7,7,1> only result lane 0 comes from %b (it is %b[1]); every other
; lane is read from the constant operand. The result is bitcast to i8 as %c1.
; NOTE(review): the function name spells "10" in the fourth position, but the
; shuffle mask actually used here has 0 there - confirm whether the name or the
; mask reflects the intended test before changing either.
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
719 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
720 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
722 ; AVX512F-NEXT: kmovw %edi, %k1
723 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
724 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
725 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
726 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
727 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
728 ; AVX512F-NEXT: kmovw %k0, %eax
729 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
730 ; AVX512F-NEXT: vzeroupper
733 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
735 ; AVX512VL-NEXT: kmovw %edi, %k1
736 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
737 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
738 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
739 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
740 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
741 ; AVX512VL-NEXT: kmovw %k0, %eax
742 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
743 ; AVX512VL-NEXT: vzeroupper
744 ; AVX512VL-NEXT: retq
746 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
748 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
749 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
750 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
751 ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
752 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
753 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
754 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
755 ; VL_BW_DQ-NEXT: vzeroupper
756 ; VL_BW_DQ-NEXT: retq
757 %b = bitcast i8 %a to <8 x i1>
758 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
759 %c1 = bitcast <8 x i1>%c to i8
; Shuffle of an <8 x i1> argument against an all-ones constant vector.
; Unlike the neighbouring tests, %a arrives as an <8 x i1> vector argument, not
; as an i8 that is bitcast. The first shufflevector operand is the all-ones
; constant and the second is %a, so with the mask <9,6,1,0,3,7,7,0> only result
; lane 0 depends on the input (it is %a[1]); all other lanes read the all-ones
; constant. The result is bitcast to i8 as %c1.
; NOTE(review): the function name spells "10" in the fourth position, but the
; shuffle mask actually used here has 0 there - confirm whether the name or the
; mask reflects the intended test before changing either.
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
763 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
764 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
766 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
767 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
768 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
769 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
770 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
771 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
772 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
773 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
774 ; AVX512F-NEXT: kmovw %k0, %eax
775 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
776 ; AVX512F-NEXT: vzeroupper
779 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
781 ; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0
782 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
783 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
784 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
785 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
786 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
787 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
788 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
789 ; AVX512VL-NEXT: kmovw %k0, %eax
790 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
791 ; AVX512VL-NEXT: vzeroupper
792 ; AVX512VL-NEXT: retq
794 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
796 ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
797 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
798 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
799 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
800 ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
801 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
802 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
803 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
804 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
805 ; VL_BW_DQ-NEXT: vzeroupper
806 ; VL_BW_DQ-NEXT: retq
807 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
808 %c1 = bitcast <8 x i1>%c to i8
; Splat of lane 0 of a <16 x i1> mask.
; The i16 argument %a is bitcast to <16 x i1> (%b) and shuffled with an undef
; second operand using a zeroinitializer mask, i.e. every result lane selects
; index 0 of %b. The splatted vector is bitcast back to i16 as %d.
; All three configurations are expected to widen the mask to a 512-bit dword
; vector, broadcast element 0 with vpbroadcastd, and convert back to a mask.
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
813 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
814 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
816 ; AVX512F-NEXT: kmovw %edi, %k1
817 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
818 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
819 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
820 ; AVX512F-NEXT: kmovw %k0, %eax
821 ; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax
822 ; AVX512F-NEXT: vzeroupper
825 ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
827 ; AVX512VL-NEXT: kmovw %edi, %k1
828 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
829 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
830 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
831 ; AVX512VL-NEXT: kmovw %k0, %eax
832 ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
833 ; AVX512VL-NEXT: vzeroupper
834 ; AVX512VL-NEXT: retq
836 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
838 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
839 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
840 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
841 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
842 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
843 ; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax
844 ; VL_BW_DQ-NEXT: vzeroupper
845 ; VL_BW_DQ-NEXT: retq
846 %b = bitcast i16 %a to <16 x i1>
847 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
848 %d = bitcast <16 x i1> %c to i16
; Splat of lane 0 of a <64 x i1> mask.
; The i64 argument %a is bitcast to <64 x i1> (%b) and shuffled with an undef
; second operand using a zeroinitializer mask, i.e. every result lane selects
; index 0 of %b. The splatted vector is bitcast back to i64 as %d.
; In the two configurations without AVX512BW the expected code broadcasts over
; a 16-lane dword vector and then replicates the resulting 16-bit mask into the
; 64-bit result with scalar shifts and ors; with AVX512BW it is expected to use
; a byte-granular vpmovm2b / vpbroadcastb / vpmovb2m sequence instead.
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
852 define i64 @shuf64i1_zero(i64 %a) {
853 ; AVX512F-LABEL: shuf64i1_zero:
855 ; AVX512F-NEXT: kmovw %edi, %k1
856 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
857 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
858 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
859 ; AVX512F-NEXT: kmovw %k0, %eax
860 ; AVX512F-NEXT: kmovw %k0, %ecx
861 ; AVX512F-NEXT: shll $16, %ecx
862 ; AVX512F-NEXT: orl %eax, %ecx
863 ; AVX512F-NEXT: movq %rcx, %rax
864 ; AVX512F-NEXT: shlq $32, %rax
865 ; AVX512F-NEXT: orq %rcx, %rax
866 ; AVX512F-NEXT: vzeroupper
869 ; AVX512VL-LABEL: shuf64i1_zero:
871 ; AVX512VL-NEXT: kmovw %edi, %k1
872 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
873 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
874 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
875 ; AVX512VL-NEXT: kmovw %k0, %eax
876 ; AVX512VL-NEXT: kmovw %k0, %ecx
877 ; AVX512VL-NEXT: shll $16, %ecx
878 ; AVX512VL-NEXT: orl %eax, %ecx
879 ; AVX512VL-NEXT: movq %rcx, %rax
880 ; AVX512VL-NEXT: shlq $32, %rax
881 ; AVX512VL-NEXT: orq %rcx, %rax
882 ; AVX512VL-NEXT: vzeroupper
883 ; AVX512VL-NEXT: retq
885 ; VL_BW_DQ-LABEL: shuf64i1_zero:
887 ; VL_BW_DQ-NEXT: kmovq %rdi, %k0
888 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
889 ; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
890 ; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
891 ; VL_BW_DQ-NEXT: kmovq %k0, %rax
892 ; VL_BW_DQ-NEXT: vzeroupper
893 ; VL_BW_DQ-NEXT: retq
894 %b = bitcast i64 %a to <64 x i1>
895 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
896 %d = bitcast <64 x i1> %c to i64