1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
; Swaps the two lanes of the <2 x i1> argument: shuffle mask <1, 0>, with an
; undef second operand. The check lines below are tool-generated (see the
; note on the first line of this file).
6 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
7 ; AVX512F-LABEL: shuf2i1_1_0:
9 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
10 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
11 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
12 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
14 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
15 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
16 ; AVX512F-NEXT: vzeroupper
19 ; AVX512VL-LABEL: shuf2i1_1_0:
21 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
22 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
23 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
24 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
25 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
27 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
30 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
32 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
33 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
34 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
35 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
36 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
37 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
39 %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
; Two-source <2 x i1> shuffle with mask <1, 2>: result lane 0 takes lane 1 of
; %a, and result lane 1 takes element 0 of the constant second operand
; <i1 1, i1 0> (mask index 2 is the first lane of the second operand).
43 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
44 ; AVX512F-LABEL: shuf2i1_1_2:
46 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
47 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
48 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
49 ; AVX512F-NEXT: movq $-1, %rax
50 ; AVX512F-NEXT: vmovq %rax, %xmm1
51 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
52 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
53 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
54 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
55 ; AVX512F-NEXT: vzeroupper
58 ; AVX512VL-LABEL: shuf2i1_1_2:
60 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
61 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
62 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
63 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
64 ; AVX512VL-NEXT: movq $-1, %rax
65 ; AVX512VL-NEXT: vmovq %rax, %xmm2
66 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
67 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
68 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
71 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
73 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
74 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
75 ; VL_BW_DQ-NEXT: movq $-1, %rax
76 ; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
77 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
78 ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
79 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
80 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
82 %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
; Reverses the four lanes of the <4 x i1> argument: shuffle mask
; <3, 2, 1, 0>, with an undef second operand.
87 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
88 ; AVX512F-LABEL: shuf4i1_3_2_10:
90 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
91 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
92 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
93 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
94 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
95 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
96 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
97 ; AVX512F-NEXT: vzeroupper
100 ; AVX512VL-LABEL: shuf4i1_3_2_10:
102 ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
103 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
104 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
105 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
106 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
107 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
108 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
109 ; AVX512VL-NEXT: retq
111 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
113 ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
114 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
115 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
116 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
117 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
118 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
119 ; VL_BW_DQ-NEXT: retq
120 %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Shuffles the <8 x i1> result of (icmp eq %a, %a1) with mask
; <3, 6, 1, 0, 3, 7, 7, 0>. The second shuffle operand is %b2
; (icmp eq %b, %b1), but every mask index is below 8, so only lanes of %a2
; are selected.
124 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
125 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
127 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
128 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
129 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
130 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
131 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
132 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
133 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
134 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
135 ; AVX512F-NEXT: vzeroupper
138 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
140 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
141 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
142 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
143 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
144 ; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
145 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
146 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
147 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
148 ; AVX512VL-NEXT: vzeroupper
149 ; AVX512VL-NEXT: retq
151 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
153 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
154 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
155 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
156 ; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
157 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
158 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
159 ; VL_BW_DQ-NEXT: vzeroupper
160 ; VL_BW_DQ-NEXT: retq
161 %a2 = icmp eq <8 x i64> %a, %a1
162 %b2 = icmp eq <8 x i64> %b, %b1
163 %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
; Two-source <16 x i1> shuffle of two compare results, %a2 (icmp eq %a, %a1)
; and %b2 (icmp eq %b, %b1). Mask indices 22 and 21 select lanes 6 and 5 of
; %b2; every other index is below 16 and selects a lane of %a2.
167 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
168 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
170 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
171 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
172 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
173 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
174 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
175 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
176 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
177 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
178 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
179 ; AVX512F-NEXT: vzeroupper
182 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
184 ; AVX512VL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
185 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
186 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
187 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
188 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
189 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
190 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
191 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
192 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
193 ; AVX512VL-NEXT: vzeroupper
194 ; AVX512VL-NEXT: retq
196 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
198 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
199 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
200 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
201 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
202 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
203 ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
204 ; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
205 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
206 ; VL_BW_DQ-NEXT: vzeroupper
207 ; VL_BW_DQ-NEXT: retq
208 %a2 = icmp eq <16 x i32> %a, %a1
209 %b2 = icmp eq <16 x i32> %b, %b1
210 %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Single-source shuffle of a <32 x i1> argument (second operand undef). The
; 32-entry mask is the 16-entry pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> written twice; all indices are below
; 32, so every result lane comes from %a.
214 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
215 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
217 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
218 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
219 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
220 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
221 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
222 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
223 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
224 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
225 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
226 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
227 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
228 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
229 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
230 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
231 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
234 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
236 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
237 ; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
238 ; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k1
239 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
240 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
241 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
242 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
243 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
244 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
245 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
246 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
247 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
248 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
249 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
250 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
251 ; AVX512VL-NEXT: retq
253 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
255 ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
256 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
257 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
258 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
259 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
260 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
261 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
262 ; VL_BW_DQ-NEXT: retq
263 %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Builds a <32 x i1> mask by comparing the <32 x i16> argument %a against
; zero, shuffles that mask (second operand undef) with the 16-entry pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> written twice, and uses the shuffled
; mask to select per lane between %c (mask true) and %d (mask false).
267 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
268 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
270 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
271 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
272 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
273 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
274 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
275 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
276 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
277 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
278 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
279 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
280 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
281 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
282 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
283 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
284 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
285 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
288 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
290 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
291 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
292 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
293 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
294 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
295 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
296 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
297 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
298 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
299 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
300 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
301 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
302 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
303 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
304 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
305 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
306 ; AVX512VL-NEXT: retq
308 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
310 ; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0
311 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
312 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
313 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
314 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
315 ; VL_BW_DQ-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
316 ; VL_BW_DQ-NEXT: retq
317 %cmp = icmp eq <32 x i16> %a, zeroinitializer
318 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
319 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; Builds a <32 x i1> mask by comparing the <32 x i8> argument %a against
; zero, shuffles that mask (second operand undef) with the 16-entry pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> written twice, and uses the shuffled
; mask to select per lane between %c (mask true) and %d (mask false).
323 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
324 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
326 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
327 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
328 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
329 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
330 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
331 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
332 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
333 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
334 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
335 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
336 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
337 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
338 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
339 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
340 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
341 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
344 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
346 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
347 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
348 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm3
349 ; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1
350 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
351 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
352 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
353 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
354 ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
355 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
356 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
357 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
358 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
359 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
360 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
361 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
362 ; AVX512VL-NEXT: retq
364 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
366 ; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0
367 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
368 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
369 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
370 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
371 ; VL_BW_DQ-NEXT: vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
372 ; VL_BW_DQ-NEXT: retq
373 %cmp = icmp eq <32 x i8> %a, zeroinitializer
374 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
375 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; "Split" variant: the <32 x i1> mask is formed by concatenating two
; <16 x i1> compares against zero (%cmp1 from %a, %cmp2 from %b) with an
; identity shuffle over indices 0..31. That mask is then shuffled (second
; operand undef) with the 16-entry pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> written twice and used to select per
; lane between the <32 x i16> values %c (mask true) and %d (mask false).
379 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
380 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
382 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
383 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
384 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
385 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
386 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
387 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
388 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
389 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
390 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
391 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
392 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
395 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
397 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
398 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
399 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
400 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
401 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
402 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
403 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
404 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
405 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
406 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
407 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
408 ; AVX512VL-NEXT: retq
410 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
412 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
413 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
414 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
415 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
416 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
417 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
418 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
419 ; VL_BW_DQ-NEXT: vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
420 ; VL_BW_DQ-NEXT: retq
421 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
422 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
423 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
424 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
425 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; "Split" variant: the <32 x i1> mask is formed by concatenating two
; <16 x i1> compares against zero (%cmp1 from %a, %cmp2 from %b) with an
; identity shuffle over indices 0..31. That mask is then shuffled (second
; operand undef) with the 16-entry pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> written twice and used to select per
; lane between the <32 x i8> values %c (mask true) and %d (mask false).
429 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
430 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
432 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
433 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
434 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
435 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
436 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
437 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
438 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
439 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
440 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
441 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
442 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
445 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
447 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
448 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
449 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
450 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
451 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
452 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
453 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
454 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
455 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
456 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
457 ; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
458 ; AVX512VL-NEXT: retq
460 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
462 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
463 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
464 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
465 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
466 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
467 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
468 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
469 ; VL_BW_DQ-NEXT: vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
470 ; VL_BW_DQ-NEXT: retq
471 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
472 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
473 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
474 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
475 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; Bitcasts the i8 argument to <8 x i1> and shuffles it (second operand
; undef) so that result lanes 1, 4 and 6 all take source lane 2; the
; remaining result lanes have undef mask entries.
479 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
480 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
482 ; AVX512F-NEXT: kmovw %edi, %k1
483 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
484 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
485 ; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
486 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
487 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
488 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
489 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
490 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
491 ; AVX512F-NEXT: vzeroupper
494 ; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
496 ; AVX512VL-NEXT: kmovw %edi, %k1
497 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
498 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
499 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
500 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
501 ; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
502 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
503 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
504 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
505 ; AVX512VL-NEXT: vzeroupper
506 ; AVX512VL-NEXT: retq
508 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
510 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
511 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
512 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
513 ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
514 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
515 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
516 ; VL_BW_DQ-NEXT: vzeroupper
517 ; VL_BW_DQ-NEXT: retq
518 %b = bitcast i8 %a to <8 x i1>
519 %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
; Bitcasts the i8 argument to <8 x i1>, shuffles it against an all-zero
; second operand, and bitcasts the shuffled mask back to i8 (%d). Mask
; indices 10 and 9 (result lanes 0 and 2) pick lanes of the zeroinitializer
; operand; indices 2, 3 and 2 (result lanes 1, 4 and 6) pick lanes of %b;
; the other mask entries are undef.
523 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
524 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
526 ; AVX512F-NEXT: kmovw %edi, %k1
527 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
528 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
529 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
530 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
531 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
532 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
533 ; AVX512F-NEXT: kmovw %k0, %eax
534 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
535 ; AVX512F-NEXT: vzeroupper
538 ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
540 ; AVX512VL-NEXT: kmovw %edi, %k1
541 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
542 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
543 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
544 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
545 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
546 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
547 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
548 ; AVX512VL-NEXT: kmovw %k0, %eax
549 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
550 ; AVX512VL-NEXT: vzeroupper
551 ; AVX512VL-NEXT: retq
553 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
555 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
556 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
557 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
558 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
559 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
560 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
561 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
562 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
563 ; VL_BW_DQ-NEXT: vzeroupper
564 ; VL_BW_DQ-NEXT: retq
565 %b = bitcast i8 %a to <8 x i1>
566 %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
567 %d = bitcast <8 x i1> %c to i8
; Bitcasts the i8 argument to <8 x i1>, shuffles it (second operand undef)
; so result lanes 0..3 take source lanes 0, 1, 4 and 5 while the upper four
; mask entries are undef, then bitcasts the shuffled mask back to i8 (%d).
571 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
572 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
574 ; AVX512F-NEXT: kmovw %edi, %k1
575 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
576 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
577 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
578 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
579 ; AVX512F-NEXT: kmovw %k0, %eax
580 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
581 ; AVX512F-NEXT: vzeroupper
584 ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
586 ; AVX512VL-NEXT: kmovw %edi, %k1
587 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
588 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
589 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
590 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
591 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
592 ; AVX512VL-NEXT: kmovw %k0, %eax
593 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
594 ; AVX512VL-NEXT: vzeroupper
595 ; AVX512VL-NEXT: retq
597 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
599 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
600 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
601 ; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
602 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
603 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
604 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
605 ; VL_BW_DQ-NEXT: vzeroupper
606 ; VL_BW_DQ-NEXT: retq
607 %b = bitcast i8 %a to <8 x i1>
608 %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
609 %d = bitcast <8 x i1> %c to i8
; Bitcasts the i8 argument to <8 x i1>, shuffles it against an all-zero
; second operand with mask <9, 6, 1, 0, 3, 7, 7, 0>, and bitcasts the result
; back to i8 (%d). Mask index 9 puts a lane of the zeroinitializer operand
; in result lane 0; every other index is below 8 and picks a lane of %b.
613 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
614 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
616 ; AVX512F-NEXT: kmovw %edi, %k1
617 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
618 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
619 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
620 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
621 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
622 ; AVX512F-NEXT: kmovw %k0, %eax
623 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
624 ; AVX512F-NEXT: vzeroupper
627 ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
629 ; AVX512VL-NEXT: kmovw %edi, %k1
630 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
631 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
632 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
633 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
634 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
635 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
636 ; AVX512VL-NEXT: kmovw %k0, %eax
637 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
638 ; AVX512VL-NEXT: vzeroupper
639 ; AVX512VL-NEXT: retq
641 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
643 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
644 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
645 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
646 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
647 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
648 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
649 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
650 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
651 ; VL_BW_DQ-NEXT: vzeroupper
652 ; VL_BW_DQ-NEXT: retq
653 %b = bitcast i8 %a to <8 x i1>
654 %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
655 %d = bitcast <8 x i1>%c to i8
; Same shape as the previous test but with the operands swapped: the FIRST
; shuffle operand is zeroinitializer and the SECOND is the <8 x i1> view of %a,
; with mask <9,6,1,10,3,7,7,0>.
; Mask indices below 8 therefore read zero; only result lanes 0 and 3 depend on
; the input (they read elements 1 and 2 of %b via indices 9 and 10). This is
; why the expected code for the two VL configurations is a single vpshufb that
; moves two dwords and zeroes everything else.
659 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
660 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
662 ; AVX512F-NEXT: kmovw %edi, %k1
663 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
664 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
665 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
666 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
667 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
668 ; AVX512F-NEXT: kmovw %k0, %eax
669 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
670 ; AVX512F-NEXT: vzeroupper
673 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
675 ; AVX512VL-NEXT: kmovw %edi, %k1
676 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
677 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
678 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
679 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
680 ; AVX512VL-NEXT: kmovw %k0, %eax
681 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
682 ; AVX512VL-NEXT: vzeroupper
683 ; AVX512VL-NEXT: retq
685 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
687 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
688 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
689 ; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
690 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
691 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
692 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
693 ; VL_BW_DQ-NEXT: vzeroupper
694 ; VL_BW_DQ-NEXT: retq
; IR under test: note that the zero vector is operand 0 and %b is operand 1.
695 %b = bitcast i8 %a to <8 x i1>
696 %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
697 %d = bitcast <8 x i1>%c to i8
; Shuffle of a constant <8 x i1> vector (<1,1,0,0,1,1,0,0>, operand 0) with the
; <8 x i1> view of %a (operand 1), using mask <9,6,1,0,3,7,7,1>; the result is
; bitcast to i8.
; Only result lane 0 depends on the input (index 9 = element 1 of %b); lanes
; 1-7 read the constant operand and evaluate to 0,1,1,0,0,0,1.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_1, but the
; IR mask has 0 (not 10) in lane 3. Renaming the function or changing the mask
; would require regenerating the check lines, so the mismatch is only flagged
; here.
701 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
702 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
704 ; AVX512F-NEXT: kmovw %edi, %k1
705 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
706 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
707 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
708 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
709 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
710 ; AVX512F-NEXT: kmovw %k0, %eax
711 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
712 ; AVX512F-NEXT: vzeroupper
715 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
717 ; AVX512VL-NEXT: kmovw %edi, %k1
718 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
719 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
720 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
721 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
722 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
723 ; AVX512VL-NEXT: kmovw %k0, %eax
724 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
725 ; AVX512VL-NEXT: vzeroupper
726 ; AVX512VL-NEXT: retq
728 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
730 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
731 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
732 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
733 ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
734 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
735 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
736 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
737 ; VL_BW_DQ-NEXT: vzeroupper
738 ; VL_BW_DQ-NEXT: retq
; IR under test: constant vector is operand 0, input mask %b is operand 1.
739 %b = bitcast i8 %a to <8 x i1>
740 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
741 %c1 = bitcast <8 x i1>%c to i8
; Variant that takes the mask as a <8 x i1> argument instead of an i8, so the
; expected code first has to materialise a mask register from the vector
; argument. Operand 0 of the shuffle is an all-ones constant and operand 1 is
; %a, with mask <9,6,1,0,3,7,7,0>; the result is bitcast to i8.
; Only result lane 0 depends on the input (index 9 = element 1 of %a); lanes
; 1-7 read the all-ones operand and are always 1.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_0, but the
; IR mask has 0 (not 10) in lane 3. Renaming the function or changing the mask
; would require regenerating the check lines, so the mismatch is only flagged
; here.
745 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
746 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
748 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
749 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
750 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
751 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
752 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
753 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
754 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
755 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
756 ; AVX512F-NEXT: kmovw %k0, %eax
757 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
758 ; AVX512F-NEXT: vzeroupper
761 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
763 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
764 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
765 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
766 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
767 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
768 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
769 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
770 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
771 ; AVX512VL-NEXT: kmovw %k0, %eax
772 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
773 ; AVX512VL-NEXT: vzeroupper
774 ; AVX512VL-NEXT: retq
776 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
778 ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
779 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
780 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
781 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
782 ; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
783 ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
784 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
785 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
786 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
787 ; VL_BW_DQ-NEXT: vzeroupper
788 ; VL_BW_DQ-NEXT: retq
; IR under test: all-ones constant is operand 0, the argument %a is operand 1.
789 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
790 %c1 = bitcast <8 x i1>%c to i8
; Splat test on a 16-lane i1 mask: %a is reinterpreted as <16 x i1>, element 0
; is broadcast to every lane (all-zero shuffle mask, second operand undef), and
; the result is bitcast back to i16.
; All three configurations are expected to expand the mask to a 512-bit dword
; vector, broadcast dword 0, and convert back to a mask register.
795 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
796 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
798 ; AVX512F-NEXT: kmovw %edi, %k1
799 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
800 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
801 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
802 ; AVX512F-NEXT: kmovw %k0, %eax
803 ; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax
804 ; AVX512F-NEXT: vzeroupper
807 ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
809 ; AVX512VL-NEXT: kmovw %edi, %k1
810 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
811 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
812 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
813 ; AVX512VL-NEXT: kmovw %k0, %eax
814 ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
815 ; AVX512VL-NEXT: vzeroupper
816 ; AVX512VL-NEXT: retq
818 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
820 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
821 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
822 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
823 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
824 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
825 ; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax
826 ; VL_BW_DQ-NEXT: vzeroupper
827 ; VL_BW_DQ-NEXT: retq
; IR under test: a zeroinitializer shuffle mask selects element 0 for all lanes.
828 %b = bitcast i16 %a to <16 x i1>
829 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
830 %d = bitcast <16 x i1> %c to i16
; Splat test on a 64-lane i1 mask: %a is reinterpreted as <64 x i1>, element 0
; is broadcast to every lane (all-zero shuffle mask, second operand undef), and
; the result is bitcast back to i64.
; The two configurations without avx512bw are expected to perform the
; broadcast on a 16-lane dword vector and then replicate the resulting 16-bit
; mask across the 64-bit integer with shifts and ORs. The configuration with
; avx512bw is expected to use byte-granular mask moves (vpmovm2b / vpmovb2m)
; on a full 64-lane vector instead.
834 define i64 @shuf64i1_zero(i64 %a) {
835 ; AVX512F-LABEL: shuf64i1_zero:
837 ; AVX512F-NEXT: kmovw %edi, %k1
838 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
839 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
840 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
841 ; AVX512F-NEXT: kmovw %k0, %eax
842 ; AVX512F-NEXT: kmovw %k0, %ecx
843 ; AVX512F-NEXT: shll $16, %ecx
844 ; AVX512F-NEXT: orl %eax, %ecx
845 ; AVX512F-NEXT: movq %rcx, %rax
846 ; AVX512F-NEXT: shlq $32, %rax
847 ; AVX512F-NEXT: orq %rcx, %rax
848 ; AVX512F-NEXT: vzeroupper
851 ; AVX512VL-LABEL: shuf64i1_zero:
853 ; AVX512VL-NEXT: kmovw %edi, %k1
854 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
855 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
856 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
857 ; AVX512VL-NEXT: kmovw %k0, %eax
858 ; AVX512VL-NEXT: kmovw %k0, %ecx
859 ; AVX512VL-NEXT: shll $16, %ecx
860 ; AVX512VL-NEXT: orl %eax, %ecx
861 ; AVX512VL-NEXT: movq %rcx, %rax
862 ; AVX512VL-NEXT: shlq $32, %rax
863 ; AVX512VL-NEXT: orq %rcx, %rax
864 ; AVX512VL-NEXT: vzeroupper
865 ; AVX512VL-NEXT: retq
867 ; VL_BW_DQ-LABEL: shuf64i1_zero:
869 ; VL_BW_DQ-NEXT: kmovq %rdi, %k0
870 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
871 ; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
872 ; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
873 ; VL_BW_DQ-NEXT: kmovq %k0, %rax
874 ; VL_BW_DQ-NEXT: vzeroupper
875 ; VL_BW_DQ-NEXT: retq
; IR under test: a zeroinitializer shuffle mask selects element 0 for all lanes.
876 %b = bitcast i64 %a to <64 x i1>
877 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
878 %d = bitcast <64 x i1> %c to i64