1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE
; shuf2i1_1_0: swap the two lanes of a <2 x i1> argument.
; The shuffle takes %a and an undef second operand with mask <1, 0>, so
; result lane 0 is %a[1] and result lane 1 is %a[0].
8 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
9 ; AVX512F-LABEL: shuf2i1_1_0:
11 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
12 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
13 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
14 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
15 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
16 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
17 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
18 ; AVX512F-NEXT: vzeroupper
21 ; AVX512VL-LABEL: shuf2i1_1_0:
23 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
24 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
25 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
26 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
27 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
28 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
29 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
32 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
34 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
35 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
36 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
37 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
38 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
39 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
41 %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
; shuf2i1_1_2: two-operand <2 x i1> shuffle against a constant vector.
; Mask <1, 2> takes result lane 0 from %a[1] and result lane 1 from
; element 0 of the constant <i1 1, i1 0>, i.e. a constant true bit.
; The expected code materializes that constant as [18446744073709551615,0]
; and merges it with vpalignr.
45 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
46 ; AVX512F-LABEL: shuf2i1_1_2:
48 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
49 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
50 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
51 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
52 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
53 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
54 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
55 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
56 ; AVX512F-NEXT: vzeroupper
59 ; AVX512VL-LABEL: shuf2i1_1_2:
61 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
62 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
63 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
64 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
65 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
66 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
67 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
68 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
71 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
73 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
74 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
75 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
76 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
77 ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
78 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
79 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
81 %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
; shuf4i1_3_2_10: reverse the four lanes of a <4 x i1> argument.
; The mask is <3, 2, 1, 0>; the trailing "10" in the function name stands
; for the two indices 1 and 0, not for index 10.
86 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
87 ; AVX512F-LABEL: shuf4i1_3_2_10:
89 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
90 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
91 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
92 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
93 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
94 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
95 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
96 ; AVX512F-NEXT: vzeroupper
99 ; AVX512VL-LABEL: shuf4i1_3_2_10:
101 ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
102 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
103 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
104 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
105 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
106 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
107 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
108 ; AVX512VL-NEXT: retq
110 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
112 ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
113 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
114 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
115 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
116 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
117 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
118 ; VL_BW_DQ-NEXT: retq
119 %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; shuf8i1_3_6_1_0_3_7_7_0: shuffle of <8 x i1> masks produced by compares.
; %a2 = (%a == %a1) and %b2 = (%b == %b1) are both <8 x i1>. Every index in
; the mask <3,6,1,0,3,7,7,0> is below 8, so only lanes of %a2 are selected;
; accordingly the expected code contains a single vpcmpeqq.
123 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
124 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
126 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
127 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
128 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
129 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
130 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
131 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
132 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
133 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
134 ; AVX512F-NEXT: vzeroupper
137 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
139 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
140 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
141 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
142 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
143 ; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
144 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
145 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
146 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
147 ; AVX512VL-NEXT: vzeroupper
148 ; AVX512VL-NEXT: retq
150 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
152 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
153 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
154 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
155 ; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
156 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
157 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
158 ; VL_BW_DQ-NEXT: vzeroupper
159 ; VL_BW_DQ-NEXT: retq
160 %a2 = icmp eq <8 x i64> %a, %a1
161 %b2 = icmp eq <8 x i64> %b, %b1
162 %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
; shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: genuine two-input shuffle of
; <16 x i1> compare results.
; %a2 = (%a == %a1), %b2 = (%b == %b1). Mask indices 22 and 21 are >= 16 and
; therefore select lanes 6 and 5 of %b2; all other indices select from %a2.
; Both compares are needed, and the expected code uses a two-source vpermi2d.
166 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
167 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
169 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
170 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
171 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
172 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
173 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
174 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
175 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
176 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
177 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
178 ; AVX512F-NEXT: vzeroupper
181 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
183 ; AVX512VL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
184 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
185 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
186 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
187 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
188 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
189 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
190 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
191 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
192 ; AVX512VL-NEXT: vzeroupper
193 ; AVX512VL-NEXT: retq
195 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
197 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
198 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
199 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
200 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
201 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
202 ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
203 ; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
204 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
205 ; VL_BW_DQ-NEXT: vzeroupper
206 ; VL_BW_DQ-NEXT: retq
207 %a2 = icmp eq <16 x i32> %a, %a1
208 %b2 = icmp eq <16 x i32> %b, %b1
209 %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; shuf32i1_<mask>: single-input shuffle of a <32 x i1> argument.
; The second shuffle operand is undef and the 32-entry mask is the 16-entry
; pattern 3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0 repeated twice; all indices are
; below 32, so every result lane comes from %a.
; Without avx512bw the expected code widens each 16-lane half to dwords and
; uses vpermi2d; with avx512bw it uses a single word-granular vpermw.
213 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
214 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
216 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
217 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
218 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
219 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
220 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
221 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
222 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
223 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
224 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
225 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
226 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
227 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
228 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
229 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
230 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
233 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
235 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm1
236 ; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
237 ; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k1
238 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
239 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
240 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
241 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
242 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
243 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
244 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
245 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
246 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
247 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
248 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
249 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
250 ; AVX512VL-NEXT: retq
252 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
254 ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
255 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
256 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
257 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
258 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
259 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
260 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
261 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
262 ; VL_BW_DQ-NEXT: retq
263 %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; shuf32i1_<mask>_icmp_v32i16: compare -> mask shuffle -> select on <32 x i16>.
; %cmp = (%a == 0) gives a <32 x i1>, which is shuffled with the repeated
; 16-entry mask (second operand undef) and then used as the condition of a
; lane-wise select between %c and %d.
; With avx512bw the expected code keeps the shuffled mask in a k-register and
; finishes with a masked vpblendmw; the other configurations rebuild a vector
; mask and blend with vpternlogq $202.
267 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
268 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
270 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
271 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
272 ; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
273 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
274 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
275 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
276 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
277 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
278 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
279 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
280 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
281 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
282 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
283 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
284 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
285 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
286 ; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
289 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
291 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
292 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
293 ; AVX512VL-NEXT: vpmovsxwd %ymm4, %zmm4
294 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
295 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
296 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
297 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
298 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
299 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
300 ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
301 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
302 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
303 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
304 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
305 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
306 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
307 ; AVX512VL-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
308 ; AVX512VL-NEXT: retq
310 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
312 ; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0
313 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
314 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
315 ; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
316 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
317 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
318 ; VL_BW_DQ-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
319 ; VL_BW_DQ-NEXT: retq
320 %cmp = icmp eq <32 x i16> %a, zeroinitializer
321 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
322 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; shuf32i1_<mask>_icmp_v32i8: compare -> mask shuffle -> select on <32 x i8>.
; Byte-element counterpart of the <32 x i16> variant: %cmp = (%a == 0) is
; shuffled with the repeated 16-entry mask (second operand undef) and the
; result selects lane-wise between %c and %d.
; The final blend differs per configuration in the expectations below:
; vpblendvb, vpternlogq $202 on ymm, or a masked vpblendmb.
326 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
327 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
329 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
330 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
331 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
332 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
333 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
334 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
335 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
336 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
337 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
338 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
339 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
340 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
341 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
342 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
343 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
344 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
347 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
349 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
350 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
351 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm3
352 ; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1
353 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
354 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
355 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
356 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
357 ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
358 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
359 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
360 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
361 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
362 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
363 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
364 ; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
365 ; AVX512VL-NEXT: retq
367 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
369 ; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0
370 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
371 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
372 ; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
373 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
374 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
375 ; VL_BW_DQ-NEXT: vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
376 ; VL_BW_DQ-NEXT: retq
377 %cmp = icmp eq <32 x i8> %a, zeroinitializer
378 %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
379 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; shuf32i1_<mask>_icmp_v32i16_split: the <32 x i1> mask is built from two
; separate <16 x i32> compares.
; %cmp1 = (%a == 0) and %cmp2 = (%b == 0) are concatenated by an identity
; shuffle (indices 0..31) into %concat, which is then shuffled with the
; repeated 16-entry mask and used to select between the <32 x i16> values
; %c and %d.
; With avx512bw the expected code joins the two 16-bit k-masks with kunpckwd;
; the other configurations feed both halves straight into vpermi2d.
383 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
384 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
386 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
387 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
388 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
389 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
390 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
391 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
392 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
393 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
394 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
395 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
396 ; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
399 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
401 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
402 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
403 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
404 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
405 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
406 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
407 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
408 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
409 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
410 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
411 ; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
412 ; AVX512VL-NEXT: retq
414 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
416 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
417 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
418 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
419 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
420 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
421 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
422 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
423 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
424 ; VL_BW_DQ-NEXT: vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
425 ; VL_BW_DQ-NEXT: retq
426 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
427 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
428 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
429 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
430 %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
; shuf32i1_<mask>_icmp_v32i8_split: byte-element counterpart of the
; <32 x i16> split test.
; Two <16 x i32> compares against zero (%cmp1, %cmp2) are concatenated by an
; identity shuffle into a <32 x i1>, shuffled with the repeated 16-entry
; mask, and the result selects lane-wise between the <32 x i8> values %c
; and %d.
434 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
435 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
437 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
438 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
439 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
440 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
441 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
442 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
443 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
444 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
445 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
446 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
447 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
450 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
452 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
453 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
454 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
455 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
456 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
457 ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
458 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
459 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
460 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
461 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
462 ; AVX512VL-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0
463 ; AVX512VL-NEXT: retq
465 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
467 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
468 ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
469 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
470 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
471 ; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
472 ; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
473 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
474 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
475 ; VL_BW_DQ-NEXT: vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
476 ; VL_BW_DQ-NEXT: retq
477 %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
478 %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
479 %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
480 %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
481 %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
; shuf8i1_u_2_u_u_2_u_2_u: mostly-undef mask that splats a single element.
; The i8 argument is bitcast to <8 x i1>; in the shuffle mask the only
; defined lanes (1, 4 and 6) all read element 2, the rest are undef.
; Of the complete tests in this part of the file, this is the only one whose
; expectations are split per RUN configuration: with
; +fast-variable-crosslane-shuffle the expected code uses vpermd with a
; broadcast index of 2, while the perlane-only runs expect
; vpshufd + vpbroadcastq.
485 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
486 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
488 ; AVX512F-NEXT: kmovw %edi, %k1
489 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
490 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
491 ; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
492 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
493 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
494 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
495 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
496 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
497 ; AVX512F-NEXT: vzeroupper
500 ; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
501 ; AVX512VL-FAST-ALL: # %bb.0:
502 ; AVX512VL-FAST-ALL-NEXT: kmovw %edi, %k1
503 ; AVX512VL-FAST-ALL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
504 ; AVX512VL-FAST-ALL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
505 ; AVX512VL-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
506 ; AVX512VL-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
507 ; AVX512VL-FAST-ALL-NEXT: vpslld $31, %ymm1, %ymm1
508 ; AVX512VL-FAST-ALL-NEXT: vptestmd %ymm1, %ymm1, %k1
509 ; AVX512VL-FAST-ALL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
510 ; AVX512VL-FAST-ALL-NEXT: vpmovdw %ymm0, %xmm0
511 ; AVX512VL-FAST-ALL-NEXT: vzeroupper
512 ; AVX512VL-FAST-ALL-NEXT: retq
514 ; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
515 ; AVX512VL-FAST-PERLANE: # %bb.0:
516 ; AVX512VL-FAST-PERLANE-NEXT: kmovw %edi, %k1
517 ; AVX512VL-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
518 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
519 ; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
520 ; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1
521 ; AVX512VL-FAST-PERLANE-NEXT: vpslld $31, %ymm1, %ymm1
522 ; AVX512VL-FAST-PERLANE-NEXT: vptestmd %ymm1, %ymm1, %k1
523 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
524 ; AVX512VL-FAST-PERLANE-NEXT: vpmovdw %ymm0, %xmm0
525 ; AVX512VL-FAST-PERLANE-NEXT: vzeroupper
526 ; AVX512VL-FAST-PERLANE-NEXT: retq
528 ; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
529 ; VL_BW_DQ-FAST-ALL: # %bb.0:
530 ; VL_BW_DQ-FAST-ALL-NEXT: kmovd %edi, %k0
531 ; VL_BW_DQ-FAST-ALL-NEXT: vpmovm2d %k0, %ymm0
532 ; VL_BW_DQ-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
533 ; VL_BW_DQ-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
534 ; VL_BW_DQ-FAST-ALL-NEXT: vpmovd2m %ymm0, %k0
535 ; VL_BW_DQ-FAST-ALL-NEXT: vpmovm2w %k0, %xmm0
536 ; VL_BW_DQ-FAST-ALL-NEXT: vzeroupper
537 ; VL_BW_DQ-FAST-ALL-NEXT: retq
539 ; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
540 ; VL_BW_DQ-FAST-PERLANE: # %bb.0:
541 ; VL_BW_DQ-FAST-PERLANE-NEXT: kmovd %edi, %k0
542 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0
543 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
544 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0
545 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0
546 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2w %k0, %xmm0
547 ; VL_BW_DQ-FAST-PERLANE-NEXT: vzeroupper
548 ; VL_BW_DQ-FAST-PERLANE-NEXT: retq
549 %b = bitcast i8 %a to <8 x i1>
550 %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
; shuf8i1_10_2_9_u_3_u_2_u: shuffle of an i8-as-<8 x i1> against a zero vector.
; The second shuffle operand is zeroinitializer, so mask indices 10 and 9
; (>= 8) yield false lanes; indices 2 and 3 read %b; the remaining lanes are
; undef. The shuffled mask is bitcast back to i8 as %d.
; The expected code models the zero operand with vpxor and a two-source
; permute (vpermi2q / vpermi2d).
554 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
555 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
557 ; AVX512F-NEXT: kmovw %edi, %k1
558 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
559 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
560 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
561 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
562 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
563 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
564 ; AVX512F-NEXT: kmovw %k0, %eax
565 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
566 ; AVX512F-NEXT: vzeroupper
569 ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
571 ; AVX512VL-NEXT: kmovw %edi, %k1
572 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
573 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
574 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
575 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
576 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
577 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
578 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
579 ; AVX512VL-NEXT: kmovw %k0, %eax
580 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
581 ; AVX512VL-NEXT: vzeroupper
582 ; AVX512VL-NEXT: retq
584 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
586 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
587 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
588 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
589 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
590 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
591 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
592 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
593 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
594 ; VL_BW_DQ-NEXT: vzeroupper
595 ; VL_BW_DQ-NEXT: retq
596 %b = bitcast i8 %a to <8 x i1>
597 %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
598 %d = bitcast <8 x i1> %c to i8
; shuf8i1_0_1_4_5_u_u_u_u: single-input shuffle with an undef upper half.
; The i8 argument is bitcast to <8 x i1>; result lanes 0..3 read elements
; 0, 1, 4 and 5 of %b and lanes 4..7 are undef. The shuffled mask is bitcast
; back to i8 as %d.
; The expected code uses an immediate-controlled shuffle (vshufi64x2 or
; vpermq) rather than a variable permute with a constant index vector.
602 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
603 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
605 ; AVX512F-NEXT: kmovw %edi, %k1
606 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
607 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,4,5,6,7]
608 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
609 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
610 ; AVX512F-NEXT: kmovw %k0, %eax
611 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
612 ; AVX512F-NEXT: vzeroupper
615 ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
617 ; AVX512VL-NEXT: kmovw %edi, %k1
618 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
619 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
620 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
621 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
622 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
623 ; AVX512VL-NEXT: kmovw %k0, %eax
624 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
625 ; AVX512VL-NEXT: vzeroupper
626 ; AVX512VL-NEXT: retq
628 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
630 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
631 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
632 ; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
633 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
634 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
635 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
636 ; VL_BW_DQ-NEXT: vzeroupper
637 ; VL_BW_DQ-NEXT: retq
638 %b = bitcast i8 %a to <8 x i1>
639 %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
640 %d = bitcast <8 x i1> %c to i8
; shuf8i1_9_6_1_0_3_7_7_0: fully-defined mask with one lane taken from zero.
; The i8 argument is bitcast to <8 x i1> and shuffled against
; zeroinitializer; index 9 (>= 8) makes result lane 0 false, every other lane
; reads %b. The shuffled mask is bitcast back to i8 as %d.
; The expected permute constant uses index 8 instead of 9, which is
; equivalent because every element of the second operand is zero.
644 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
645 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
647 ; AVX512F-NEXT: kmovw %edi, %k1
648 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
649 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
650 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
651 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
652 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
653 ; AVX512F-NEXT: kmovw %k0, %eax
654 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
655 ; AVX512F-NEXT: vzeroupper
658 ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
660 ; AVX512VL-NEXT: kmovw %edi, %k1
661 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
662 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
663 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
664 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
665 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
666 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
667 ; AVX512VL-NEXT: kmovw %k0, %eax
668 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
669 ; AVX512VL-NEXT: vzeroupper
670 ; AVX512VL-NEXT: retq
672 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
674 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
675 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
676 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
677 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
678 ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
679 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
680 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
681 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
682 ; VL_BW_DQ-NEXT: vzeroupper
683 ; VL_BW_DQ-NEXT: retq
684 %b = bitcast i8 %a to <8 x i1>
685 %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
686 %d = bitcast <8 x i1>%c to i8
; Test case - bitcast i8 %a to <8 x i1> (%b) and shuffle with zeroinitializer
; as the FIRST operand and %b as the second, using mask <9,6,1,10,3,7,7,0>;
; the shuffled mask is then bitcast to i8.
; Only mask indices >= 8 read from %b, so result lane 0 = %b[1] and result
; lane 3 = %b[2]; every other result lane comes from the zero vector.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
690 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
691 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
693 ; AVX512F-NEXT: kmovw %edi, %k1
694 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
695 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
696 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
697 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
698 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
699 ; AVX512F-NEXT: kmovw %k0, %eax
700 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
701 ; AVX512F-NEXT: vzeroupper
704 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
706 ; AVX512VL-NEXT: kmovw %edi, %k1
707 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
708 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
709 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
710 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
711 ; AVX512VL-NEXT: kmovw %k0, %eax
712 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
713 ; AVX512VL-NEXT: vzeroupper
714 ; AVX512VL-NEXT: retq
716 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
718 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
719 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
720 ; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
721 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
722 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
723 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
724 ; VL_BW_DQ-NEXT: vzeroupper
725 ; VL_BW_DQ-NEXT: retq
726 %b = bitcast i8 %a to <8 x i1>
727 %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
728 %d = bitcast <8 x i1>%c to i8
; Test case - shuffle the constant <8 x i1> <1,1,0,0,1,1,0,0> (first operand)
; with %b, the <8 x i1> bitcast of i8 %a (second operand), using mask
; <9,6,1,0,3,7,7,1>; the shuffled mask is then bitcast to i8.
; Only mask index 9 reads from %b, so result lane 0 = %b[1] and all other
; result lanes are fixed values taken from the constant operand.
; NOTE(review) - the function name spells "10" in the fourth mask position,
; but the shufflevector mask below has i32 0 there. Renaming would also
; require updating every -LABEL line, so the name is left as is.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
732 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
733 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
735 ; AVX512F-NEXT: kmovw %edi, %k1
736 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
737 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
738 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
739 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
740 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
741 ; AVX512F-NEXT: kmovw %k0, %eax
742 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
743 ; AVX512F-NEXT: vzeroupper
746 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
748 ; AVX512VL-NEXT: kmovw %edi, %k1
749 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
750 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
751 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
752 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
753 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
754 ; AVX512VL-NEXT: kmovw %k0, %eax
755 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
756 ; AVX512VL-NEXT: vzeroupper
757 ; AVX512VL-NEXT: retq
759 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
761 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
762 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
763 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
764 ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
765 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
766 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
767 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
768 ; VL_BW_DQ-NEXT: vzeroupper
769 ; VL_BW_DQ-NEXT: retq
770 %b = bitcast i8 %a to <8 x i1>
771 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
772 %c1 = bitcast <8 x i1>%c to i8
; Test case - shuffle an all-ones <8 x i1> constant (first operand) with the
; <8 x i1> argument %a (second operand) using mask <9,6,1,0,3,7,7,0>, then
; bitcast the shuffled mask to i8. Unlike the i8-argument tests, the input
; here is passed directly as a vector of i1.
; Only mask index 9 reads from %a, so result lane 0 = %a[1] and every other
; result lane is a 1 from the constant; the expected index vectors below are
; accordingly [9,1,2,3,4,5,6,7].
; NOTE(review) - the function name spells "10" in the fourth mask position,
; but the shufflevector mask below has i32 0 there. Renaming would also
; require updating every -LABEL line, so the name is left as is.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
776 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
777 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
779 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
780 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
781 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
782 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
783 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
784 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
785 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
786 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
787 ; AVX512F-NEXT: kmovw %k0, %eax
788 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
789 ; AVX512F-NEXT: vzeroupper
792 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
794 ; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0
795 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
796 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
797 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
798 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
799 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
800 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
801 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
802 ; AVX512VL-NEXT: kmovw %k0, %eax
803 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
804 ; AVX512VL-NEXT: vzeroupper
805 ; AVX512VL-NEXT: retq
807 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
809 ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
810 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
811 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
812 ; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
813 ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
814 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
815 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
816 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
817 ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
818 ; VL_BW_DQ-NEXT: vzeroupper
819 ; VL_BW_DQ-NEXT: retq
820 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
821 %c1 = bitcast <8 x i1>%c to i8
; Test case - bitcast i16 %a to <16 x i1>, splat lane 0 across all 16 lanes
; (shufflevector with an all-zero mask), and bitcast the result to i16.
; Every run line is expected to widen the mask to a 512-bit vector of dwords,
; broadcast the low dword, and convert back to a mask register.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
825 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
826 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
828 ; AVX512F-NEXT: kmovw %edi, %k1
829 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
830 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
831 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
832 ; AVX512F-NEXT: kmovw %k0, %eax
833 ; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax
834 ; AVX512F-NEXT: vzeroupper
837 ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
839 ; AVX512VL-NEXT: kmovw %edi, %k1
840 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
841 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
842 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
843 ; AVX512VL-NEXT: kmovw %k0, %eax
844 ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
845 ; AVX512VL-NEXT: vzeroupper
846 ; AVX512VL-NEXT: retq
848 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
850 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
851 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
852 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
853 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
854 ; VL_BW_DQ-NEXT: kmovd %k0, %eax
855 ; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax
856 ; VL_BW_DQ-NEXT: vzeroupper
857 ; VL_BW_DQ-NEXT: retq
858 %b = bitcast i16 %a to <16 x i1>
859 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
860 %d = bitcast <16 x i1> %c to i16
; Test case - bitcast i64 %a to <64 x i1>, splat lane 0 across all 64 lanes
; (shufflevector with an all-zero mask), and bitcast the result to i64.
; In the two run configurations without AVX512BW the expected code performs the
; splat on a 16-lane dword vector and then replicates the resulting 16-bit mask
; into the full 64-bit return value with shifts and ORs. The configuration with
; AVX512BW instead moves all 64 bits into a mask register and broadcasts a byte
; lane across a 512-bit vector.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
864 define i64 @shuf64i1_zero(i64 %a) {
865 ; AVX512F-LABEL: shuf64i1_zero:
867 ; AVX512F-NEXT: kmovw %edi, %k1
868 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
869 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
870 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
871 ; AVX512F-NEXT: kmovw %k0, %eax
872 ; AVX512F-NEXT: kmovw %k0, %ecx
873 ; AVX512F-NEXT: shll $16, %ecx
874 ; AVX512F-NEXT: orl %eax, %ecx
875 ; AVX512F-NEXT: movq %rcx, %rax
876 ; AVX512F-NEXT: shlq $32, %rax
877 ; AVX512F-NEXT: orq %rcx, %rax
878 ; AVX512F-NEXT: vzeroupper
881 ; AVX512VL-LABEL: shuf64i1_zero:
883 ; AVX512VL-NEXT: kmovw %edi, %k1
884 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
885 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
886 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
887 ; AVX512VL-NEXT: kmovw %k0, %eax
888 ; AVX512VL-NEXT: kmovw %k0, %ecx
889 ; AVX512VL-NEXT: shll $16, %ecx
890 ; AVX512VL-NEXT: orl %eax, %ecx
891 ; AVX512VL-NEXT: movq %rcx, %rax
892 ; AVX512VL-NEXT: shlq $32, %rax
893 ; AVX512VL-NEXT: orq %rcx, %rax
894 ; AVX512VL-NEXT: vzeroupper
895 ; AVX512VL-NEXT: retq
897 ; VL_BW_DQ-LABEL: shuf64i1_zero:
899 ; VL_BW_DQ-NEXT: kmovq %rdi, %k0
900 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
901 ; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
902 ; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
903 ; VL_BW_DQ-NEXT: kmovq %k0, %rax
904 ; VL_BW_DQ-NEXT: vzeroupper
905 ; VL_BW_DQ-NEXT: retq
906 %b = bitcast i64 %a to <64 x i1>
907 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
908 %d = bitcast <64 x i1> %c to i64
; Test case - insert i32 %in into lane 0 of an otherwise-undef <16 x i32>,
; multiply by a vector whose lane 0 is 789 (all other lanes poison), compare
; the product against zero, splat lane 0 of that comparison across 16 lanes,
; and AND the splat with the <16 x i1> argument %msk.
; The expected code does the multiply on a single element, broadcasts the
; product, and folds the AND with %msk into a masked vptestnmd.
; NOTE(review) - the name presumably refers to bug report 52500; confirm
; against the bug tracker before relying on that.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
912 define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
913 ; AVX512F-LABEL: PR52500:
915 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
916 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
917 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
918 ; AVX512F-NEXT: vmovd %edi, %xmm0
919 ; AVX512F-NEXT: movl $789, %eax # imm = 0x315
920 ; AVX512F-NEXT: vmovd %eax, %xmm1
921 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
922 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
923 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
924 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
925 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
926 ; AVX512F-NEXT: vzeroupper
929 ; AVX512VL-LABEL: PR52500:
931 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
932 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
933 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
934 ; AVX512VL-NEXT: vmovd %edi, %xmm0
935 ; AVX512VL-NEXT: movl $789, %eax # imm = 0x315
936 ; AVX512VL-NEXT: vmovd %eax, %xmm1
937 ; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
938 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
939 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
940 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
941 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
942 ; AVX512VL-NEXT: vzeroupper
943 ; AVX512VL-NEXT: retq
945 ; VL_BW_DQ-LABEL: PR52500:
947 ; VL_BW_DQ-NEXT: vpsllw $7, %xmm0, %xmm0
948 ; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1
949 ; VL_BW_DQ-NEXT: vmovd %edi, %xmm0
950 ; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315
951 ; VL_BW_DQ-NEXT: vmovd %eax, %xmm1
952 ; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0
953 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
954 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1}
955 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
956 ; VL_BW_DQ-NEXT: vzeroupper
957 ; VL_BW_DQ-NEXT: retq
958 %insrt = insertelement <16 x i32> undef, i32 %in, i32 0
959 %mul = mul <16 x i32> %insrt, <i32 789, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
960 %eq = icmp eq <16 x i32> %mul, zeroinitializer
961 %cmp1 = shufflevector <16 x i1> %eq, <16 x i1> poison, <16 x i32> zeroinitializer
962 %and = and <16 x i1> %cmp1, %msk