; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
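
; The AVX512DQ run does not include AVX512BW, so shifts of 16-bit and 8-bit
; elements have to be emulated; the AVX512BW run has native 512-bit word
; shifts (vpsrlvw) and byte mask registers (vpmovb2m).
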
define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
  %shift = lshr <8 x i64> %a, %b

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
  %shift = lshr <16 x i32> %a, %b

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
  %shift = lshr <32 x i16> %a, %b
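
; Without AVX512BW the v32i16 variable shift above is split into ymm halves,
; zero-extended to i32 (vpmovzxwd), shifted with vpsrlvd, and truncated back
; with vpmovdw; AVX512BW lowers it to a single vpsrlvw.
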
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
  %shift = lshr <64 x i8> %a, %b
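
; There is no variable byte shift, so the v64i8 case above shifts by 4, 2 and
; 1 at word granularity, masks away the bits pulled in from the neighboring
; byte, and selects each partial result per element: vpblendvb keyed off the
; amount (vpsllw $5) on AVX512DQ, vpmovb2m plus merge-masked vmovdqu8 on
; AVX512BW.
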
; Uniform Variable Shifts
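
; Splatted shift amounts can use the count-in-xmm shift forms
; (vpsrlq/vpsrld/vpsrlw), which read only the low scalar of the amount vector.
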
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
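
; For the splatted v64i8 shift above there is still no byte shift: the data is
; shifted at word granularity, and an all-ones vector (vpcmpeqd) is shifted the
; same way to build the mask that clears the bits leaking in from the adjacent
; byte.
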
; Uniform Variable Modulo Shifts
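
; These tests mask the shift amount with bitwidth-1 before splatting; the mask
; survives as a single vpand on the xmm amount and the shift lowering itself
; matches the plain splat-variable cases above.
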
define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
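
; Without AVX512BW the v32i16 constant shift above is lowered to vpmulhuw by
; 2^(16 - amount); the amount-0 lane cannot be expressed that way (it would
; need 65536), so it is blended back from the source. AVX512BW loads the
; amounts from the constant pool and uses vpsrlvw.
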
define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 9, i16 9, i16 9, i16 9, i16 10, i16 10, i16 10, i16 10, i16 11, i16 11, i16 11, i16 11, i16 12, i16 12, i16 12, i16 12, i16 13, i16 13, i16 13, i16 13, i16 14, i16 14, i16 14, i16 14, i16 15, i16 15, i16 15, i16 15>
  ret <32 x i16> %shift

define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15]
; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
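
; When adjacent bytes share a shift amount, the shift can be done at i16
; granularity (vpmulhuw on AVX512DQ, vpsrlvw on AVX512BW) followed by a mask
; that clears the bits shifted across the byte boundary.
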
define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8_quads:
; ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shift = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3>
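
; With four equal amounts per dword, both targets shift at i32 granularity
; with vpsrlvd and mask the result.
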
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT: vpmullw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
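
; For an arbitrary per-byte constant shift the bytes are unpacked against zero
; into i16 lanes, multiplied by 2^(8 - amount) (vpmullw, or shifted with
; vpsllvw on AVX512BW), logically shifted right by 8, and repacked with
; vpackuswb.
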
; Uniform Constant Shifts
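
; Splat-constant shifts use the immediate shift forms; v32i16 and v64i8 still
; need a ymm split on AVX512DQ, and the v64i8 case masks off the bits shifted
; across byte boundaries with a broadcast constant.
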
define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0
  %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL-NEXT: vpsrld $5, %zmm0, %zmm0
  %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>