; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}