1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
11 ; 32-bit runs to make sure we do reasonable things for i64 shifts.
12 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1
13 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- do not hand-edit them; regenerate instead.
; Tests per-element variable shl of <4 x i64>. AVX1 (no vpsllvq) splits the
; 256-bit value into 128-bit halves and emulates the per-element shift with
; two vpsllq ops per half blended via vpblendw; AVX2/XOPAVX2/AVX512 targets
; lower it to a single vpsllvq, and XOPAVX1 uses XOP's vpshlq.
19 define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
20 ; AVX1-LABEL: var_shift_v4i64:
22 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
23 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
24 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
25 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
26 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
27 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
28 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
29 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
30 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
32 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
35 ; AVX2-LABEL: var_shift_v4i64:
37 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
40 ; XOPAVX1-LABEL: var_shift_v4i64:
42 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
43 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
44 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm3, %xmm2
45 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
46 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
49 ; XOPAVX2-LABEL: var_shift_v4i64:
51 ; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
54 ; AVX512-LABEL: var_shift_v4i64:
56 ; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
59 ; AVX512VL-LABEL: var_shift_v4i64:
61 ; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
64 ; X86-AVX1-LABEL: var_shift_v4i64:
66 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
67 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
68 ; X86-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
69 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
70 ; X86-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
71 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
72 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
73 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
74 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
75 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
76 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
79 ; X86-AVX2-LABEL: var_shift_v4i64:
81 ; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
83 %shift = shl <4 x i64> %a, %b
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests per-element variable shl of <8 x i32>. On AVX1 the shift is emulated
; as a multiply: vpslld $23 + vpaddd 1065353216 (0x3F800000) + vcvttps2dq
; appears to build 2^amt per lane via the float exponent field, then vpmulld
; applies it. AVX2/XOPAVX2/AVX512 use a single vpsllvd; XOPAVX1 uses vpshld.
87 define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
88 ; AVX1-LABEL: var_shift_v8i32:
90 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
91 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
92 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
93 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
94 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
95 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
96 ; AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
97 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
98 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
99 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
100 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
101 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
104 ; AVX2-LABEL: var_shift_v8i32:
106 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
109 ; XOPAVX1-LABEL: var_shift_v8i32:
111 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
112 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
113 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm3, %xmm2
114 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
115 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
118 ; XOPAVX2-LABEL: var_shift_v8i32:
120 ; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
123 ; AVX512-LABEL: var_shift_v8i32:
125 ; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
128 ; AVX512VL-LABEL: var_shift_v8i32:
130 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
131 ; AVX512VL-NEXT: retq
133 ; X86-AVX1-LABEL: var_shift_v8i32:
135 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
136 ; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
137 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
138 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
139 ; X86-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
140 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
141 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
142 ; X86-AVX1-NEXT: vpslld $23, %xmm1, %xmm1
143 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
144 ; X86-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
145 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
146 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
147 ; X86-AVX1-NEXT: retl
149 ; X86-AVX2-LABEL: var_shift_v8i32:
151 ; X86-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
152 ; X86-AVX2-NEXT: retl
153 %shift = shl <8 x i32> %a, %b
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests per-element variable shl of <16 x i16>, which has no direct variable
; shift until AVX512BW. AVX1 widens the i16 amounts to i32 and appears to use
; the float-exponent trick (vpslld $23 / vpaddd 0x3F800000 / vcvttps2dq) to
; form 2^amt, packs back with vpackusdw and multiplies via vpmullw. AVX2
; interleaves with zero to do two vpsllvd halves and repacks. AVX512BW(VL)
; uses vpsllvw directly; AVX512DQ(VL) zero-extends to i32 and truncates back.
157 define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
158 ; AVX1-LABEL: var_shift_v16i16:
160 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
161 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
162 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
163 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
164 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
165 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
166 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
167 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
168 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
169 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
170 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
171 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
172 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
173 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
174 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
175 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
176 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
177 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
178 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
179 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
180 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
181 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
182 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
183 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
186 ; AVX2-LABEL: var_shift_v16i16:
188 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
189 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
190 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
191 ; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
192 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
193 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
194 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
195 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
196 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
197 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
200 ; XOPAVX1-LABEL: var_shift_v16i16:
202 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
203 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
204 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm3, %xmm2
205 ; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
206 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
209 ; XOPAVX2-LABEL: var_shift_v16i16:
211 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
212 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
213 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm3, %xmm2
214 ; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
215 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
218 ; AVX512DQ-LABEL: var_shift_v16i16:
220 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
221 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
222 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
223 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
224 ; AVX512DQ-NEXT: retq
226 ; AVX512BW-LABEL: var_shift_v16i16:
228 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
229 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
230 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
231 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
232 ; AVX512BW-NEXT: retq
234 ; AVX512DQVL-LABEL: var_shift_v16i16:
235 ; AVX512DQVL: # %bb.0:
236 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
237 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
238 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
239 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
240 ; AVX512DQVL-NEXT: retq
242 ; AVX512BWVL-LABEL: var_shift_v16i16:
243 ; AVX512BWVL: # %bb.0:
244 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
245 ; AVX512BWVL-NEXT: retq
247 ; X86-AVX1-LABEL: var_shift_v16i16:
249 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
250 ; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7]
251 ; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4
252 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
253 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
254 ; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
255 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
256 ; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
257 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
258 ; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
259 ; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
260 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
261 ; X86-AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
262 ; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7]
263 ; X86-AVX1-NEXT: vpslld $23, %xmm4, %xmm4
264 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
265 ; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
266 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
267 ; X86-AVX1-NEXT: vpslld $23, %xmm1, %xmm1
268 ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
269 ; X86-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
270 ; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
271 ; X86-AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
272 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
273 ; X86-AVX1-NEXT: retl
275 ; X86-AVX2-LABEL: var_shift_v16i16:
277 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
278 ; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
279 ; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
280 ; X86-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
281 ; X86-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
282 ; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
283 ; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
284 ; X86-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
285 ; X86-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
286 ; X86-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
287 ; X86-AVX2-NEXT: retl
288 %shift = shl <16 x i16> %a, %b
289 ret <16 x i16> %shift
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests per-element variable shl of <32 x i8> (no native byte shifts on x86).
; AVX1/AVX2/AVX512DQ(VL) lower it by bit-serial decomposition: vpsllw $5 puts
; the amount bits into the sign-bit position for vpblendvb, then conditional
; shift-by-4 (vpsllw $4 + mask 240), shift-by-2 (vpsllw $2 + mask 252) and
; shift-by-1 (vpaddb) steps are blended in. XOP uses vpshlb directly;
; AVX512BW(VL) zero-extends to i16, uses vpsllvw and truncates with vpmovwb.
292 define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
293 ; AVX1-LABEL: var_shift_v32i8:
295 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
296 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
297 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
298 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
299 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
300 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
301 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
302 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
303 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
304 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
305 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
306 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
307 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
308 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
309 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
310 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
311 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
312 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
313 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
314 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
315 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
316 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
317 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
318 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
319 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
320 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
321 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
324 ; AVX2-LABEL: var_shift_v32i8:
326 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
327 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
328 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
329 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
330 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
331 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
332 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
333 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
334 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
335 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
336 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
339 ; XOPAVX1-LABEL: var_shift_v32i8:
341 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
342 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
343 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm3, %xmm2
344 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
345 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
348 ; XOPAVX2-LABEL: var_shift_v32i8:
350 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
351 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
352 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm3, %xmm2
353 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
354 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
357 ; AVX512DQ-LABEL: var_shift_v32i8:
359 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
360 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
361 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
362 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
363 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
364 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
365 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
366 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
367 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
368 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
369 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
370 ; AVX512DQ-NEXT: retq
372 ; AVX512BW-LABEL: var_shift_v32i8:
374 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
375 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
376 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
377 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
378 ; AVX512BW-NEXT: retq
380 ; AVX512DQVL-LABEL: var_shift_v32i8:
381 ; AVX512DQVL: # %bb.0:
382 ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
383 ; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
384 ; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
385 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
386 ; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
387 ; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
388 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
389 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
390 ; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
391 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
392 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
393 ; AVX512DQVL-NEXT: retq
395 ; AVX512BWVL-LABEL: var_shift_v32i8:
396 ; AVX512BWVL: # %bb.0:
397 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
398 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
399 ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
400 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
401 ; AVX512BWVL-NEXT: retq
403 ; X86-AVX1-LABEL: var_shift_v32i8:
405 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
406 ; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
407 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
408 ; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
409 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
410 ; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
411 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
412 ; X86-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
413 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
414 ; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
415 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
416 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
417 ; X86-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
418 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
419 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
420 ; X86-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
421 ; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
422 ; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
423 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
424 ; X86-AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
425 ; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
426 ; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
427 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
428 ; X86-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
429 ; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
430 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
431 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
432 ; X86-AVX1-NEXT: retl
434 ; X86-AVX2-LABEL: var_shift_v32i8:
436 ; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
437 ; X86-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
438 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
439 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
440 ; X86-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
441 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
442 ; X86-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
443 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
444 ; X86-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
445 ; X86-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
446 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
447 ; X86-AVX2-NEXT: retl
448 %shift = shl <32 x i8> %a, %b
453 ; Uniform Variable Shifts
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests shl of <4 x i64> by a splatted (uniform) amount: all targets use the
; scalar-amount form of vpsllq (shift count taken from the low qword of xmm1),
; with AVX1 splitting the 256-bit vector into two 128-bit halves.
456 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
457 ; AVX1-LABEL: splatvar_shift_v4i64:
459 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
460 ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
461 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
462 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
465 ; AVX2-LABEL: splatvar_shift_v4i64:
467 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
470 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
472 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
473 ; XOPAVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
474 ; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
475 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
478 ; XOPAVX2-LABEL: splatvar_shift_v4i64:
480 ; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
483 ; AVX512-LABEL: splatvar_shift_v4i64:
485 ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
488 ; AVX512VL-LABEL: splatvar_shift_v4i64:
490 ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
491 ; AVX512VL-NEXT: retq
493 ; X86-AVX1-LABEL: splatvar_shift_v4i64:
495 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
496 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
497 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
498 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
499 ; X86-AVX1-NEXT: retl
501 ; X86-AVX2-LABEL: splatvar_shift_v4i64:
503 ; X86-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
504 ; X86-AVX2-NEXT: retl
505 %splat = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer
506 %shift = shl <4 x i64> %a, %splat
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests shl of <8 x i32> by a splatted amount: the shift count is first
; zero-extended with vpmovzxdq (so only element 0 feeds the count register),
; then the scalar-amount vpslld form is used on all targets.
510 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
511 ; AVX1-LABEL: splatvar_shift_v8i32:
513 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
514 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
515 ; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
516 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
517 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
520 ; AVX2-LABEL: splatvar_shift_v8i32:
522 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
523 ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
526 ; XOPAVX1-LABEL: splatvar_shift_v8i32:
528 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
529 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
530 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
531 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
532 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
535 ; XOPAVX2-LABEL: splatvar_shift_v8i32:
537 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
538 ; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
541 ; AVX512-LABEL: splatvar_shift_v8i32:
543 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
544 ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
547 ; AVX512VL-LABEL: splatvar_shift_v8i32:
549 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
550 ; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
551 ; AVX512VL-NEXT: retq
553 ; X86-AVX1-LABEL: splatvar_shift_v8i32:
555 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
556 ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
557 ; X86-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
558 ; X86-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
559 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
560 ; X86-AVX1-NEXT: retl
562 ; X86-AVX2-LABEL: splatvar_shift_v8i32:
564 ; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
565 ; X86-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
566 ; X86-AVX2-NEXT: retl
567 %splat = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer
568 %shift = shl <8 x i32> %a, %splat
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests shl of <16 x i16> by a splatted amount: the count is zero-extended
; with vpmovzxwq and the scalar-amount vpsllw form is used on all targets,
; with AVX1 again splitting the 256-bit vector into two 128-bit halves.
572 define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
573 ; AVX1-LABEL: splatvar_shift_v16i16:
575 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
576 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
577 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
578 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
579 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
582 ; AVX2-LABEL: splatvar_shift_v16i16:
584 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
585 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
588 ; XOPAVX1-LABEL: splatvar_shift_v16i16:
590 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
591 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
592 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
593 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
594 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
597 ; XOPAVX2-LABEL: splatvar_shift_v16i16:
599 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
600 ; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
603 ; AVX512-LABEL: splatvar_shift_v16i16:
605 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
606 ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
609 ; AVX512VL-LABEL: splatvar_shift_v16i16:
611 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
612 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
613 ; AVX512VL-NEXT: retq
615 ; X86-AVX1-LABEL: splatvar_shift_v16i16:
617 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
618 ; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
619 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
620 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
621 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
622 ; X86-AVX1-NEXT: retl
624 ; X86-AVX2-LABEL: splatvar_shift_v16i16:
626 ; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
627 ; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
628 ; X86-AVX2-NEXT: retl
629 %splat = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer
630 %shift = shl <16 x i16> %a, %splat
631 ret <16 x i16> %shift
; NOTE(review): autogenerated CHECK lines -- regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
; Tests shl of <32 x i8> by a splatted amount. AVX1/AVX2/AVX512DQ(VL) shift
; as i16 lanes (vpsllw with a vpmovzxbq-extended count) and then mask off the
; bits that leaked across byte boundaries, building the mask by shifting an
; all-ones register (vpcmpeqd) and broadcasting its low byte. XOP broadcasts
; the amount and uses vpshlb; AVX512BW(VL) widens to i16, uses vpsllvw and
; truncates back with vpmovwb.
634 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
635 ; AVX1-LABEL: splatvar_shift_v32i8:
637 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
638 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
639 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
640 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
641 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
642 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
643 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
644 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
645 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
646 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
647 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
650 ; AVX2-LABEL: splatvar_shift_v32i8:
652 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
653 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
654 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
655 ; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
656 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
657 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
660 ; XOPAVX1-LABEL: splatvar_shift_v32i8:
662 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
663 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
664 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
665 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
666 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
667 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
670 ; XOPAVX2-LABEL: splatvar_shift_v32i8:
672 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
673 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
674 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
675 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
676 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
679 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
681 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
682 ; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
683 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
684 ; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1
685 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
686 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
687 ; AVX512DQ-NEXT: retq
689 ; AVX512BW-LABEL: splatvar_shift_v32i8:
691 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
692 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
693 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
694 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
695 ; AVX512BW-NEXT: retq
697 ; AVX512DQVL-LABEL: splatvar_shift_v32i8:
698 ; AVX512DQVL: # %bb.0:
699 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
700 ; AVX512DQVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
701 ; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
702 ; AVX512DQVL-NEXT: vpsllw %xmm1, %xmm2, %xmm1
703 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
704 ; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
705 ; AVX512DQVL-NEXT: retq
707 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
708 ; AVX512BWVL: # %bb.0:
709 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
710 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
711 ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0
712 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
713 ; AVX512BWVL-NEXT: retq
715 ; X86-AVX1-LABEL: splatvar_shift_v32i8:
717 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
718 ; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
719 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
720 ; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
721 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
722 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
723 ; X86-AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
724 ; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
725 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
726 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
727 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
728 ; X86-AVX1-NEXT: retl
730 ; X86-AVX2-LABEL: splatvar_shift_v32i8:
732 ; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
733 ; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
734 ; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
735 ; X86-AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
736 ; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
737 ; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
738 ; X86-AVX2-NEXT: retl
739 %splat = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer
740 %shift = shl <32 x i8> %a, %splat
745 ; Uniform Variable Modulo Shifts
748 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
749 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
751 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
752 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
753 ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
754 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
755 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
758 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
760 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
761 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
764 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
766 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
767 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
768 ; XOPAVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
769 ; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
770 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
773 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
775 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
776 ; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
779 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
781 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
782 ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
785 ; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
787 ; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
788 ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
789 ; AVX512VL-NEXT: retq
791 ; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
793 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
794 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
795 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
796 ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
797 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
798 ; X86-AVX1-NEXT: retl
800 ; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
802 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
803 ; X86-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
804 ; X86-AVX2-NEXT: retl
805 %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
806 %splat = shufflevector <4 x i64> %mod, <4 x i64> poison, <4 x i32> zeroinitializer
807 %shift = shl <4 x i64> %a, %splat
811 define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
812 ; AVX1-LABEL: splatvar_modulo_shift_v8i32:
814 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
815 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
816 ; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
817 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
818 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
821 ; AVX2-LABEL: splatvar_modulo_shift_v8i32:
823 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
824 ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
827 ; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32:
829 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
830 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
831 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
832 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
833 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
836 ; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32:
838 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
839 ; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
842 ; AVX512-LABEL: splatvar_modulo_shift_v8i32:
844 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
845 ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
848 ; AVX512VL-LABEL: splatvar_modulo_shift_v8i32:
850 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
851 ; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
852 ; AVX512VL-NEXT: retq
854 ; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32:
856 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
857 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
858 ; X86-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
859 ; X86-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
860 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
861 ; X86-AVX1-NEXT: retl
863 ; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32:
865 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
866 ; X86-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
867 ; X86-AVX2-NEXT: retl
868 %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
869 %splat = shufflevector <8 x i32> %mod, <8 x i32> poison, <8 x i32> zeroinitializer
870 %shift = shl <8 x i32> %a, %splat
874 define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
875 ; AVX1-LABEL: splatvar_modulo_shift_v16i16:
877 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
878 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
879 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
880 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
881 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
884 ; AVX2-LABEL: splatvar_modulo_shift_v16i16:
886 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
887 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
890 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16:
892 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
893 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
894 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
895 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
896 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
899 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16:
901 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
902 ; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
905 ; AVX512-LABEL: splatvar_modulo_shift_v16i16:
907 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
908 ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
911 ; AVX512VL-LABEL: splatvar_modulo_shift_v16i16:
913 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
914 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
915 ; AVX512VL-NEXT: retq
917 ; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16:
919 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
920 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
921 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
922 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
923 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
924 ; X86-AVX1-NEXT: retl
926 ; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16:
928 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
929 ; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
930 ; X86-AVX2-NEXT: retl
931 %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
932 %splat = shufflevector <16 x i16> %mod, <16 x i16> poison, <16 x i32> zeroinitializer
933 %shift = shl <16 x i16> %a, %splat
934 ret <16 x i16> %shift
937 define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
938 ; AVX1-LABEL: splatvar_modulo_shift_v32i8:
940 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
941 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
942 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
943 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
944 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
945 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
946 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
947 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
948 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
949 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
950 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
953 ; AVX2-LABEL: splatvar_modulo_shift_v32i8:
955 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
956 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
957 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
958 ; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
959 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
960 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
963 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
965 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
966 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
967 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
968 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
969 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
970 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
971 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
974 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
976 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
977 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
978 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
979 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
980 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
981 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
984 ; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8:
986 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
987 ; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
988 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
989 ; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1
990 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
991 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
992 ; AVX512DQ-NEXT: retq
994 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
996 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
997 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
998 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
999 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
1000 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1001 ; AVX512BW-NEXT: retq
1003 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8:
1004 ; AVX512DQVL: # %bb.0:
1005 ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1006 ; AVX512DQVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1007 ; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1008 ; AVX512DQVL-NEXT: vpsllw %xmm1, %xmm2, %xmm1
1009 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
1010 ; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
1011 ; AVX512DQVL-NEXT: retq
1013 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
1014 ; AVX512BWVL: # %bb.0:
1015 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1016 ; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
1017 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1018 ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0
1019 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
1020 ; AVX512BWVL-NEXT: retq
1022 ; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8:
1023 ; X86-AVX1: # %bb.0:
1024 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1025 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1026 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
1027 ; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
1028 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
1029 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
1030 ; X86-AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
1031 ; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1032 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
1033 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1034 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1035 ; X86-AVX1-NEXT: retl
1037 ; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8:
1038 ; X86-AVX2: # %bb.0:
1039 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1040 ; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1041 ; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1042 ; X86-AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
1043 ; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
1044 ; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1045 ; X86-AVX2-NEXT: retl
1046 %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
1047 %splat = shufflevector <32 x i8> %mod, <32 x i8> poison, <32 x i32> zeroinitializer
1048 %shift = shl <32 x i8> %a, %splat
1049 ret <32 x i8> %shift
1056 define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
1057 ; AVX1-LABEL: constant_shift_v4i64:
1059 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1060 ; AVX1-NEXT: vpsllq $62, %xmm1, %xmm2
1061 ; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
1062 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1063 ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2
1064 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1065 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1066 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1069 ; AVX2-LABEL: constant_shift_v4i64:
1071 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1074 ; XOPAVX1-LABEL: constant_shift_v4i64:
1076 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1077 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1078 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1079 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1080 ; XOPAVX1-NEXT: retq
1082 ; XOPAVX2-LABEL: constant_shift_v4i64:
1084 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1085 ; XOPAVX2-NEXT: retq
1087 ; AVX512-LABEL: constant_shift_v4i64:
1089 ; AVX512-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1092 ; AVX512VL-LABEL: constant_shift_v4i64:
1093 ; AVX512VL: # %bb.0:
1094 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1095 ; AVX512VL-NEXT: retq
1097 ; X86-AVX1-LABEL: constant_shift_v4i64:
1098 ; X86-AVX1: # %bb.0:
1099 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1100 ; X86-AVX1-NEXT: vpsllq $62, %xmm1, %xmm2
1101 ; X86-AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
1102 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1103 ; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm2
1104 ; X86-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1105 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1106 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1107 ; X86-AVX1-NEXT: retl
1109 ; X86-AVX2-LABEL: constant_shift_v4i64:
1110 ; X86-AVX2: # %bb.0:
1111 ; X86-AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1112 ; X86-AVX2-NEXT: retl
1113 %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
1114 ret <4 x i64> %shift
1117 define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
1118 ; AVX1-LABEL: constant_shift_v8i32:
1120 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1121 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1122 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1123 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1126 ; AVX2-LABEL: constant_shift_v8i32:
1128 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1131 ; XOPAVX1-LABEL: constant_shift_v8i32:
1133 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1134 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1135 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1136 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1137 ; XOPAVX1-NEXT: retq
1139 ; XOPAVX2-LABEL: constant_shift_v8i32:
1141 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1142 ; XOPAVX2-NEXT: retq
1144 ; AVX512-LABEL: constant_shift_v8i32:
1146 ; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1149 ; AVX512VL-LABEL: constant_shift_v8i32:
1150 ; AVX512VL: # %bb.0:
1151 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1152 ; AVX512VL-NEXT: retq
1154 ; X86-AVX1-LABEL: constant_shift_v8i32:
1155 ; X86-AVX1: # %bb.0:
1156 ; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
1157 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1158 ; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1159 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1160 ; X86-AVX1-NEXT: retl
1162 ; X86-AVX2-LABEL: constant_shift_v8i32:
1163 ; X86-AVX2: # %bb.0:
1164 ; X86-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1165 ; X86-AVX2-NEXT: retl
1166 %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
1167 ret <8 x i32> %shift
1170 define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
1171 ; AVX1-LABEL: constant_shift_v16i16:
1173 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,2,4,8,16,32,64,128]
1174 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1175 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,512,1024,2048,4096,8192,16384,32768]
1176 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1179 ; AVX2-LABEL: constant_shift_v16i16:
1181 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1184 ; XOPAVX1-LABEL: constant_shift_v16i16:
1186 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1187 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1188 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1189 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1190 ; XOPAVX1-NEXT: retq
1192 ; XOPAVX2-LABEL: constant_shift_v16i16:
1194 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1195 ; XOPAVX2-NEXT: retq
1197 ; AVX512DQ-LABEL: constant_shift_v16i16:
1198 ; AVX512DQ: # %bb.0:
1199 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1200 ; AVX512DQ-NEXT: retq
1202 ; AVX512BW-LABEL: constant_shift_v16i16:
1203 ; AVX512BW: # %bb.0:
1204 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1205 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1206 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1207 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1208 ; AVX512BW-NEXT: retq
1210 ; AVX512DQVL-LABEL: constant_shift_v16i16:
1211 ; AVX512DQVL: # %bb.0:
1212 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1213 ; AVX512DQVL-NEXT: retq
1215 ; AVX512BWVL-LABEL: constant_shift_v16i16:
1216 ; AVX512BWVL: # %bb.0:
1217 ; AVX512BWVL-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1218 ; AVX512BWVL-NEXT: retq
1220 ; X86-AVX1-LABEL: constant_shift_v16i16:
1221 ; X86-AVX1: # %bb.0:
1222 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [1,2,4,8,16,32,64,128]
1223 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1224 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,512,1024,2048,4096,8192,16384,32768]
1225 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1226 ; X86-AVX1-NEXT: retl
1228 ; X86-AVX2-LABEL: constant_shift_v16i16:
1229 ; X86-AVX2: # %bb.0:
1230 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1231 ; X86-AVX2-NEXT: retl
1232 %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1233 ret <16 x i16> %shift
1236 define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind {
1237 ; AVX1-LABEL: constant_shift_v16i16_pairs:
1239 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,4,8,8,1,1,2,2]
1240 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1241 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,64,128,128,16,16,32,32]
1242 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1245 ; AVX2-LABEL: constant_shift_v16i16_pairs:
1247 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1248 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1251 ; XOPAVX1-LABEL: constant_shift_v16i16_pairs:
1253 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1254 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1255 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1256 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1257 ; XOPAVX1-NEXT: retq
1259 ; XOPAVX2-LABEL: constant_shift_v16i16_pairs:
1261 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [4,4,8,8,1,1,2,2,64,64,128,128,16,16,32,32]
1262 ; XOPAVX2-NEXT: retq
1264 ; AVX512DQ-LABEL: constant_shift_v16i16_pairs:
1265 ; AVX512DQ: # %bb.0:
1266 ; AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1267 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1268 ; AVX512DQ-NEXT: retq
1270 ; AVX512BW-LABEL: constant_shift_v16i16_pairs:
1271 ; AVX512BW: # %bb.0:
1272 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1273 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,3,3,0,0,1,1,6,6,7,7,4,4,5,5]
1274 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1275 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1276 ; AVX512BW-NEXT: retq
1278 ; AVX512DQVL-LABEL: constant_shift_v16i16_pairs:
1279 ; AVX512DQVL: # %bb.0:
1280 ; AVX512DQVL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1281 ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1282 ; AVX512DQVL-NEXT: retq
1284 ; AVX512BWVL-LABEL: constant_shift_v16i16_pairs:
1285 ; AVX512BWVL: # %bb.0:
1286 ; AVX512BWVL-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1287 ; AVX512BWVL-NEXT: retq
1289 ; X86-AVX1-LABEL: constant_shift_v16i16_pairs:
1290 ; X86-AVX1: # %bb.0:
1291 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [4,4,8,8,1,1,2,2]
1292 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1293 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [64,64,128,128,16,16,32,32]
1294 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1295 ; X86-AVX1-NEXT: retl
1297 ; X86-AVX2-LABEL: constant_shift_v16i16_pairs:
1298 ; X86-AVX2: # %bb.0:
1299 ; X86-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1300 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1301 ; X86-AVX2-NEXT: retl
1302 %shift = shl <16 x i16> %a, <i16 2, i16 2, i16 3, i16 3, i16 0, i16 0, i16 1, i16 1, i16 6, i16 6, i16 7, i16 7, i16 4, i16 4, i16 5, i16 5>
1303 ret <16 x i16> %shift
1306 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
1307 ; AVX1-LABEL: constant_shift_v32i8:
1309 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1310 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
1311 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm3
1312 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1313 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1314 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1315 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
1316 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
1317 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
1318 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm2
1319 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1320 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
1321 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
1322 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1323 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1326 ; AVX2-LABEL: constant_shift_v32i8:
1328 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1329 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
1330 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1331 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1332 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1335 ; XOPAVX1-LABEL: constant_shift_v32i8:
1337 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1338 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1339 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1340 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1341 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1342 ; XOPAVX1-NEXT: retq
1344 ; XOPAVX2-LABEL: constant_shift_v32i8:
1346 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1347 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1348 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1349 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1350 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1351 ; XOPAVX2-NEXT: retq
1353 ; AVX512DQ-LABEL: constant_shift_v32i8:
1354 ; AVX512DQ: # %bb.0:
1355 ; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1356 ; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1
1357 ; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1358 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1359 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
1360 ; AVX512DQ-NEXT: retq
1362 ; AVX512BW-LABEL: constant_shift_v32i8:
1363 ; AVX512BW: # %bb.0:
1364 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1365 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1366 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1367 ; AVX512BW-NEXT: retq
1369 ; AVX512DQVL-LABEL: constant_shift_v32i8:
1370 ; AVX512DQVL: # %bb.0:
1371 ; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1372 ; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1373 ; AVX512DQVL-NEXT: vpsllw $8, %ymm0, %ymm0
1374 ; AVX512DQVL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
1375 ; AVX512DQVL-NEXT: retq
1377 ; AVX512BWVL-LABEL: constant_shift_v32i8:
1378 ; AVX512BWVL: # %bb.0:
1379 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1380 ; AVX512BWVL-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1381 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
1382 ; AVX512BWVL-NEXT: retq
1384 ; X86-AVX1-LABEL: constant_shift_v32i8:
1385 ; X86-AVX1: # %bb.0:
1386 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1387 ; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
1388 ; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm3
1389 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1390 ; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1391 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1392 ; X86-AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
1393 ; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
1394 ; X86-AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
1395 ; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm2
1396 ; X86-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1397 ; X86-AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
1398 ; X86-AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
1399 ; X86-AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1400 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1401 ; X86-AVX1-NEXT: retl
1403 ; X86-AVX2-LABEL: constant_shift_v32i8:
1404 ; X86-AVX2: # %bb.0:
1405 ; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
1406 ; X86-AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
1407 ; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1408 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1409 ; X86-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1410 ; X86-AVX2-NEXT: retl
1411 %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1412 ret <32 x i8> %shift
1415 define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind {
1416 ; AVX1-LABEL: constant_shift_v32i8_pairs:
1418 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [8,128,64,4,128,1,128,2]
1419 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1420 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1421 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32,1,16,128,64,2,16,1]
1422 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1423 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1426 ; AVX2-LABEL: constant_shift_v32i8_pairs:
1428 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
1429 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1432 ; XOPAVX1-LABEL: constant_shift_v32i8_pairs:
1434 ; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1435 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1436 ; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1437 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1438 ; XOPAVX1-NEXT: retq
1440 ; XOPAVX2-LABEL: constant_shift_v32i8_pairs:
1442 ; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1443 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1444 ; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1445 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1446 ; XOPAVX2-NEXT: retq
1448 ; AVX512DQ-LABEL: constant_shift_v32i8_pairs:
1449 ; AVX512DQ: # %bb.0:
1450 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
1451 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1452 ; AVX512DQ-NEXT: retq
1454 ; AVX512BW-LABEL: constant_shift_v32i8_pairs:
1455 ; AVX512BW: # %bb.0:
1456 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1457 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,7,6,2,7,0,7,1,5,0,4,7,6,1,4,0]
1458 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1459 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1460 ; AVX512BW-NEXT: retq
1462 ; AVX512DQVL-LABEL: constant_shift_v32i8_pairs:
1463 ; AVX512DQVL: # %bb.0:
1464 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
1465 ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1466 ; AVX512DQVL-NEXT: retq
1468 ; AVX512BWVL-LABEL: constant_shift_v32i8_pairs:
1469 ; AVX512BWVL: # %bb.0:
1470 ; AVX512BWVL-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1471 ; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1472 ; AVX512BWVL-NEXT: retq
1474 ; X86-AVX1-LABEL: constant_shift_v32i8_pairs:
1475 ; X86-AVX1: # %bb.0:
1476 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [8,128,64,4,128,1,128,2]
1477 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1478 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1479 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [32,1,16,128,64,2,16,1]
1480 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1481 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1482 ; X86-AVX1-NEXT: retl
1484 ; X86-AVX2-LABEL: constant_shift_v32i8_pairs:
1485 ; X86-AVX2: # %bb.0:
1486 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
1487 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1488 ; X86-AVX2-NEXT: retl
1489 %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 7, i8 7, i8 6, i8 6, i8 2, i8 2, i8 7, i8 7, i8 0, i8 0, i8 7, i8 7, i8 1, i8 1, i8 5, i8 5, i8 0, i8 0, i8 4, i8 4, i8 7, i8 7, i8 6, i8 6, i8 1, i8 1, i8 4, i8 4, i8 0, i8 0>
1490 ret <32 x i8> %shift
1493 define <32 x i8> @constant_shift_v32i8_quads(<32 x i8> %a) nounwind {
1494 ; AVX1-LABEL: constant_shift_v32i8_quads:
1496 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,4,8,8,1,1,2,2]
1497 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1498 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1499 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,64,128,128,16,16,32,32]
1500 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1501 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1504 ; AVX2-LABEL: constant_shift_v32i8_quads:
1506 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1507 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1510 ; XOPAVX1-LABEL: constant_shift_v32i8_quads:
1512 ; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1513 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1514 ; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1515 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1516 ; XOPAVX1-NEXT: retq
1518 ; XOPAVX2-LABEL: constant_shift_v32i8_quads:
1520 ; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1521 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1522 ; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1523 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1524 ; XOPAVX2-NEXT: retq
1526 ; AVX512-LABEL: constant_shift_v32i8_quads:
1528 ; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1529 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1532 ; AVX512VL-LABEL: constant_shift_v32i8_quads:
1533 ; AVX512VL: # %bb.0:
1534 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1535 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1536 ; AVX512VL-NEXT: retq
1538 ; X86-AVX1-LABEL: constant_shift_v32i8_quads:
1539 ; X86-AVX1: # %bb.0:
1540 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [4,4,8,8,1,1,2,2]
1541 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1542 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1543 ; X86-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [64,64,128,128,16,16,32,32]
1544 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1545 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1546 ; X86-AVX1-NEXT: retl
1548 ; X86-AVX2-LABEL: constant_shift_v32i8_quads:
1549 ; X86-AVX2: # %bb.0:
1550 ; X86-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1551 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1552 ; X86-AVX2-NEXT: retl
1553 %shift = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5>
1554 ret <32 x i8> %shift
1558 ; Uniform Constant Shifts
; Uniform (splat) shl of <4 x i64> by 7. Expect a single immediate quadword
; shift (vpsllq $7) on 256-bit targets; AVX1/X86-AVX1 split into two 128-bit
; halves via vextractf128/vinsertf128 since they lack 256-bit integer shifts.
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsllq $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
; Uniform (splat) shl of <8 x i32> by 5. Expect a single immediate dword shift
; (vpslld $5) on 256-bit targets; AVX1/X86-AVX1 split into two 128-bit halves.
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2-NEXT: vpslld $5, %ymm0, %ymm0
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1-NEXT: vpslld $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpslld $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2-NEXT: vpslld $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpslld $5, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpslld $5, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpslld $5, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
; Uniform (splat) shl of <16 x i16> by 3. Expect a single immediate word shift
; (vpsllw $3) on 256-bit targets; AVX1/X86-AVX1 split into two 128-bit halves.
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
; Uniform (splat) shl of <32 x i8> by 3. x86 has no byte-granular shift, so
; most targets emit a word shift (vpsllw $3) followed by vpand with 0xF8 (248)
; to clear the bits that crossed into the adjacent byte lane; XOPAVX1 instead
; uses the native per-byte shift vpshlb with a splatted shift-amount vector.
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
1789 define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
1790 ; AVX1-LABEL: shift32_v4i64:
1792 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1793 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
1794 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1797 ; AVX2-LABEL: shift32_v4i64:
1799 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
1802 ; XOPAVX1-LABEL: shift32_v4i64:
1804 ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1805 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
1806 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1807 ; XOPAVX1-NEXT: retq
1809 ; XOPAVX2-LABEL: shift32_v4i64:
1811 ; XOPAVX2-NEXT: vpsllq $32, %ymm0, %ymm0
1812 ; XOPAVX2-NEXT: retq
1814 ; AVX512-LABEL: shift32_v4i64:
1816 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
1819 ; AVX512VL-LABEL: shift32_v4i64:
1820 ; AVX512VL: # %bb.0:
1821 ; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0
1822 ; AVX512VL-NEXT: retq
1824 ; X86-AVX1-LABEL: shift32_v4i64:
1825 ; X86-AVX1: # %bb.0:
1826 ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1827 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
1828 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1829 ; X86-AVX1-NEXT: retl
1831 ; X86-AVX2-LABEL: shift32_v4i64:
1832 ; X86-AVX2: # %bb.0:
1833 ; X86-AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
1834 ; X86-AVX2-NEXT: retl
1835 %shift = shl <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
1836 ret <4 x i64> %shift