1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
13 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
; Per-element variable shl of <2 x i32>. The checks show the shift amounts
; being masked (pand) or zeroed into alternating lanes (blend against zero)
; and the data shifted with 64-bit shifts (psllq/vpsllvq/vpshlq) -- consistent
; with the i32 elements living in widened 64-bit lanes after legalization.
; NOTE(review): the assertion bodies are autogenerated (see file header);
; regenerate with utils/update_llc_test_checks.py rather than hand-editing.
20 define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
21 ; SSE2-LABEL: var_shift_v2i32:
23 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
24 ; SSE2-NEXT: movdqa %xmm0, %xmm2
25 ; SSE2-NEXT: psllq %xmm1, %xmm2
26 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
27 ; SSE2-NEXT: psllq %xmm1, %xmm0
28 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
31 ; SSE41-LABEL: var_shift_v2i32:
33 ; SSE41-NEXT: pxor %xmm2, %xmm2
34 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
35 ; SSE41-NEXT: movdqa %xmm0, %xmm1
36 ; SSE41-NEXT: psllq %xmm2, %xmm1
37 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
38 ; SSE41-NEXT: psllq %xmm2, %xmm0
39 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
42 ; AVX1-LABEL: var_shift_v2i32:
44 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
45 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
46 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
47 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
48 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
49 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
52 ; AVX2-LABEL: var_shift_v2i32:
54 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
55 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
56 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
59 ; XOPAVX1-LABEL: var_shift_v2i32:
61 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
62 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
63 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
66 ; XOPAVX2-LABEL: var_shift_v2i32:
68 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
69 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
70 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
73 ; AVX512-LABEL: var_shift_v2i32:
75 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
76 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
77 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
80 ; AVX512VL-LABEL: var_shift_v2i32:
82 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
83 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
84 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
87 ; X32-SSE-LABEL: var_shift_v2i32:
89 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
90 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
91 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
92 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
93 ; X32-SSE-NEXT: xorps %xmm3, %xmm3
94 ; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
95 ; X32-SSE-NEXT: psllq %xmm3, %xmm0
96 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
98 %shift = shl <2 x i32> %a, %b
; Per-element variable shl of <4 x i16>. Pre-AVX2 lowerings materialize
; (1 << amt) in each 32-bit lane via the float-exponent trick (pslld $23,
; paddd bias, cvttps2dq) and then multiply (pmuludq/pmulld); AVX2/XOP/AVX512
; zero the odd 16-bit lanes of the amount and use a variable 32-bit shift
; (vpsllvd, or vpshld on XOP).
102 define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
103 ; SSE2-LABEL: var_shift_v4i16:
105 ; SSE2-NEXT: pslld $23, %xmm1
106 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
107 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
108 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
109 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
110 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
111 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
112 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
113 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
114 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
117 ; SSE41-LABEL: var_shift_v4i16:
119 ; SSE41-NEXT: pslld $23, %xmm1
120 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
121 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
122 ; SSE41-NEXT: pmulld %xmm1, %xmm0
125 ; AVX1-LABEL: var_shift_v4i16:
127 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
128 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
129 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
130 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
133 ; AVX2-LABEL: var_shift_v4i16:
135 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
136 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
137 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
140 ; XOPAVX1-LABEL: var_shift_v4i16:
142 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
143 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
144 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
147 ; XOPAVX2-LABEL: var_shift_v4i16:
149 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
150 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
151 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
154 ; AVX512-LABEL: var_shift_v4i16:
156 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
157 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
158 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
161 ; AVX512VL-LABEL: var_shift_v4i16:
163 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
164 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
165 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
166 ; AVX512VL-NEXT: retq
168 ; X32-SSE-LABEL: var_shift_v4i16:
170 ; X32-SSE-NEXT: pslld $23, %xmm1
171 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
172 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
173 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
174 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
175 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
176 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
177 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
178 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
179 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
181 %shift = shl <4 x i16> %a, %b
; Per-element variable shl of <2 x i16>. The two elements are shifted in
; 64-bit lanes: the amounts are masked (pand) or blended against zero, then
; shifted with psllq/vpsllvq (XOPAVX1 uses vpshlq).
185 define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
186 ; SSE2-LABEL: var_shift_v2i16:
188 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
189 ; SSE2-NEXT: movdqa %xmm0, %xmm2
190 ; SSE2-NEXT: psllq %xmm1, %xmm2
191 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
192 ; SSE2-NEXT: psllq %xmm1, %xmm0
193 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
196 ; SSE41-LABEL: var_shift_v2i16:
198 ; SSE41-NEXT: pxor %xmm2, %xmm2
199 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
200 ; SSE41-NEXT: movdqa %xmm0, %xmm1
201 ; SSE41-NEXT: psllq %xmm2, %xmm1
202 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
203 ; SSE41-NEXT: psllq %xmm2, %xmm0
204 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
207 ; AVX1-LABEL: var_shift_v2i16:
209 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
210 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
211 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
212 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
213 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
214 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
217 ; AVX2-LABEL: var_shift_v2i16:
219 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
220 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
221 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
224 ; XOPAVX1-LABEL: var_shift_v2i16:
226 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
227 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
228 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
231 ; XOPAVX2-LABEL: var_shift_v2i16:
233 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
234 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
235 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
238 ; AVX512-LABEL: var_shift_v2i16:
240 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
241 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
242 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
245 ; AVX512VL-LABEL: var_shift_v2i16:
247 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
248 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
249 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
250 ; AVX512VL-NEXT: retq
252 ; X32-SSE-LABEL: var_shift_v2i16:
254 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
255 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
256 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
257 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
258 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
259 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
261 %shift = shl <2 x i16> %a, %b
; Per-element variable shl of <8 x i8>. SSE lowerings zero-extend the byte
; amounts into 32-bit lanes, build (1 << amt) with the float-exponent trick
; (pslld $23, paddd 1065353216-bias, cvttps2dq), repack, and multiply with
; pmullw. AVX2/AVX512DQ widen to 32-bit lanes for vpsllvd and narrow the
; result back; XOP uses vpshlw on masked amounts; AVX512BW uses a native
; 16-bit variable shift (vpsllvw).
265 define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
266 ; SSE2-LABEL: var_shift_v8i8:
268 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
269 ; SSE2-NEXT: pxor %xmm2, %xmm2
270 ; SSE2-NEXT: movdqa %xmm1, %xmm3
271 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
272 ; SSE2-NEXT: pslld $23, %xmm3
273 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
274 ; SSE2-NEXT: paddd %xmm4, %xmm3
275 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
276 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
277 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
278 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
279 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
280 ; SSE2-NEXT: pslld $23, %xmm1
281 ; SSE2-NEXT: paddd %xmm4, %xmm1
282 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
283 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
284 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
285 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
286 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
287 ; SSE2-NEXT: pmullw %xmm1, %xmm0
290 ; SSE41-LABEL: var_shift_v8i8:
292 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
293 ; SSE41-NEXT: pand %xmm1, %xmm2
294 ; SSE41-NEXT: pxor %xmm3, %xmm3
295 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
296 ; SSE41-NEXT: pslld $23, %xmm1
297 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
298 ; SSE41-NEXT: paddd %xmm3, %xmm1
299 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
300 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
301 ; SSE41-NEXT: pslld $23, %xmm2
302 ; SSE41-NEXT: paddd %xmm3, %xmm2
303 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
304 ; SSE41-NEXT: packusdw %xmm1, %xmm2
305 ; SSE41-NEXT: pmullw %xmm2, %xmm0
308 ; AVX1-LABEL: var_shift_v8i8:
310 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
311 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
312 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
313 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
314 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
315 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
316 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
317 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
318 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
319 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
320 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
321 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
324 ; AVX2-LABEL: var_shift_v8i8:
326 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
327 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
328 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
329 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
330 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
331 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
332 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
333 ; AVX2-NEXT: vzeroupper
336 ; XOP-LABEL: var_shift_v8i8:
338 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
339 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
342 ; AVX512DQ-LABEL: var_shift_v8i8:
344 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
345 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
346 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
347 ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
348 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
349 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
350 ; AVX512DQ-NEXT: vzeroupper
351 ; AVX512DQ-NEXT: retq
353 ; AVX512BW-LABEL: var_shift_v8i8:
355 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
356 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
357 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
358 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
359 ; AVX512BW-NEXT: vzeroupper
360 ; AVX512BW-NEXT: retq
362 ; AVX512DQVL-LABEL: var_shift_v8i8:
363 ; AVX512DQVL: # %bb.0:
364 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
365 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
366 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
367 ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
368 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
369 ; AVX512DQVL-NEXT: vzeroupper
370 ; AVX512DQVL-NEXT: retq
372 ; AVX512BWVL-LABEL: var_shift_v8i8:
373 ; AVX512BWVL: # %bb.0:
374 ; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
375 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
376 ; AVX512BWVL-NEXT: retq
378 ; X32-SSE-LABEL: var_shift_v8i8:
380 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
381 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
382 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
383 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
384 ; X32-SSE-NEXT: pslld $23, %xmm3
385 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
386 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
387 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
388 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
389 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
390 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
391 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
392 ; X32-SSE-NEXT: pslld $23, %xmm1
393 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
394 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
395 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
396 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
397 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
398 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
399 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
401 %shift = shl <8 x i8> %a, %b
; Per-element variable shl of <4 x i8>. The byte amounts are masked (pand)
; and the shift happens in 32-bit lanes: the float-exponent trick plus
; pmuludq/pmulld on SSE, vpsllvd on AVX2/AVX512, vpshld on XOPAVX1.
405 define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
406 ; SSE2-LABEL: var_shift_v4i8:
408 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
409 ; SSE2-NEXT: pslld $23, %xmm1
410 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
411 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
412 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
413 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
414 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
415 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
416 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
417 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
418 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
421 ; SSE41-LABEL: var_shift_v4i8:
423 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
424 ; SSE41-NEXT: pslld $23, %xmm1
425 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
426 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
427 ; SSE41-NEXT: pmulld %xmm1, %xmm0
430 ; AVX1-LABEL: var_shift_v4i8:
432 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
433 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
434 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
435 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
436 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
439 ; AVX2-LABEL: var_shift_v4i8:
441 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
442 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
445 ; XOPAVX1-LABEL: var_shift_v4i8:
447 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
448 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
451 ; XOPAVX2-LABEL: var_shift_v4i8:
453 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
454 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
457 ; AVX512-LABEL: var_shift_v4i8:
459 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
460 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
463 ; AVX512VL-LABEL: var_shift_v4i8:
465 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
466 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
467 ; AVX512VL-NEXT: retq
469 ; X32-SSE-LABEL: var_shift_v4i8:
471 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
472 ; X32-SSE-NEXT: pslld $23, %xmm1
473 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
474 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
475 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
476 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
477 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
478 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
479 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
480 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
481 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
483 %shift = shl <4 x i8> %a, %b
; Per-element variable shl of <2 x i8>. The amounts are masked (pand) and the
; two elements are shifted in 64-bit lanes (psllq/vpsllvq; XOPAVX1 uses
; vpshlq), with the two halves recombined by movsd/pblendw.
487 define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
488 ; SSE2-LABEL: var_shift_v2i8:
490 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
491 ; SSE2-NEXT: movdqa %xmm0, %xmm2
492 ; SSE2-NEXT: psllq %xmm1, %xmm2
493 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
494 ; SSE2-NEXT: psllq %xmm1, %xmm0
495 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
498 ; SSE41-LABEL: var_shift_v2i8:
500 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
501 ; SSE41-NEXT: movdqa %xmm0, %xmm2
502 ; SSE41-NEXT: psllq %xmm1, %xmm2
503 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
504 ; SSE41-NEXT: psllq %xmm1, %xmm0
505 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
508 ; AVX1-LABEL: var_shift_v2i8:
510 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
511 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
512 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
513 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
514 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
517 ; AVX2-LABEL: var_shift_v2i8:
519 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
520 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
523 ; XOPAVX1-LABEL: var_shift_v2i8:
525 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
526 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
529 ; XOPAVX2-LABEL: var_shift_v2i8:
531 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
532 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
535 ; AVX512-LABEL: var_shift_v2i8:
537 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
538 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
541 ; AVX512VL-LABEL: var_shift_v2i8:
543 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
544 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
545 ; AVX512VL-NEXT: retq
547 ; X32-SSE-LABEL: var_shift_v2i8:
549 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
550 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
551 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
552 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
553 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
554 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
556 %shift = shl <2 x i8> %a, %b
561 ; Uniform Variable Shifts
; Uniform variable shl of <2 x i32>: element 0 of %b is splatted
; (pshufd / vpbroadcastq) and then the same 64-bit-lane lowering as the
; non-splat v2i32 case is used (mask or blend the amounts against zero,
; shift with psllq/vpsllvq/vpshlq).
564 define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
565 ; SSE2-LABEL: splatvar_shift_v2i32:
567 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
568 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
569 ; SSE2-NEXT: movdqa %xmm0, %xmm2
570 ; SSE2-NEXT: psllq %xmm1, %xmm2
571 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
572 ; SSE2-NEXT: psllq %xmm1, %xmm0
573 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
576 ; SSE41-LABEL: splatvar_shift_v2i32:
578 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
579 ; SSE41-NEXT: pxor %xmm2, %xmm2
580 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
581 ; SSE41-NEXT: movdqa %xmm0, %xmm1
582 ; SSE41-NEXT: psllq %xmm2, %xmm1
583 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
584 ; SSE41-NEXT: psllq %xmm2, %xmm0
585 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
588 ; AVX1-LABEL: splatvar_shift_v2i32:
590 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
591 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
592 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
593 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
594 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
595 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
596 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
599 ; AVX2-LABEL: splatvar_shift_v2i32:
601 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
602 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
603 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
604 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
607 ; XOPAVX1-LABEL: splatvar_shift_v2i32:
609 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
610 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
611 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
612 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
615 ; XOPAVX2-LABEL: splatvar_shift_v2i32:
617 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
618 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
619 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
620 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
623 ; AVX512-LABEL: splatvar_shift_v2i32:
625 ; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
626 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
627 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
628 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
631 ; AVX512VL-LABEL: splatvar_shift_v2i32:
633 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
634 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
635 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
636 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
637 ; AVX512VL-NEXT: retq
639 ; X32-SSE-LABEL: splatvar_shift_v2i32:
641 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
642 ; X32-SSE-NEXT: pand %xmm1, %xmm2
643 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
644 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
645 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
646 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
647 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
648 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
650 %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
651 %shift = shl <2 x i32> %a, %splat
; Uniform variable shl of <4 x i16>: the amount is splatted
; (pshufd / vpbroadcastd), then pre-AVX2 targets use the float-exponent trick
; (pslld $23, paddd, cvttps2dq) plus a 32-bit multiply, while AVX2/XOP/AVX512
; zero the odd 16-bit lanes and use vpsllvd (vpshld on XOPAVX1).
655 define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
656 ; SSE2-LABEL: splatvar_shift_v4i16:
658 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
659 ; SSE2-NEXT: pslld $23, %xmm1
660 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
661 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
662 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
663 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
664 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
665 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
666 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
667 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
668 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
671 ; SSE41-LABEL: splatvar_shift_v4i16:
673 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
674 ; SSE41-NEXT: pslld $23, %xmm1
675 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
676 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
677 ; SSE41-NEXT: pmulld %xmm1, %xmm0
680 ; AVX1-LABEL: splatvar_shift_v4i16:
682 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
683 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
684 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
685 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
686 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
689 ; AVX2-LABEL: splatvar_shift_v4i16:
691 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
692 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
693 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
694 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
697 ; XOPAVX1-LABEL: splatvar_shift_v4i16:
699 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
700 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
701 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
702 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
705 ; XOPAVX2-LABEL: splatvar_shift_v4i16:
707 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
708 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
709 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
710 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
713 ; AVX512-LABEL: splatvar_shift_v4i16:
715 ; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
716 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
717 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
718 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
721 ; AVX512VL-LABEL: splatvar_shift_v4i16:
723 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
724 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
725 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
726 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
727 ; AVX512VL-NEXT: retq
729 ; X32-SSE-LABEL: splatvar_shift_v4i16:
731 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
732 ; X32-SSE-NEXT: pslld $23, %xmm1
733 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
734 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
735 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
736 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
737 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
738 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
739 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
740 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
741 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
743 %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
744 %shift = shl <4 x i16> %a, %splat
; Uniform variable shl of <2 x i16>: the amount is splatted
; (pshufd / vpbroadcastq), blended/masked into the low 16 bits of each 64-bit
; lane, and the data shifted with psllq/vpsllvq (vpshlq on XOPAVX1).
748 define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
749 ; SSE2-LABEL: splatvar_shift_v2i16:
751 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
752 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
753 ; SSE2-NEXT: movdqa %xmm0, %xmm2
754 ; SSE2-NEXT: psllq %xmm1, %xmm2
755 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
756 ; SSE2-NEXT: psllq %xmm1, %xmm0
757 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
760 ; SSE41-LABEL: splatvar_shift_v2i16:
762 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
763 ; SSE41-NEXT: pxor %xmm2, %xmm2
764 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
765 ; SSE41-NEXT: movdqa %xmm0, %xmm1
766 ; SSE41-NEXT: psllq %xmm2, %xmm1
767 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
768 ; SSE41-NEXT: psllq %xmm2, %xmm0
769 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
772 ; AVX1-LABEL: splatvar_shift_v2i16:
774 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
775 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
776 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
777 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
778 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
779 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
780 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
783 ; AVX2-LABEL: splatvar_shift_v2i16:
785 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
786 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
787 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
788 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
791 ; XOPAVX1-LABEL: splatvar_shift_v2i16:
793 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
794 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
795 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
796 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
799 ; XOPAVX2-LABEL: splatvar_shift_v2i16:
801 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
802 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
803 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
804 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
807 ; AVX512-LABEL: splatvar_shift_v2i16:
809 ; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
810 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
811 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
812 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
815 ; AVX512VL-LABEL: splatvar_shift_v2i16:
817 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
818 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
819 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
820 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
821 ; AVX512VL-NEXT: retq
823 ; X32-SSE-LABEL: splatvar_shift_v2i16:
825 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
826 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
827 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
828 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
829 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
830 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
831 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
833 %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
834 %shift = shl <2 x i16> %a, %splat
838 define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
839 ; SSE2-LABEL: splatvar_shift_v8i8:
841 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
842 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
843 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
844 ; SSE2-NEXT: pxor %xmm2, %xmm2
845 ; SSE2-NEXT: movdqa %xmm1, %xmm3
846 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
847 ; SSE2-NEXT: pslld $23, %xmm3
848 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
849 ; SSE2-NEXT: paddd %xmm4, %xmm3
850 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
851 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
852 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
853 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
854 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
855 ; SSE2-NEXT: pslld $23, %xmm1
856 ; SSE2-NEXT: paddd %xmm4, %xmm1
857 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
858 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
859 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
860 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
861 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
862 ; SSE2-NEXT: pmullw %xmm1, %xmm0
865 ; SSE41-LABEL: splatvar_shift_v8i8:
867 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
868 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
869 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
870 ; SSE41-NEXT: pslld $23, %xmm1
871 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
872 ; SSE41-NEXT: paddd %xmm3, %xmm1
873 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
874 ; SSE41-NEXT: pslld $23, %xmm2
875 ; SSE41-NEXT: paddd %xmm3, %xmm2
876 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
877 ; SSE41-NEXT: packusdw %xmm1, %xmm2
878 ; SSE41-NEXT: pmullw %xmm2, %xmm0
881 ; AVX1-LABEL: splatvar_shift_v8i8:
883 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
884 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
885 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
886 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
887 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
888 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
889 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
890 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
891 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
892 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
893 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
894 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
897 ; AVX2-LABEL: splatvar_shift_v8i8:
899 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
900 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
901 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
902 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
903 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
904 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
905 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
906 ; AVX2-NEXT: vzeroupper
909 ; XOP-LABEL: splatvar_shift_v8i8:
911 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
912 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
915 ; AVX512DQ-LABEL: splatvar_shift_v8i8:
917 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
918 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
919 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
920 ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
921 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
922 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
923 ; AVX512DQ-NEXT: vzeroupper
924 ; AVX512DQ-NEXT: retq
926 ; AVX512BW-LABEL: splatvar_shift_v8i8:
928 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
929 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
930 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
931 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
932 ; AVX512BW-NEXT: vzeroupper
933 ; AVX512BW-NEXT: retq
935 ; AVX512DQVL-LABEL: splatvar_shift_v8i8:
936 ; AVX512DQVL: # %bb.0:
937 ; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
938 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
939 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
940 ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
941 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
942 ; AVX512DQVL-NEXT: vzeroupper
943 ; AVX512DQVL-NEXT: retq
945 ; AVX512BWVL-LABEL: splatvar_shift_v8i8:
946 ; AVX512BWVL: # %bb.0:
947 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
948 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
949 ; AVX512BWVL-NEXT: retq
951 ; X32-SSE-LABEL: splatvar_shift_v8i8:
953 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
954 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
955 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
956 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
957 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
958 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
959 ; X32-SSE-NEXT: pslld $23, %xmm3
960 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
961 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
962 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
963 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
964 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
965 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
966 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
967 ; X32-SSE-NEXT: pslld $23, %xmm1
968 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
969 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
970 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
971 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
972 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
973 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
974 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
976 %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
977 %shift = shl <8 x i8> %a, %splat
981 define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
982 ; SSE2-LABEL: splatvar_shift_v4i8:
984 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
985 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
986 ; SSE2-NEXT: pslld $23, %xmm1
987 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
988 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
989 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
990 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
991 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
992 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
993 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
994 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
995 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
998 ; SSE41-LABEL: splatvar_shift_v4i8:
1000 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1001 ; SSE41-NEXT: pslld $23, %xmm1
1002 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
1003 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
1004 ; SSE41-NEXT: pmulld %xmm1, %xmm0
1007 ; AVX1-LABEL: splatvar_shift_v4i8:
1009 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1010 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
1011 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
1012 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
1013 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1016 ; AVX2-LABEL: splatvar_shift_v4i8:
1018 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1019 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
1022 ; XOPAVX1-LABEL: splatvar_shift_v4i8:
1024 ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1025 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
1026 ; XOPAVX1-NEXT: retq
1028 ; XOPAVX2-LABEL: splatvar_shift_v4i8:
1030 ; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1031 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
1032 ; XOPAVX2-NEXT: retq
1034 ; AVX512-LABEL: splatvar_shift_v4i8:
1036 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1037 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
1040 ; AVX512VL-LABEL: splatvar_shift_v4i8:
1041 ; AVX512VL: # %bb.0:
1042 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
1043 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
1044 ; AVX512VL-NEXT: retq
1046 ; X32-SSE-LABEL: splatvar_shift_v4i8:
1048 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1049 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
1050 ; X32-SSE-NEXT: pslld $23, %xmm1
1051 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
1052 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
1053 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1054 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
1055 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1056 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1057 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
1058 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1059 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1060 ; X32-SSE-NEXT: retl
1061 %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
1062 %shift = shl <4 x i8> %a, %splat
1066 define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
1067 ; SSE2-LABEL: splatvar_shift_v2i8:
1069 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
1070 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1071 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1072 ; SSE2-NEXT: psllq %xmm1, %xmm2
1073 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1074 ; SSE2-NEXT: psllq %xmm1, %xmm0
1075 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1078 ; SSE41-LABEL: splatvar_shift_v2i8:
1080 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1081 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1082 ; SSE41-NEXT: psllq %xmm1, %xmm2
1083 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1084 ; SSE41-NEXT: psllq %xmm1, %xmm0
1085 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1088 ; AVX1-LABEL: splatvar_shift_v2i8:
1090 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1091 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
1092 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1093 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
1094 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1097 ; AVX2-LABEL: splatvar_shift_v2i8:
1099 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1100 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
1103 ; XOPAVX1-LABEL: splatvar_shift_v2i8:
1105 ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1106 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
1107 ; XOPAVX1-NEXT: retq
1109 ; XOPAVX2-LABEL: splatvar_shift_v2i8:
1111 ; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1112 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
1113 ; XOPAVX2-NEXT: retq
1115 ; AVX512-LABEL: splatvar_shift_v2i8:
1117 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1118 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
1121 ; AVX512VL-LABEL: splatvar_shift_v2i8:
1122 ; AVX512VL: # %bb.0:
1123 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
1124 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
1125 ; AVX512VL-NEXT: retq
1127 ; X32-SSE-LABEL: splatvar_shift_v2i8:
1129 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
1130 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
1131 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1132 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
1133 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1134 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
1135 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1136 ; X32-SSE-NEXT: retl
1137 %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
1138 %shift = shl <2 x i8> %a, %splat
1146 define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
1147 ; SSE2-LABEL: constant_shift_v2i32:
1149 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1150 ; SSE2-NEXT: psllq $4, %xmm1
1151 ; SSE2-NEXT: psllq $5, %xmm0
1152 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1155 ; SSE41-LABEL: constant_shift_v2i32:
1157 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1158 ; SSE41-NEXT: psllq $5, %xmm1
1159 ; SSE41-NEXT: psllq $4, %xmm0
1160 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1163 ; AVX1-LABEL: constant_shift_v2i32:
1165 ; AVX1-NEXT: vpsllq $5, %xmm0, %xmm1
1166 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1167 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1170 ; AVX2-LABEL: constant_shift_v2i32:
1172 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1175 ; XOPAVX1-LABEL: constant_shift_v2i32:
1177 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
1178 ; XOPAVX1-NEXT: retq
1180 ; XOPAVX2-LABEL: constant_shift_v2i32:
1182 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1183 ; XOPAVX2-NEXT: retq
1185 ; AVX512-LABEL: constant_shift_v2i32:
1187 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1190 ; AVX512VL-LABEL: constant_shift_v2i32:
1191 ; AVX512VL: # %bb.0:
1192 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1193 ; AVX512VL-NEXT: retq
1195 ; X32-SSE-LABEL: constant_shift_v2i32:
1197 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1198 ; X32-SSE-NEXT: psllq $4, %xmm1
1199 ; X32-SSE-NEXT: psllq $5, %xmm0
1200 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1201 ; X32-SSE-NEXT: retl
1202 %shift = shl <2 x i32> %a, <i32 4, i32 5>
1203 ret <2 x i32> %shift
1206 define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
1207 ; SSE2-LABEL: constant_shift_v4i16:
1209 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
1210 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1211 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
1212 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1213 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1214 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
1215 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1216 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1219 ; SSE41-LABEL: constant_shift_v4i16:
1221 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
1224 ; AVX1-LABEL: constant_shift_v4i16:
1226 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
1229 ; AVX2-LABEL: constant_shift_v4i16:
1231 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1234 ; XOPAVX1-LABEL: constant_shift_v4i16:
1236 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
1237 ; XOPAVX1-NEXT: retq
1239 ; XOPAVX2-LABEL: constant_shift_v4i16:
1241 ; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1242 ; XOPAVX2-NEXT: retq
1244 ; AVX512-LABEL: constant_shift_v4i16:
1246 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1249 ; AVX512VL-LABEL: constant_shift_v4i16:
1250 ; AVX512VL: # %bb.0:
1251 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1252 ; AVX512VL-NEXT: retq
1254 ; X32-SSE-LABEL: constant_shift_v4i16:
1256 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
1257 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1258 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
1259 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1260 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1261 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
1262 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1263 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1264 ; X32-SSE-NEXT: retl
1265 %shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
1266 ret <4 x i16> %shift
1269 define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
1270 ; SSE2-LABEL: constant_shift_v2i16:
1272 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1273 ; SSE2-NEXT: psllq $2, %xmm1
1274 ; SSE2-NEXT: psllq $3, %xmm0
1275 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1278 ; SSE41-LABEL: constant_shift_v2i16:
1280 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1281 ; SSE41-NEXT: psllq $3, %xmm1
1282 ; SSE41-NEXT: psllq $2, %xmm0
1283 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1286 ; AVX1-LABEL: constant_shift_v2i16:
1288 ; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
1289 ; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
1290 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1293 ; AVX2-LABEL: constant_shift_v2i16:
1295 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1298 ; XOPAVX1-LABEL: constant_shift_v2i16:
1300 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
1301 ; XOPAVX1-NEXT: retq
1303 ; XOPAVX2-LABEL: constant_shift_v2i16:
1305 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1306 ; XOPAVX2-NEXT: retq
1308 ; AVX512-LABEL: constant_shift_v2i16:
1310 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1313 ; AVX512VL-LABEL: constant_shift_v2i16:
1314 ; AVX512VL: # %bb.0:
1315 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1316 ; AVX512VL-NEXT: retq
1318 ; X32-SSE-LABEL: constant_shift_v2i16:
1320 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1321 ; X32-SSE-NEXT: psllq $2, %xmm1
1322 ; X32-SSE-NEXT: psllq $3, %xmm0
1323 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1324 ; X32-SSE-NEXT: retl
1325 %shift = shl <2 x i16> %a, <i16 2, i16 3>
1326 ret <2 x i16> %shift
1329 define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
1330 ; SSE-LABEL: constant_shift_v8i8:
1332 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
1335 ; AVX-LABEL: constant_shift_v8i8:
1337 ; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
1340 ; XOP-LABEL: constant_shift_v8i8:
1342 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
1345 ; AVX512DQ-LABEL: constant_shift_v8i8:
1346 ; AVX512DQ: # %bb.0:
1347 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
1348 ; AVX512DQ-NEXT: retq
1350 ; AVX512BW-LABEL: constant_shift_v8i8:
1351 ; AVX512BW: # %bb.0:
1352 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1353 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1354 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1355 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1356 ; AVX512BW-NEXT: vzeroupper
1357 ; AVX512BW-NEXT: retq
1359 ; AVX512DQVL-LABEL: constant_shift_v8i8:
1360 ; AVX512DQVL: # %bb.0:
1361 ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
1362 ; AVX512DQVL-NEXT: retq
1364 ; AVX512BWVL-LABEL: constant_shift_v8i8:
1365 ; AVX512BWVL: # %bb.0:
1366 ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
1367 ; AVX512BWVL-NEXT: retq
1369 ; X32-SSE-LABEL: constant_shift_v8i8:
1371 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1372 ; X32-SSE-NEXT: retl
1373 %shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1377 define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
1378 ; SSE2-LABEL: constant_shift_v4i8:
1380 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
1381 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1382 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
1383 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1384 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1385 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
1386 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1387 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1390 ; SSE41-LABEL: constant_shift_v4i8:
1392 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
1395 ; AVX1-LABEL: constant_shift_v4i8:
1397 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
1400 ; AVX2-LABEL: constant_shift_v4i8:
1402 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1405 ; XOPAVX1-LABEL: constant_shift_v4i8:
1407 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
1408 ; XOPAVX1-NEXT: retq
1410 ; XOPAVX2-LABEL: constant_shift_v4i8:
1412 ; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1413 ; XOPAVX2-NEXT: retq
1415 ; AVX512-LABEL: constant_shift_v4i8:
1417 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1420 ; AVX512VL-LABEL: constant_shift_v4i8:
1421 ; AVX512VL: # %bb.0:
1422 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1423 ; AVX512VL-NEXT: retq
1425 ; X32-SSE-LABEL: constant_shift_v4i8:
1427 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
1428 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1429 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
1430 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1431 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1432 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
1433 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1434 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1435 ; X32-SSE-NEXT: retl
1436 %shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
1440 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
1441 ; SSE2-LABEL: constant_shift_v2i8:
1443 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1444 ; SSE2-NEXT: psllq $2, %xmm1
1445 ; SSE2-NEXT: psllq $3, %xmm0
1446 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1449 ; SSE41-LABEL: constant_shift_v2i8:
1451 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1452 ; SSE41-NEXT: psllq $3, %xmm1
1453 ; SSE41-NEXT: psllq $2, %xmm0
1454 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1457 ; AVX1-LABEL: constant_shift_v2i8:
1459 ; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
1460 ; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
1461 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1464 ; AVX2-LABEL: constant_shift_v2i8:
1466 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1469 ; XOPAVX1-LABEL: constant_shift_v2i8:
1471 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
1472 ; XOPAVX1-NEXT: retq
1474 ; XOPAVX2-LABEL: constant_shift_v2i8:
1476 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1477 ; XOPAVX2-NEXT: retq
1479 ; AVX512-LABEL: constant_shift_v2i8:
1481 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1484 ; AVX512VL-LABEL: constant_shift_v2i8:
1485 ; AVX512VL: # %bb.0:
1486 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
1487 ; AVX512VL-NEXT: retq
1489 ; X32-SSE-LABEL: constant_shift_v2i8:
1491 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1492 ; X32-SSE-NEXT: psllq $2, %xmm1
1493 ; X32-SSE-NEXT: psllq $3, %xmm0
1494 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1495 ; X32-SSE-NEXT: retl
1496 %shift = shl <2 x i8> %a, <i8 2, i8 3>
1501 ; Uniform Constant Shifts
1504 define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
1505 ; SSE-LABEL: splatconstant_shift_v2i32:
1507 ; SSE-NEXT: psllq $5, %xmm0
1510 ; AVX-LABEL: splatconstant_shift_v2i32:
1512 ; AVX-NEXT: vpsllq $5, %xmm0, %xmm0
1515 ; XOP-LABEL: splatconstant_shift_v2i32:
1517 ; XOP-NEXT: vpsllq $5, %xmm0, %xmm0
1520 ; AVX512-LABEL: splatconstant_shift_v2i32:
1522 ; AVX512-NEXT: vpsllq $5, %xmm0, %xmm0
1525 ; AVX512VL-LABEL: splatconstant_shift_v2i32:
1526 ; AVX512VL: # %bb.0:
1527 ; AVX512VL-NEXT: vpsllq $5, %xmm0, %xmm0
1528 ; AVX512VL-NEXT: retq
1530 ; X32-SSE-LABEL: splatconstant_shift_v2i32:
1532 ; X32-SSE-NEXT: psllq $5, %xmm0
1533 ; X32-SSE-NEXT: retl
1534 %shift = shl <2 x i32> %a, <i32 5, i32 5>
1535 ret <2 x i32> %shift
1538 define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
1539 ; SSE-LABEL: splatconstant_shift_v4i16:
1541 ; SSE-NEXT: pslld $3, %xmm0
1544 ; AVX-LABEL: splatconstant_shift_v4i16:
1546 ; AVX-NEXT: vpslld $3, %xmm0, %xmm0
1549 ; XOP-LABEL: splatconstant_shift_v4i16:
1551 ; XOP-NEXT: vpslld $3, %xmm0, %xmm0
1554 ; AVX512-LABEL: splatconstant_shift_v4i16:
1556 ; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
1559 ; AVX512VL-LABEL: splatconstant_shift_v4i16:
1560 ; AVX512VL: # %bb.0:
1561 ; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
1562 ; AVX512VL-NEXT: retq
1564 ; X32-SSE-LABEL: splatconstant_shift_v4i16:
1566 ; X32-SSE-NEXT: pslld $3, %xmm0
1567 ; X32-SSE-NEXT: retl
1568 %shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
1569 ret <4 x i16> %shift
1572 define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
1573 ; SSE-LABEL: splatconstant_shift_v2i16:
1575 ; SSE-NEXT: psllq $3, %xmm0
1578 ; AVX-LABEL: splatconstant_shift_v2i16:
1580 ; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
1583 ; XOP-LABEL: splatconstant_shift_v2i16:
1585 ; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
1588 ; AVX512-LABEL: splatconstant_shift_v2i16:
1590 ; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
1593 ; AVX512VL-LABEL: splatconstant_shift_v2i16:
1594 ; AVX512VL: # %bb.0:
1595 ; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
1596 ; AVX512VL-NEXT: retq
1598 ; X32-SSE-LABEL: splatconstant_shift_v2i16:
1600 ; X32-SSE-NEXT: psllq $3, %xmm0
1601 ; X32-SSE-NEXT: retl
1602 %shift = shl <2 x i16> %a, <i16 3, i16 3>
1603 ret <2 x i16> %shift
1606 define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
1607 ; SSE-LABEL: splatconstant_shift_v8i8:
1609 ; SSE-NEXT: psllw $3, %xmm0
1612 ; AVX-LABEL: splatconstant_shift_v8i8:
1614 ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
1617 ; XOP-LABEL: splatconstant_shift_v8i8:
1619 ; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
1622 ; AVX512-LABEL: splatconstant_shift_v8i8:
1624 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
1627 ; AVX512VL-LABEL: splatconstant_shift_v8i8:
1628 ; AVX512VL: # %bb.0:
1629 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
1630 ; AVX512VL-NEXT: retq
1632 ; X32-SSE-LABEL: splatconstant_shift_v8i8:
1634 ; X32-SSE-NEXT: psllw $3, %xmm0
1635 ; X32-SSE-NEXT: retl
1636 %shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1640 define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
1641 ; SSE-LABEL: splatconstant_shift_v4i8:
1643 ; SSE-NEXT: pslld $3, %xmm0
1646 ; AVX-LABEL: splatconstant_shift_v4i8:
1648 ; AVX-NEXT: vpslld $3, %xmm0, %xmm0
1651 ; XOP-LABEL: splatconstant_shift_v4i8:
1653 ; XOP-NEXT: vpslld $3, %xmm0, %xmm0
1656 ; AVX512-LABEL: splatconstant_shift_v4i8:
1658 ; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
1661 ; AVX512VL-LABEL: splatconstant_shift_v4i8:
1662 ; AVX512VL: # %bb.0:
1663 ; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
1664 ; AVX512VL-NEXT: retq
1666 ; X32-SSE-LABEL: splatconstant_shift_v4i8:
1668 ; X32-SSE-NEXT: pslld $3, %xmm0
1669 ; X32-SSE-NEXT: retl
1670 %shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
1674 define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
1675 ; SSE-LABEL: splatconstant_shift_v2i8:
1677 ; SSE-NEXT: psllq $3, %xmm0
1680 ; AVX-LABEL: splatconstant_shift_v2i8:
1682 ; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
1685 ; XOP-LABEL: splatconstant_shift_v2i8:
1687 ; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
1690 ; AVX512-LABEL: splatconstant_shift_v2i8:
1692 ; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
1695 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
1696 ; AVX512VL: # %bb.0:
1697 ; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
1698 ; AVX512VL-NEXT: retq
1700 ; X32-SSE-LABEL: splatconstant_shift_v2i8:
1702 ; X32-SSE-NEXT: psllq $3, %xmm0
1703 ; X32-SSE-NEXT: retl
1704 %shift = shl <2 x i8> %a, <i8 3, i8 3>