1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
13 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
20 define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
21 ; SSE2-LABEL: var_shift_v2i32:
23 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
24 ; SSE2-NEXT: movdqa %xmm0, %xmm3
25 ; SSE2-NEXT: psrld %xmm2, %xmm3
26 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
27 ; SSE2-NEXT: movdqa %xmm0, %xmm2
28 ; SSE2-NEXT: psrld %xmm4, %xmm2
29 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
30 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
31 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
32 ; SSE2-NEXT: movdqa %xmm0, %xmm4
33 ; SSE2-NEXT: psrld %xmm3, %xmm4
34 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
35 ; SSE2-NEXT: psrld %xmm1, %xmm0
36 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
37 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
38 ; SSE2-NEXT: movaps %xmm2, %xmm0
41 ; SSE41-LABEL: var_shift_v2i32:
43 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
44 ; SSE41-NEXT: movdqa %xmm0, %xmm3
45 ; SSE41-NEXT: psrld %xmm2, %xmm3
46 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
47 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
48 ; SSE41-NEXT: movdqa %xmm0, %xmm5
49 ; SSE41-NEXT: psrld %xmm4, %xmm5
50 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
51 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
52 ; SSE41-NEXT: movdqa %xmm0, %xmm3
53 ; SSE41-NEXT: psrld %xmm1, %xmm3
54 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
55 ; SSE41-NEXT: psrld %xmm1, %xmm0
56 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
57 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
60 ; AVX1-LABEL: var_shift_v2i32:
62 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
63 ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
64 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
65 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
66 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
67 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
68 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
69 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
70 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
71 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
72 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
73 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
76 ; AVX2-LABEL: var_shift_v2i32:
78 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
81 ; XOPAVX1-LABEL: var_shift_v2i32:
83 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
84 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
85 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
88 ; XOPAVX2-LABEL: var_shift_v2i32:
90 ; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
93 ; AVX512-LABEL: var_shift_v2i32:
95 ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
98 ; AVX512VL-LABEL: var_shift_v2i32:
100 ; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
101 ; AVX512VL-NEXT: retq
103 ; X86-SSE-LABEL: var_shift_v2i32:
105 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
106 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3
107 ; X86-SSE-NEXT: psrld %xmm2, %xmm3
108 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
109 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
110 ; X86-SSE-NEXT: psrld %xmm4, %xmm2
111 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
112 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
113 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
114 ; X86-SSE-NEXT: movdqa %xmm0, %xmm4
115 ; X86-SSE-NEXT: psrld %xmm3, %xmm4
116 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
117 ; X86-SSE-NEXT: psrld %xmm1, %xmm0
118 ; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
119 ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
120 ; X86-SSE-NEXT: movaps %xmm2, %xmm0
122 %shift = lshr <2 x i32> %a, %b
123 ret <2 x i32> %shift
124 }
126 define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
127 ; SSE2-LABEL: var_shift_v4i16:
129 ; SSE2-NEXT: psllw $12, %xmm1
130 ; SSE2-NEXT: movdqa %xmm1, %xmm2
131 ; SSE2-NEXT: psraw $15, %xmm2
132 ; SSE2-NEXT: movdqa %xmm2, %xmm3
133 ; SSE2-NEXT: pandn %xmm0, %xmm3
134 ; SSE2-NEXT: psrlw $8, %xmm0
135 ; SSE2-NEXT: pand %xmm2, %xmm0
136 ; SSE2-NEXT: por %xmm3, %xmm0
137 ; SSE2-NEXT: paddw %xmm1, %xmm1
138 ; SSE2-NEXT: movdqa %xmm1, %xmm2
139 ; SSE2-NEXT: psraw $15, %xmm2
140 ; SSE2-NEXT: movdqa %xmm2, %xmm3
141 ; SSE2-NEXT: pandn %xmm0, %xmm3
142 ; SSE2-NEXT: psrlw $4, %xmm0
143 ; SSE2-NEXT: pand %xmm2, %xmm0
144 ; SSE2-NEXT: por %xmm3, %xmm0
145 ; SSE2-NEXT: paddw %xmm1, %xmm1
146 ; SSE2-NEXT: movdqa %xmm1, %xmm2
147 ; SSE2-NEXT: psraw $15, %xmm2
148 ; SSE2-NEXT: movdqa %xmm2, %xmm3
149 ; SSE2-NEXT: pandn %xmm0, %xmm3
150 ; SSE2-NEXT: psrlw $2, %xmm0
151 ; SSE2-NEXT: pand %xmm2, %xmm0
152 ; SSE2-NEXT: por %xmm3, %xmm0
153 ; SSE2-NEXT: paddw %xmm1, %xmm1
154 ; SSE2-NEXT: psraw $15, %xmm1
155 ; SSE2-NEXT: movdqa %xmm1, %xmm2
156 ; SSE2-NEXT: pandn %xmm0, %xmm2
157 ; SSE2-NEXT: psrlw $1, %xmm0
158 ; SSE2-NEXT: pand %xmm1, %xmm0
159 ; SSE2-NEXT: por %xmm2, %xmm0
162 ; SSE41-LABEL: var_shift_v4i16:
164 ; SSE41-NEXT: movdqa %xmm0, %xmm2
165 ; SSE41-NEXT: movdqa %xmm1, %xmm0
166 ; SSE41-NEXT: psllw $12, %xmm0
167 ; SSE41-NEXT: psllw $4, %xmm1
168 ; SSE41-NEXT: por %xmm1, %xmm0
169 ; SSE41-NEXT: movdqa %xmm0, %xmm1
170 ; SSE41-NEXT: paddw %xmm0, %xmm1
171 ; SSE41-NEXT: movdqa %xmm2, %xmm3
172 ; SSE41-NEXT: psrlw $8, %xmm3
173 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
174 ; SSE41-NEXT: movdqa %xmm2, %xmm3
175 ; SSE41-NEXT: psrlw $4, %xmm3
176 ; SSE41-NEXT: movdqa %xmm1, %xmm0
177 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
178 ; SSE41-NEXT: movdqa %xmm2, %xmm3
179 ; SSE41-NEXT: psrlw $2, %xmm3
180 ; SSE41-NEXT: paddw %xmm1, %xmm1
181 ; SSE41-NEXT: movdqa %xmm1, %xmm0
182 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
183 ; SSE41-NEXT: movdqa %xmm2, %xmm3
184 ; SSE41-NEXT: psrlw $1, %xmm3
185 ; SSE41-NEXT: paddw %xmm1, %xmm1
186 ; SSE41-NEXT: movdqa %xmm1, %xmm0
187 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
188 ; SSE41-NEXT: movdqa %xmm2, %xmm0
191 ; AVX1-LABEL: var_shift_v4i16:
193 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
194 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
195 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
196 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
197 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
198 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
199 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
200 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
201 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
202 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
203 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
204 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
205 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
206 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
209 ; AVX2-LABEL: var_shift_v4i16:
211 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
212 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
213 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
214 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
215 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
216 ; AVX2-NEXT: vzeroupper
219 ; XOP-LABEL: var_shift_v4i16:
221 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
222 ; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
223 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
226 ; AVX512DQ-LABEL: var_shift_v4i16:
228 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
229 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
230 ; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
231 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
232 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
233 ; AVX512DQ-NEXT: vzeroupper
234 ; AVX512DQ-NEXT: retq
236 ; AVX512BW-LABEL: var_shift_v4i16:
238 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
239 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
240 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
241 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
242 ; AVX512BW-NEXT: vzeroupper
243 ; AVX512BW-NEXT: retq
245 ; AVX512DQVL-LABEL: var_shift_v4i16:
246 ; AVX512DQVL: # %bb.0:
247 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
248 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
249 ; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
250 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
251 ; AVX512DQVL-NEXT: vzeroupper
252 ; AVX512DQVL-NEXT: retq
254 ; AVX512BWVL-LABEL: var_shift_v4i16:
255 ; AVX512BWVL: # %bb.0:
256 ; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
257 ; AVX512BWVL-NEXT: retq
259 ; X86-SSE-LABEL: var_shift_v4i16:
261 ; X86-SSE-NEXT: psllw $12, %xmm1
262 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
263 ; X86-SSE-NEXT: psraw $15, %xmm2
264 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
265 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
266 ; X86-SSE-NEXT: psrlw $8, %xmm0
267 ; X86-SSE-NEXT: pand %xmm2, %xmm0
268 ; X86-SSE-NEXT: por %xmm3, %xmm0
269 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
270 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
271 ; X86-SSE-NEXT: psraw $15, %xmm2
272 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
273 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
274 ; X86-SSE-NEXT: psrlw $4, %xmm0
275 ; X86-SSE-NEXT: pand %xmm2, %xmm0
276 ; X86-SSE-NEXT: por %xmm3, %xmm0
277 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
278 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
279 ; X86-SSE-NEXT: psraw $15, %xmm2
280 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
281 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
282 ; X86-SSE-NEXT: psrlw $2, %xmm0
283 ; X86-SSE-NEXT: pand %xmm2, %xmm0
284 ; X86-SSE-NEXT: por %xmm3, %xmm0
285 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
286 ; X86-SSE-NEXT: psraw $15, %xmm1
287 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
288 ; X86-SSE-NEXT: pandn %xmm0, %xmm2
289 ; X86-SSE-NEXT: psrlw $1, %xmm0
290 ; X86-SSE-NEXT: pand %xmm1, %xmm0
291 ; X86-SSE-NEXT: por %xmm2, %xmm0
293 %shift = lshr <4 x i16> %a, %b
294 ret <4 x i16> %shift
295 }
297 define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
298 ; SSE2-LABEL: var_shift_v2i16:
300 ; SSE2-NEXT: psllw $12, %xmm1
301 ; SSE2-NEXT: movdqa %xmm1, %xmm2
302 ; SSE2-NEXT: psraw $15, %xmm2
303 ; SSE2-NEXT: movdqa %xmm2, %xmm3
304 ; SSE2-NEXT: pandn %xmm0, %xmm3
305 ; SSE2-NEXT: psrlw $8, %xmm0
306 ; SSE2-NEXT: pand %xmm2, %xmm0
307 ; SSE2-NEXT: por %xmm3, %xmm0
308 ; SSE2-NEXT: paddw %xmm1, %xmm1
309 ; SSE2-NEXT: movdqa %xmm1, %xmm2
310 ; SSE2-NEXT: psraw $15, %xmm2
311 ; SSE2-NEXT: movdqa %xmm2, %xmm3
312 ; SSE2-NEXT: pandn %xmm0, %xmm3
313 ; SSE2-NEXT: psrlw $4, %xmm0
314 ; SSE2-NEXT: pand %xmm2, %xmm0
315 ; SSE2-NEXT: por %xmm3, %xmm0
316 ; SSE2-NEXT: paddw %xmm1, %xmm1
317 ; SSE2-NEXT: movdqa %xmm1, %xmm2
318 ; SSE2-NEXT: psraw $15, %xmm2
319 ; SSE2-NEXT: movdqa %xmm2, %xmm3
320 ; SSE2-NEXT: pandn %xmm0, %xmm3
321 ; SSE2-NEXT: psrlw $2, %xmm0
322 ; SSE2-NEXT: pand %xmm2, %xmm0
323 ; SSE2-NEXT: por %xmm3, %xmm0
324 ; SSE2-NEXT: paddw %xmm1, %xmm1
325 ; SSE2-NEXT: psraw $15, %xmm1
326 ; SSE2-NEXT: movdqa %xmm1, %xmm2
327 ; SSE2-NEXT: pandn %xmm0, %xmm2
328 ; SSE2-NEXT: psrlw $1, %xmm0
329 ; SSE2-NEXT: pand %xmm1, %xmm0
330 ; SSE2-NEXT: por %xmm2, %xmm0
333 ; SSE41-LABEL: var_shift_v2i16:
335 ; SSE41-NEXT: movdqa %xmm0, %xmm2
336 ; SSE41-NEXT: movdqa %xmm1, %xmm0
337 ; SSE41-NEXT: psllw $12, %xmm0
338 ; SSE41-NEXT: psllw $4, %xmm1
339 ; SSE41-NEXT: por %xmm1, %xmm0
340 ; SSE41-NEXT: movdqa %xmm0, %xmm1
341 ; SSE41-NEXT: paddw %xmm0, %xmm1
342 ; SSE41-NEXT: movdqa %xmm2, %xmm3
343 ; SSE41-NEXT: psrlw $8, %xmm3
344 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
345 ; SSE41-NEXT: movdqa %xmm2, %xmm3
346 ; SSE41-NEXT: psrlw $4, %xmm3
347 ; SSE41-NEXT: movdqa %xmm1, %xmm0
348 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
349 ; SSE41-NEXT: movdqa %xmm2, %xmm3
350 ; SSE41-NEXT: psrlw $2, %xmm3
351 ; SSE41-NEXT: paddw %xmm1, %xmm1
352 ; SSE41-NEXT: movdqa %xmm1, %xmm0
353 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
354 ; SSE41-NEXT: movdqa %xmm2, %xmm3
355 ; SSE41-NEXT: psrlw $1, %xmm3
356 ; SSE41-NEXT: paddw %xmm1, %xmm1
357 ; SSE41-NEXT: movdqa %xmm1, %xmm0
358 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
359 ; SSE41-NEXT: movdqa %xmm2, %xmm0
362 ; AVX1-LABEL: var_shift_v2i16:
364 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
365 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
366 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
367 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
368 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
369 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
370 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
371 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
372 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
373 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
374 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
375 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
376 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
377 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
380 ; AVX2-LABEL: var_shift_v2i16:
382 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
383 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
384 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
385 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
386 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
387 ; AVX2-NEXT: vzeroupper
390 ; XOP-LABEL: var_shift_v2i16:
392 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
393 ; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
394 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
397 ; AVX512DQ-LABEL: var_shift_v2i16:
399 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
400 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
401 ; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
402 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
403 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
404 ; AVX512DQ-NEXT: vzeroupper
405 ; AVX512DQ-NEXT: retq
407 ; AVX512BW-LABEL: var_shift_v2i16:
409 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
410 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
411 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
412 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
413 ; AVX512BW-NEXT: vzeroupper
414 ; AVX512BW-NEXT: retq
416 ; AVX512DQVL-LABEL: var_shift_v2i16:
417 ; AVX512DQVL: # %bb.0:
418 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
419 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
420 ; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
421 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
422 ; AVX512DQVL-NEXT: vzeroupper
423 ; AVX512DQVL-NEXT: retq
425 ; AVX512BWVL-LABEL: var_shift_v2i16:
426 ; AVX512BWVL: # %bb.0:
427 ; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
428 ; AVX512BWVL-NEXT: retq
430 ; X86-SSE-LABEL: var_shift_v2i16:
432 ; X86-SSE-NEXT: psllw $12, %xmm1
433 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
434 ; X86-SSE-NEXT: psraw $15, %xmm2
435 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
436 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
437 ; X86-SSE-NEXT: psrlw $8, %xmm0
438 ; X86-SSE-NEXT: pand %xmm2, %xmm0
439 ; X86-SSE-NEXT: por %xmm3, %xmm0
440 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
441 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
442 ; X86-SSE-NEXT: psraw $15, %xmm2
443 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
444 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
445 ; X86-SSE-NEXT: psrlw $4, %xmm0
446 ; X86-SSE-NEXT: pand %xmm2, %xmm0
447 ; X86-SSE-NEXT: por %xmm3, %xmm0
448 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
449 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
450 ; X86-SSE-NEXT: psraw $15, %xmm2
451 ; X86-SSE-NEXT: movdqa %xmm2, %xmm3
452 ; X86-SSE-NEXT: pandn %xmm0, %xmm3
453 ; X86-SSE-NEXT: psrlw $2, %xmm0
454 ; X86-SSE-NEXT: pand %xmm2, %xmm0
455 ; X86-SSE-NEXT: por %xmm3, %xmm0
456 ; X86-SSE-NEXT: paddw %xmm1, %xmm1
457 ; X86-SSE-NEXT: psraw $15, %xmm1
458 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
459 ; X86-SSE-NEXT: pandn %xmm0, %xmm2
460 ; X86-SSE-NEXT: psrlw $1, %xmm0
461 ; X86-SSE-NEXT: pand %xmm1, %xmm0
462 ; X86-SSE-NEXT: por %xmm2, %xmm0
464 %shift = lshr <2 x i16> %a, %b
465 ret <2 x i16> %shift
466 }
468 define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
469 ; SSE2-LABEL: var_shift_v8i8:
471 ; SSE2-NEXT: psllw $5, %xmm1
472 ; SSE2-NEXT: pxor %xmm2, %xmm2
473 ; SSE2-NEXT: pxor %xmm3, %xmm3
474 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
475 ; SSE2-NEXT: movdqa %xmm3, %xmm4
476 ; SSE2-NEXT: pandn %xmm0, %xmm4
477 ; SSE2-NEXT: psrlw $4, %xmm0
478 ; SSE2-NEXT: pand %xmm3, %xmm0
479 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
480 ; SSE2-NEXT: por %xmm4, %xmm0
481 ; SSE2-NEXT: paddb %xmm1, %xmm1
482 ; SSE2-NEXT: pxor %xmm3, %xmm3
483 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
484 ; SSE2-NEXT: movdqa %xmm3, %xmm4
485 ; SSE2-NEXT: pandn %xmm0, %xmm4
486 ; SSE2-NEXT: psrlw $2, %xmm0
487 ; SSE2-NEXT: pand %xmm3, %xmm0
488 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
489 ; SSE2-NEXT: por %xmm4, %xmm0
490 ; SSE2-NEXT: paddb %xmm1, %xmm1
491 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
492 ; SSE2-NEXT: movdqa %xmm2, %xmm1
493 ; SSE2-NEXT: pandn %xmm0, %xmm1
494 ; SSE2-NEXT: psrlw $1, %xmm0
495 ; SSE2-NEXT: pand %xmm2, %xmm0
496 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
497 ; SSE2-NEXT: por %xmm1, %xmm0
500 ; SSE41-LABEL: var_shift_v8i8:
502 ; SSE41-NEXT: movdqa %xmm0, %xmm2
503 ; SSE41-NEXT: psllw $5, %xmm1
504 ; SSE41-NEXT: movdqa %xmm0, %xmm3
505 ; SSE41-NEXT: psrlw $4, %xmm3
506 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
507 ; SSE41-NEXT: movdqa %xmm1, %xmm0
508 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
509 ; SSE41-NEXT: movdqa %xmm2, %xmm3
510 ; SSE41-NEXT: psrlw $2, %xmm3
511 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
512 ; SSE41-NEXT: paddb %xmm1, %xmm1
513 ; SSE41-NEXT: movdqa %xmm1, %xmm0
514 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
515 ; SSE41-NEXT: movdqa %xmm2, %xmm3
516 ; SSE41-NEXT: psrlw $1, %xmm3
517 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
518 ; SSE41-NEXT: paddb %xmm1, %xmm1
519 ; SSE41-NEXT: movdqa %xmm1, %xmm0
520 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
521 ; SSE41-NEXT: movdqa %xmm2, %xmm0
524 ; AVX-LABEL: var_shift_v8i8:
526 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
527 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
528 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
529 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
530 ; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
531 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
532 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
533 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
534 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
535 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
536 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
537 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
540 ; XOP-LABEL: var_shift_v8i8:
542 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
543 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
544 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
547 ; AVX512DQ-LABEL: var_shift_v8i8:
549 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
550 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
551 ; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
552 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
553 ; AVX512DQ-NEXT: vzeroupper
554 ; AVX512DQ-NEXT: retq
556 ; AVX512BW-LABEL: var_shift_v8i8:
558 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
559 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
560 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
561 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
562 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
563 ; AVX512BW-NEXT: vzeroupper
564 ; AVX512BW-NEXT: retq
566 ; AVX512DQVL-LABEL: var_shift_v8i8:
567 ; AVX512DQVL: # %bb.0:
568 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
569 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
570 ; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
571 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
572 ; AVX512DQVL-NEXT: vzeroupper
573 ; AVX512DQVL-NEXT: retq
575 ; AVX512BWVL-LABEL: var_shift_v8i8:
576 ; AVX512BWVL: # %bb.0:
577 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
578 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
579 ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
580 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
581 ; AVX512BWVL-NEXT: vzeroupper
582 ; AVX512BWVL-NEXT: retq
584 ; X86-SSE-LABEL: var_shift_v8i8:
586 ; X86-SSE-NEXT: psllw $5, %xmm1
587 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
588 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
589 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
590 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
591 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
592 ; X86-SSE-NEXT: psrlw $4, %xmm0
593 ; X86-SSE-NEXT: pand %xmm3, %xmm0
594 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
595 ; X86-SSE-NEXT: por %xmm4, %xmm0
596 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
597 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
598 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
599 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
600 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
601 ; X86-SSE-NEXT: psrlw $2, %xmm0
602 ; X86-SSE-NEXT: pand %xmm3, %xmm0
603 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
604 ; X86-SSE-NEXT: por %xmm4, %xmm0
605 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
606 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2
607 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1
608 ; X86-SSE-NEXT: pandn %xmm0, %xmm1
609 ; X86-SSE-NEXT: psrlw $1, %xmm0
610 ; X86-SSE-NEXT: pand %xmm2, %xmm0
611 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
612 ; X86-SSE-NEXT: por %xmm1, %xmm0
614 %shift = lshr <8 x i8> %a, %b
615 ret <8 x i8> %shift
616 }
618 define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
619 ; SSE2-LABEL: var_shift_v4i8:
621 ; SSE2-NEXT: psllw $5, %xmm1
622 ; SSE2-NEXT: pxor %xmm2, %xmm2
623 ; SSE2-NEXT: pxor %xmm3, %xmm3
624 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
625 ; SSE2-NEXT: movdqa %xmm3, %xmm4
626 ; SSE2-NEXT: pandn %xmm0, %xmm4
627 ; SSE2-NEXT: psrlw $4, %xmm0
628 ; SSE2-NEXT: pand %xmm3, %xmm0
629 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
630 ; SSE2-NEXT: por %xmm4, %xmm0
631 ; SSE2-NEXT: paddb %xmm1, %xmm1
632 ; SSE2-NEXT: pxor %xmm3, %xmm3
633 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
634 ; SSE2-NEXT: movdqa %xmm3, %xmm4
635 ; SSE2-NEXT: pandn %xmm0, %xmm4
636 ; SSE2-NEXT: psrlw $2, %xmm0
637 ; SSE2-NEXT: pand %xmm3, %xmm0
638 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
639 ; SSE2-NEXT: por %xmm4, %xmm0
640 ; SSE2-NEXT: paddb %xmm1, %xmm1
641 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
642 ; SSE2-NEXT: movdqa %xmm2, %xmm1
643 ; SSE2-NEXT: pandn %xmm0, %xmm1
644 ; SSE2-NEXT: psrlw $1, %xmm0
645 ; SSE2-NEXT: pand %xmm2, %xmm0
646 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
647 ; SSE2-NEXT: por %xmm1, %xmm0
650 ; SSE41-LABEL: var_shift_v4i8:
652 ; SSE41-NEXT: movdqa %xmm0, %xmm2
653 ; SSE41-NEXT: psllw $5, %xmm1
654 ; SSE41-NEXT: movdqa %xmm0, %xmm3
655 ; SSE41-NEXT: psrlw $4, %xmm3
656 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
657 ; SSE41-NEXT: movdqa %xmm1, %xmm0
658 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
659 ; SSE41-NEXT: movdqa %xmm2, %xmm3
660 ; SSE41-NEXT: psrlw $2, %xmm3
661 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
662 ; SSE41-NEXT: paddb %xmm1, %xmm1
663 ; SSE41-NEXT: movdqa %xmm1, %xmm0
664 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
665 ; SSE41-NEXT: movdqa %xmm2, %xmm3
666 ; SSE41-NEXT: psrlw $1, %xmm3
667 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
668 ; SSE41-NEXT: paddb %xmm1, %xmm1
669 ; SSE41-NEXT: movdqa %xmm1, %xmm0
670 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
671 ; SSE41-NEXT: movdqa %xmm2, %xmm0
674 ; AVX-LABEL: var_shift_v4i8:
676 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
677 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
678 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
679 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
680 ; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
681 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
682 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
683 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
684 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
685 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
686 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
687 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
690 ; XOP-LABEL: var_shift_v4i8:
692 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
693 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
694 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
697 ; AVX512DQ-LABEL: var_shift_v4i8:
699 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
700 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
701 ; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
702 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
703 ; AVX512DQ-NEXT: vzeroupper
704 ; AVX512DQ-NEXT: retq
706 ; AVX512BW-LABEL: var_shift_v4i8:
708 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
709 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
710 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
711 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
712 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
713 ; AVX512BW-NEXT: vzeroupper
714 ; AVX512BW-NEXT: retq
716 ; AVX512DQVL-LABEL: var_shift_v4i8:
717 ; AVX512DQVL: # %bb.0:
718 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
719 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
720 ; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
721 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
722 ; AVX512DQVL-NEXT: vzeroupper
723 ; AVX512DQVL-NEXT: retq
725 ; AVX512BWVL-LABEL: var_shift_v4i8:
726 ; AVX512BWVL: # %bb.0:
727 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
728 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
729 ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
730 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
731 ; AVX512BWVL-NEXT: vzeroupper
732 ; AVX512BWVL-NEXT: retq
734 ; X86-SSE-LABEL: var_shift_v4i8:
736 ; X86-SSE-NEXT: psllw $5, %xmm1
737 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
738 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
739 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
740 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
741 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
742 ; X86-SSE-NEXT: psrlw $4, %xmm0
743 ; X86-SSE-NEXT: pand %xmm3, %xmm0
744 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
745 ; X86-SSE-NEXT: por %xmm4, %xmm0
746 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
747 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
748 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
749 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
750 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
751 ; X86-SSE-NEXT: psrlw $2, %xmm0
752 ; X86-SSE-NEXT: pand %xmm3, %xmm0
753 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
754 ; X86-SSE-NEXT: por %xmm4, %xmm0
755 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
756 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2
757 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1
758 ; X86-SSE-NEXT: pandn %xmm0, %xmm1
759 ; X86-SSE-NEXT: psrlw $1, %xmm0
760 ; X86-SSE-NEXT: pand %xmm2, %xmm0
761 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
762 ; X86-SSE-NEXT: por %xmm1, %xmm0
764 %shift = lshr <4 x i8> %a, %b
765 ret <4 x i8> %shift
766 }
768 define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
769 ; SSE2-LABEL: var_shift_v2i8:
771 ; SSE2-NEXT: psllw $5, %xmm1
772 ; SSE2-NEXT: pxor %xmm2, %xmm2
773 ; SSE2-NEXT: pxor %xmm3, %xmm3
774 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
775 ; SSE2-NEXT: movdqa %xmm3, %xmm4
776 ; SSE2-NEXT: pandn %xmm0, %xmm4
777 ; SSE2-NEXT: psrlw $4, %xmm0
778 ; SSE2-NEXT: pand %xmm3, %xmm0
779 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
780 ; SSE2-NEXT: por %xmm4, %xmm0
781 ; SSE2-NEXT: paddb %xmm1, %xmm1
782 ; SSE2-NEXT: pxor %xmm3, %xmm3
783 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
784 ; SSE2-NEXT: movdqa %xmm3, %xmm4
785 ; SSE2-NEXT: pandn %xmm0, %xmm4
786 ; SSE2-NEXT: psrlw $2, %xmm0
787 ; SSE2-NEXT: pand %xmm3, %xmm0
788 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
789 ; SSE2-NEXT: por %xmm4, %xmm0
790 ; SSE2-NEXT: paddb %xmm1, %xmm1
791 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
792 ; SSE2-NEXT: movdqa %xmm2, %xmm1
793 ; SSE2-NEXT: pandn %xmm0, %xmm1
794 ; SSE2-NEXT: psrlw $1, %xmm0
795 ; SSE2-NEXT: pand %xmm2, %xmm0
796 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
797 ; SSE2-NEXT: por %xmm1, %xmm0
800 ; SSE41-LABEL: var_shift_v2i8:
802 ; SSE41-NEXT: movdqa %xmm0, %xmm2
803 ; SSE41-NEXT: psllw $5, %xmm1
804 ; SSE41-NEXT: movdqa %xmm0, %xmm3
805 ; SSE41-NEXT: psrlw $4, %xmm3
806 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
807 ; SSE41-NEXT: movdqa %xmm1, %xmm0
808 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
809 ; SSE41-NEXT: movdqa %xmm2, %xmm3
810 ; SSE41-NEXT: psrlw $2, %xmm3
811 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
812 ; SSE41-NEXT: paddb %xmm1, %xmm1
813 ; SSE41-NEXT: movdqa %xmm1, %xmm0
814 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
815 ; SSE41-NEXT: movdqa %xmm2, %xmm3
816 ; SSE41-NEXT: psrlw $1, %xmm3
817 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
818 ; SSE41-NEXT: paddb %xmm1, %xmm1
819 ; SSE41-NEXT: movdqa %xmm1, %xmm0
820 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
821 ; SSE41-NEXT: movdqa %xmm2, %xmm0
824 ; AVX-LABEL: var_shift_v2i8:
826 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
827 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
828 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
829 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
830 ; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
831 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
832 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
833 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
834 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
835 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
836 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
837 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
840 ; XOP-LABEL: var_shift_v2i8:
842 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
843 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
844 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
847 ; AVX512DQ-LABEL: var_shift_v2i8:
849 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
850 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
851 ; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
852 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
853 ; AVX512DQ-NEXT: vzeroupper
854 ; AVX512DQ-NEXT: retq
856 ; AVX512BW-LABEL: var_shift_v2i8:
858 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
859 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
860 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
861 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
862 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
863 ; AVX512BW-NEXT: vzeroupper
864 ; AVX512BW-NEXT: retq
866 ; AVX512DQVL-LABEL: var_shift_v2i8:
867 ; AVX512DQVL: # %bb.0:
868 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
869 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
870 ; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
871 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
872 ; AVX512DQVL-NEXT: vzeroupper
873 ; AVX512DQVL-NEXT: retq
875 ; AVX512BWVL-LABEL: var_shift_v2i8:
876 ; AVX512BWVL: # %bb.0:
877 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
878 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
879 ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
880 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
881 ; AVX512BWVL-NEXT: vzeroupper
882 ; AVX512BWVL-NEXT: retq
884 ; X86-SSE-LABEL: var_shift_v2i8:
886 ; X86-SSE-NEXT: psllw $5, %xmm1
887 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
888 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
889 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
890 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
891 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
892 ; X86-SSE-NEXT: psrlw $4, %xmm0
893 ; X86-SSE-NEXT: pand %xmm3, %xmm0
894 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
895 ; X86-SSE-NEXT: por %xmm4, %xmm0
896 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
897 ; X86-SSE-NEXT: pxor %xmm3, %xmm3
898 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3
899 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
900 ; X86-SSE-NEXT: pandn %xmm0, %xmm4
901 ; X86-SSE-NEXT: psrlw $2, %xmm0
902 ; X86-SSE-NEXT: pand %xmm3, %xmm0
903 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
904 ; X86-SSE-NEXT: por %xmm4, %xmm0
905 ; X86-SSE-NEXT: paddb %xmm1, %xmm1
906 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2
907 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1
908 ; X86-SSE-NEXT: pandn %xmm0, %xmm1
909 ; X86-SSE-NEXT: psrlw $1, %xmm0
910 ; X86-SSE-NEXT: pand %xmm2, %xmm0
911 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
912 ; X86-SSE-NEXT: por %xmm1, %xmm0
914 %shift = lshr <2 x i8> %a, %b
915 ret <2 x i8> %shift
916 }
919 ; Uniform Variable Shifts
922 define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
923 ; SSE2-LABEL: splatvar_shift_v2i32:
925 ; SSE2-NEXT: xorps %xmm2, %xmm2
926 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
927 ; SSE2-NEXT: psrld %xmm2, %xmm0
930 ; SSE41-LABEL: splatvar_shift_v2i32:
932 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
933 ; SSE41-NEXT: psrld %xmm1, %xmm0
936 ; AVX-LABEL: splatvar_shift_v2i32:
938 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
939 ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
942 ; XOP-LABEL: splatvar_shift_v2i32:
944 ; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
945 ; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
948 ; AVX512-LABEL: splatvar_shift_v2i32:
950 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
951 ; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
954 ; AVX512VL-LABEL: splatvar_shift_v2i32:
956 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
957 ; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
958 ; AVX512VL-NEXT: retq
960 ; X86-SSE-LABEL: splatvar_shift_v2i32:
962 ; X86-SSE-NEXT: xorps %xmm2, %xmm2
963 ; X86-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
964 ; X86-SSE-NEXT: psrld %xmm2, %xmm0
966 %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
967 %shift = lshr <2 x i32> %a, %splat
968 ret <2 x i32> %shift
969 }
971 define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
972 ; SSE2-LABEL: splatvar_shift_v4i16:
974 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
975 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
976 ; SSE2-NEXT: psrlw %xmm1, %xmm0
979 ; SSE41-LABEL: splatvar_shift_v4i16:
981 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
982 ; SSE41-NEXT: psrlw %xmm1, %xmm0
985 ; AVX-LABEL: splatvar_shift_v4i16:
987 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
988 ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
991 ; XOP-LABEL: splatvar_shift_v4i16:
993 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
994 ; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
997 ; AVX512-LABEL: splatvar_shift_v4i16:
999 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1000 ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1003 ; AVX512VL-LABEL: splatvar_shift_v4i16:
1004 ; AVX512VL: # %bb.0:
1005 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1006 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1007 ; AVX512VL-NEXT: retq
1009 ; X86-SSE-LABEL: splatvar_shift_v4i16:
1011 ; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1012 ; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1013 ; X86-SSE-NEXT: psrlw %xmm1, %xmm0
1014 ; X86-SSE-NEXT: retl
1015 %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
1016 %shift = lshr <4 x i16> %a, %splat
1017 ret <4 x i16> %shift
1018 }
1020 define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
1021 ; SSE2-LABEL: splatvar_shift_v2i16:
1023 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1024 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1025 ; SSE2-NEXT: psrlw %xmm1, %xmm0
1028 ; SSE41-LABEL: splatvar_shift_v2i16:
1030 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1031 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1034 ; AVX-LABEL: splatvar_shift_v2i16:
1036 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1037 ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1040 ; XOP-LABEL: splatvar_shift_v2i16:
1042 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1043 ; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1046 ; AVX512-LABEL: splatvar_shift_v2i16:
1048 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1049 ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1052 ; AVX512VL-LABEL: splatvar_shift_v2i16:
1053 ; AVX512VL: # %bb.0:
1054 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1055 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1056 ; AVX512VL-NEXT: retq
1058 ; X86-SSE-LABEL: splatvar_shift_v2i16:
1060 ; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1061 ; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1062 ; X86-SSE-NEXT: psrlw %xmm1, %xmm0
1063 ; X86-SSE-NEXT: retl
1064 %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
1065 %shift = lshr <2 x i16> %a, %splat
1066 ret <2 x i16> %shift
1067 }
1069 define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
1070 ; SSE2-LABEL: splatvar_shift_v8i8:
1072 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1073 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1074 ; SSE2-NEXT: psrlw %xmm1, %xmm0
1075 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
1076 ; SSE2-NEXT: psrlw %xmm1, %xmm2
1077 ; SSE2-NEXT: psrlw $8, %xmm2
1078 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1079 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1080 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1081 ; SSE2-NEXT: pand %xmm1, %xmm0
1084 ; SSE41-LABEL: splatvar_shift_v8i8:
1086 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1087 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1088 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
1089 ; SSE41-NEXT: psrlw %xmm1, %xmm2
1090 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1091 ; SSE41-NEXT: pand %xmm2, %xmm0
1094 ; AVX1-LABEL: splatvar_shift_v8i8:
1096 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1097 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1098 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1099 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1100 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1101 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1104 ; AVX2-LABEL: splatvar_shift_v8i8:
1106 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1107 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1108 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1109 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1110 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
1111 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1112 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1115 ; XOPAVX1-LABEL: splatvar_shift_v8i8:
1117 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1118 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1119 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1120 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1121 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1122 ; XOPAVX1-NEXT: retq
1124 ; XOPAVX2-LABEL: splatvar_shift_v8i8:
1126 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1127 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1128 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1129 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1130 ; XOPAVX2-NEXT: retq
1132 ; AVX512DQ-LABEL: splatvar_shift_v8i8:
1133 ; AVX512DQ: # %bb.0:
1134 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1135 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1136 ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1137 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1138 ; AVX512DQ-NEXT: vzeroupper
1139 ; AVX512DQ-NEXT: retq
1141 ; AVX512BW-LABEL: splatvar_shift_v8i8:
1142 ; AVX512BW: # %bb.0:
1143 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1144 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1145 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1146 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1147 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1148 ; AVX512BW-NEXT: vzeroupper
1149 ; AVX512BW-NEXT: retq
1151 ; AVX512DQVL-LABEL: splatvar_shift_v8i8:
1152 ; AVX512DQVL: # %bb.0:
1153 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1154 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1155 ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1156 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1157 ; AVX512DQVL-NEXT: vzeroupper
1158 ; AVX512DQVL-NEXT: retq
1160 ; AVX512BWVL-LABEL: splatvar_shift_v8i8:
1161 ; AVX512BWVL: # %bb.0:
1162 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1163 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1164 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1165 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1166 ; AVX512BWVL-NEXT: vzeroupper
1167 ; AVX512BWVL-NEXT: retq
1169 ; X86-SSE-LABEL: splatvar_shift_v8i8:
1171 ; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1172 ; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1173 ; X86-SSE-NEXT: psrlw %xmm1, %xmm0
1174 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2
1175 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2
1176 ; X86-SSE-NEXT: psrlw $8, %xmm2
1177 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1178 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1179 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1180 ; X86-SSE-NEXT: pand %xmm1, %xmm0
1181 ; X86-SSE-NEXT: retl
1182 %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
1183 %shift = lshr <8 x i8> %a, %splat
1184 ret <8 x i8> %shift
1185 }
1187 define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
1188 ; SSE2-LABEL: splatvar_shift_v4i8:
1190 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1191 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1192 ; SSE2-NEXT: psrlw %xmm1, %xmm0
1193 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
1194 ; SSE2-NEXT: psrlw %xmm1, %xmm2
1195 ; SSE2-NEXT: psrlw $8, %xmm2
1196 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1197 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1198 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1199 ; SSE2-NEXT: pand %xmm1, %xmm0
1202 ; SSE41-LABEL: splatvar_shift_v4i8:
1204 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1205 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1206 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
1207 ; SSE41-NEXT: psrlw %xmm1, %xmm2
1208 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1209 ; SSE41-NEXT: pand %xmm2, %xmm0
1212 ; AVX1-LABEL: splatvar_shift_v4i8:
1214 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1215 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1216 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1217 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1218 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1219 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1222 ; AVX2-LABEL: splatvar_shift_v4i8:
1224 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1225 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1226 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1227 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1228 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
1229 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1230 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1233 ; XOPAVX1-LABEL: splatvar_shift_v4i8:
1235 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1236 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1237 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1238 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1239 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1240 ; XOPAVX1-NEXT: retq
1242 ; XOPAVX2-LABEL: splatvar_shift_v4i8:
1244 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1245 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1246 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1247 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1248 ; XOPAVX2-NEXT: retq
1250 ; AVX512DQ-LABEL: splatvar_shift_v4i8:
1251 ; AVX512DQ: # %bb.0:
1252 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1253 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1254 ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1255 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1256 ; AVX512DQ-NEXT: vzeroupper
1257 ; AVX512DQ-NEXT: retq
1259 ; AVX512BW-LABEL: splatvar_shift_v4i8:
1260 ; AVX512BW: # %bb.0:
1261 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1262 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1263 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1264 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1265 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1266 ; AVX512BW-NEXT: vzeroupper
1267 ; AVX512BW-NEXT: retq
1269 ; AVX512DQVL-LABEL: splatvar_shift_v4i8:
1270 ; AVX512DQVL: # %bb.0:
1271 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1272 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1273 ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1274 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1275 ; AVX512DQVL-NEXT: vzeroupper
1276 ; AVX512DQVL-NEXT: retq
1278 ; AVX512BWVL-LABEL: splatvar_shift_v4i8:
1279 ; AVX512BWVL: # %bb.0:
1280 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1281 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1282 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1283 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1284 ; AVX512BWVL-NEXT: vzeroupper
1285 ; AVX512BWVL-NEXT: retq
1287 ; X86-SSE-LABEL: splatvar_shift_v4i8:
1289 ; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1290 ; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1291 ; X86-SSE-NEXT: psrlw %xmm1, %xmm0
1292 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2
1293 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2
1294 ; X86-SSE-NEXT: psrlw $8, %xmm2
1295 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1296 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1297 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1298 ; X86-SSE-NEXT: pand %xmm1, %xmm0
1299 ; X86-SSE-NEXT: retl
1300 %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
1301 %shift = lshr <4 x i8> %a, %splat
1302 ret <4 x i8> %shift
1303 }
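; As with v4i8 above, targets without a native byte shift are expected to shift both the data and an
; all-ones vector by the splatted amount, using the shifted all-ones value as a mask for the bits pulled
; in from the adjacent byte; XOP uses vpshlb with a negated amount and AVX512 widens to words/dwords.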
1305 define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
1306 ; SSE2-LABEL: splatvar_shift_v2i8:
1308 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1309 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1310 ; SSE2-NEXT: psrlw %xmm1, %xmm0
1311 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
1312 ; SSE2-NEXT: psrlw %xmm1, %xmm2
1313 ; SSE2-NEXT: psrlw $8, %xmm2
1314 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1315 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1316 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1317 ; SSE2-NEXT: pand %xmm1, %xmm0
1320 ; SSE41-LABEL: splatvar_shift_v2i8:
1322 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1323 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1324 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
1325 ; SSE41-NEXT: psrlw %xmm1, %xmm2
1326 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1327 ; SSE41-NEXT: pand %xmm2, %xmm0
1330 ; AVX1-LABEL: splatvar_shift_v2i8:
1332 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1333 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1334 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1335 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1336 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1337 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1340 ; AVX2-LABEL: splatvar_shift_v2i8:
1342 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1343 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1344 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1345 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
1346 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
1347 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1348 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1351 ; XOP-LABEL: splatvar_shift_v2i8:
1353 ; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
1354 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
1355 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1356 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1359 ; AVX512DQ-LABEL: splatvar_shift_v2i8:
1360 ; AVX512DQ: # %bb.0:
1361 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1362 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1363 ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1364 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1365 ; AVX512DQ-NEXT: vzeroupper
1366 ; AVX512DQ-NEXT: retq
1368 ; AVX512BW-LABEL: splatvar_shift_v2i8:
1369 ; AVX512BW: # %bb.0:
1370 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1371 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1372 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1373 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1374 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1375 ; AVX512BW-NEXT: vzeroupper
1376 ; AVX512BW-NEXT: retq
1378 ; AVX512DQVL-LABEL: splatvar_shift_v2i8:
1379 ; AVX512DQVL: # %bb.0:
1380 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1381 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1382 ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1383 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1384 ; AVX512DQVL-NEXT: vzeroupper
1385 ; AVX512DQVL-NEXT: retq
1387 ; AVX512BWVL-LABEL: splatvar_shift_v2i8:
1388 ; AVX512BWVL: # %bb.0:
1389 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1390 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1391 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1392 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1393 ; AVX512BWVL-NEXT: vzeroupper
1394 ; AVX512BWVL-NEXT: retq
1396 ; X86-SSE-LABEL: splatvar_shift_v2i8:
1398 ; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1399 ; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1400 ; X86-SSE-NEXT: psrlw %xmm1, %xmm0
1401 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2
1402 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2
1403 ; X86-SSE-NEXT: psrlw $8, %xmm2
1404 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1405 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1406 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1407 ; X86-SSE-NEXT: pand %xmm1, %xmm0
1408 ; X86-SSE-NEXT: retl
1409 %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
1410 %shift = lshr <2 x i8> %a, %splat
1411 ret <2 x i8> %shift
1412 }
1414 ;
1415 ; Constant Shifts
1416 ;
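; The constant-shift tests below use distinct per-lane amounts, so targets without variable shifts are
; expected to emit per-lane immediate shifts joined by shuffles/blends, while AVX2, XOP and AVX512 can
; use a single variable-shift instruction (vpsrlvd/vpshld).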
1418 define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
1419 ; SSE2-LABEL: constant_shift_v2i32:
1421 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1422 ; SSE2-NEXT: psrld $4, %xmm0
1423 ; SSE2-NEXT: psrld $5, %xmm1
1424 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1427 ; SSE41-LABEL: constant_shift_v2i32:
1429 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1430 ; SSE41-NEXT: psrld $5, %xmm1
1431 ; SSE41-NEXT: psrld $4, %xmm0
1432 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1435 ; AVX1-LABEL: constant_shift_v2i32:
1437 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
1438 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
1439 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1442 ; AVX2-LABEL: constant_shift_v2i32:
1444 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1447 ; XOPAVX1-LABEL: constant_shift_v2i32:
1449 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1450 ; XOPAVX1-NEXT: retq
1452 ; XOPAVX2-LABEL: constant_shift_v2i32:
1454 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1455 ; XOPAVX2-NEXT: retq
1457 ; AVX512-LABEL: constant_shift_v2i32:
1459 ; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1462 ; AVX512VL-LABEL: constant_shift_v2i32:
1463 ; AVX512VL: # %bb.0:
1464 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1465 ; AVX512VL-NEXT: retq
1467 ; X86-SSE-LABEL: constant_shift_v2i32:
1469 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1470 ; X86-SSE-NEXT: psrld $4, %xmm0
1471 ; X86-SSE-NEXT: psrld $5, %xmm1
1472 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1473 ; X86-SSE-NEXT: retl
1474 %shift = lshr <2 x i32> %a, <i32 4, i32 5>
1475 ret <2 x i32> %shift
1476 }
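; Constant i16 shifts can be done as pmulhuw by 2^(16-amount) (e.g. 32768 for a shift of 1); the
; amount-0 lane is not representable that way, so it is merged back from the original vector.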
1478 define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
1479 ; SSE2-LABEL: constant_shift_v4i16:
1481 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1482 ; SSE2-NEXT: pandn %xmm0, %xmm1
1483 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1484 ; SSE2-NEXT: por %xmm1, %xmm0
1487 ; SSE41-LABEL: constant_shift_v4i16:
1489 ; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
1490 ; SSE41-NEXT: pmulhuw %xmm0, %xmm1
1491 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1494 ; AVX-LABEL: constant_shift_v4i16:
1496 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
1497 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1500 ; XOP-LABEL: constant_shift_v4i16:
1502 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1505 ; AVX512DQ-LABEL: constant_shift_v4i16:
1506 ; AVX512DQ: # %bb.0:
1507 ; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
1508 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1509 ; AVX512DQ-NEXT: retq
1511 ; AVX512BW-LABEL: constant_shift_v4i16:
1512 ; AVX512BW: # %bb.0:
1513 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1514 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
1515 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1516 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1517 ; AVX512BW-NEXT: vzeroupper
1518 ; AVX512BW-NEXT: retq
1520 ; AVX512DQVL-LABEL: constant_shift_v4i16:
1521 ; AVX512DQVL: # %bb.0:
1522 ; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
1523 ; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1524 ; AVX512DQVL-NEXT: retq
1526 ; AVX512BWVL-LABEL: constant_shift_v4i16:
1527 ; AVX512BWVL: # %bb.0:
1528 ; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1529 ; AVX512BWVL-NEXT: retq
1531 ; X86-SSE-LABEL: constant_shift_v4i16:
1533 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1534 ; X86-SSE-NEXT: pandn %xmm0, %xmm1
1535 ; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1536 ; X86-SSE-NEXT: por %xmm1, %xmm0
1537 ; X86-SSE-NEXT: retl
1538 %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
1539 ret <4 x i16> %shift
1540 }
1542 define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
1543 ; SSE2-LABEL: constant_shift_v2i16:
1545 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1546 ; SSE2-NEXT: psrlw $3, %xmm1
1547 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1548 ; SSE2-NEXT: psrlw $2, %xmm0
1549 ; SSE2-NEXT: pand %xmm2, %xmm0
1550 ; SSE2-NEXT: pandn %xmm1, %xmm2
1551 ; SSE2-NEXT: por %xmm2, %xmm0
1554 ; SSE41-LABEL: constant_shift_v2i16:
1556 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1557 ; SSE41-NEXT: psrlw $3, %xmm1
1558 ; SSE41-NEXT: psrlw $2, %xmm0
1559 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1562 ; AVX-LABEL: constant_shift_v2i16:
1564 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
1565 ; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
1566 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1569 ; XOP-LABEL: constant_shift_v2i16:
1571 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1574 ; AVX512DQ-LABEL: constant_shift_v2i16:
1575 ; AVX512DQ: # %bb.0:
1576 ; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1
1577 ; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0
1578 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1579 ; AVX512DQ-NEXT: retq
1581 ; AVX512BW-LABEL: constant_shift_v2i16:
1582 ; AVX512BW: # %bb.0:
1583 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1584 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
1585 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1586 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1587 ; AVX512BW-NEXT: vzeroupper
1588 ; AVX512BW-NEXT: retq
1590 ; AVX512DQVL-LABEL: constant_shift_v2i16:
1591 ; AVX512DQVL: # %bb.0:
1592 ; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1
1593 ; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0
1594 ; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1595 ; AVX512DQVL-NEXT: retq
1597 ; AVX512BWVL-LABEL: constant_shift_v2i16:
1598 ; AVX512BWVL: # %bb.0:
1599 ; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1600 ; AVX512BWVL-NEXT: retq
1602 ; X86-SSE-LABEL: constant_shift_v2i16:
1604 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1
1605 ; X86-SSE-NEXT: psrlw $3, %xmm1
1606 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1607 ; X86-SSE-NEXT: psrlw $2, %xmm0
1608 ; X86-SSE-NEXT: pand %xmm2, %xmm0
1609 ; X86-SSE-NEXT: pandn %xmm1, %xmm2
1610 ; X86-SSE-NEXT: por %xmm2, %xmm0
1611 ; X86-SSE-NEXT: retl
1612 %shift = lshr <2 x i16> %a, <i16 2, i16 3>
1613 ret <2 x i16> %shift
1614 }
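; There is no per-byte shift instruction outside of XOP, so the constant byte shifts below are expected
; to zero-extend to words, pmullw by (256 >> amount), shift right by 8 and pack, i.e.
; x >> c == (x * (256 >> c)) >> 8 for zero-extended bytes.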
1616 define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
1617 ; SSE2-LABEL: constant_shift_v8i8:
1619 ; SSE2-NEXT: pxor %xmm1, %xmm1
1620 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1621 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1622 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1623 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
1624 ; SSE2-NEXT: psrlw $8, %xmm0
1625 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1628 ; SSE41-LABEL: constant_shift_v8i8:
1630 ; SSE41-NEXT: pxor %xmm2, %xmm2
1631 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1632 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1633 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
1634 ; SSE41-NEXT: psrlw $8, %xmm1
1635 ; SSE41-NEXT: packuswb %xmm0, %xmm1
1636 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1639 ; AVX1-LABEL: constant_shift_v8i8:
1641 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1642 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1643 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1644 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
1645 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1646 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1649 ; AVX2-LABEL: constant_shift_v8i8:
1651 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1652 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
1653 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1654 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1655 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1656 ; AVX2-NEXT: vzeroupper
1659 ; XOP-LABEL: constant_shift_v8i8:
1661 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1664 ; AVX512DQ-LABEL: constant_shift_v8i8:
1665 ; AVX512DQ: # %bb.0:
1666 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1667 ; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1668 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1669 ; AVX512DQ-NEXT: vzeroupper
1670 ; AVX512DQ-NEXT: retq
1672 ; AVX512BW-LABEL: constant_shift_v8i8:
1673 ; AVX512BW: # %bb.0:
1674 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
1675 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1676 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1677 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1678 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1679 ; AVX512BW-NEXT: vzeroupper
1680 ; AVX512BW-NEXT: retq
1682 ; AVX512DQVL-LABEL: constant_shift_v8i8:
1683 ; AVX512DQVL: # %bb.0:
1684 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1685 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1686 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1687 ; AVX512DQVL-NEXT: vzeroupper
1688 ; AVX512DQVL-NEXT: retq
1690 ; AVX512BWVL-LABEL: constant_shift_v8i8:
1691 ; AVX512BWVL: # %bb.0:
1692 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1693 ; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1694 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1695 ; AVX512BWVL-NEXT: vzeroupper
1696 ; AVX512BWVL-NEXT: retq
1698 ; X86-SSE-LABEL: constant_shift_v8i8:
1700 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1701 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1702 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1703 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1704 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
1705 ; X86-SSE-NEXT: psrlw $8, %xmm0
1706 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0
1707 ; X86-SSE-NEXT: retl
1708 %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1709 ret <8 x i8> %shift
1710 }
1712 define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
1713 ; SSE2-LABEL: constant_shift_v4i8:
1715 ; SSE2-NEXT: pxor %xmm1, %xmm1
1716 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1717 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1718 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1719 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
1720 ; SSE2-NEXT: psrlw $8, %xmm0
1721 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1724 ; SSE41-LABEL: constant_shift_v4i8:
1726 ; SSE41-NEXT: pxor %xmm2, %xmm2
1727 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1728 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1729 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256]
1730 ; SSE41-NEXT: psrlw $8, %xmm1
1731 ; SSE41-NEXT: packuswb %xmm0, %xmm1
1732 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1735 ; AVX1-LABEL: constant_shift_v4i8:
1737 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1738 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1739 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1740 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
1741 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1742 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1745 ; AVX2-LABEL: constant_shift_v4i8:
1747 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1748 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
1749 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1750 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1751 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1752 ; AVX2-NEXT: vzeroupper
1755 ; XOP-LABEL: constant_shift_v4i8:
1757 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1760 ; AVX512DQ-LABEL: constant_shift_v4i8:
1761 ; AVX512DQ: # %bb.0:
1762 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1763 ; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1764 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1765 ; AVX512DQ-NEXT: vzeroupper
1766 ; AVX512DQ-NEXT: retq
1768 ; AVX512BW-LABEL: constant_shift_v4i8:
1769 ; AVX512BW: # %bb.0:
1770 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
1771 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1772 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1773 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1774 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1775 ; AVX512BW-NEXT: vzeroupper
1776 ; AVX512BW-NEXT: retq
1778 ; AVX512DQVL-LABEL: constant_shift_v4i8:
1779 ; AVX512DQVL: # %bb.0:
1780 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1781 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1782 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1783 ; AVX512DQVL-NEXT: vzeroupper
1784 ; AVX512DQVL-NEXT: retq
1786 ; AVX512BWVL-LABEL: constant_shift_v4i8:
1787 ; AVX512BWVL: # %bb.0:
1788 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1789 ; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1790 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1791 ; AVX512BWVL-NEXT: vzeroupper
1792 ; AVX512BWVL-NEXT: retq
1794 ; X86-SSE-LABEL: constant_shift_v4i8:
1796 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1797 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1798 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1799 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1800 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
1801 ; X86-SSE-NEXT: psrlw $8, %xmm0
1802 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0
1803 ; X86-SSE-NEXT: retl
1804 %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
1805 ret <4 x i8> %shift
1806 }
1808 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
1809 ; SSE2-LABEL: constant_shift_v2i8:
1811 ; SSE2-NEXT: pxor %xmm1, %xmm1
1812 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1813 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1814 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1815 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
1816 ; SSE2-NEXT: psrlw $8, %xmm0
1817 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1820 ; SSE41-LABEL: constant_shift_v2i8:
1822 ; SSE41-NEXT: pxor %xmm2, %xmm2
1823 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1824 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1825 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
1826 ; SSE41-NEXT: psrlw $8, %xmm1
1827 ; SSE41-NEXT: packuswb %xmm0, %xmm1
1828 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1831 ; AVX1-LABEL: constant_shift_v2i8:
1833 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1834 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1835 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1836 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
1837 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1838 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1841 ; AVX2-LABEL: constant_shift_v2i8:
1843 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1844 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
1845 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1846 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1847 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1848 ; AVX2-NEXT: vzeroupper
1851 ; XOP-LABEL: constant_shift_v2i8:
1853 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1856 ; AVX512DQ-LABEL: constant_shift_v2i8:
1857 ; AVX512DQ: # %bb.0:
1858 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1859 ; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1860 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1861 ; AVX512DQ-NEXT: vzeroupper
1862 ; AVX512DQ-NEXT: retq
1864 ; AVX512BW-LABEL: constant_shift_v2i8:
1865 ; AVX512BW: # %bb.0:
1866 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
1867 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1868 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1869 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1870 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1871 ; AVX512BW-NEXT: vzeroupper
1872 ; AVX512BW-NEXT: retq
1874 ; AVX512DQVL-LABEL: constant_shift_v2i8:
1875 ; AVX512DQVL: # %bb.0:
1876 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1877 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1878 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1879 ; AVX512DQVL-NEXT: vzeroupper
1880 ; AVX512DQVL-NEXT: retq
1882 ; AVX512BWVL-LABEL: constant_shift_v2i8:
1883 ; AVX512BWVL: # %bb.0:
1884 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1885 ; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1886 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1887 ; AVX512BWVL-NEXT: vzeroupper
1888 ; AVX512BWVL-NEXT: retq
1890 ; X86-SSE-LABEL: constant_shift_v2i8:
1892 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1893 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1894 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1895 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1896 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
1897 ; X86-SSE-NEXT: psrlw $8, %xmm0
1898 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0
1899 ; X86-SSE-NEXT: retl
1900 %shift = lshr <2 x i8> %a, <i8 2, i8 3>
1901 ret <2 x i8> %shift
1902 }
1904 ;
1905 ; Uniform Constant Shifts
1906 ;
1908 define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
1909 ; SSE-LABEL: splatconstant_shift_v2i32:
1911 ; SSE-NEXT: psrld $5, %xmm0
1914 ; AVX-LABEL: splatconstant_shift_v2i32:
1916 ; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
1919 ; XOP-LABEL: splatconstant_shift_v2i32:
1921 ; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
1924 ; AVX512-LABEL: splatconstant_shift_v2i32:
1926 ; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
1929 ; AVX512VL-LABEL: splatconstant_shift_v2i32:
1930 ; AVX512VL: # %bb.0:
1931 ; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
1932 ; AVX512VL-NEXT: retq
1934 ; X86-SSE-LABEL: splatconstant_shift_v2i32:
1936 ; X86-SSE-NEXT: psrld $5, %xmm0
1937 ; X86-SSE-NEXT: retl
1938 %shift = lshr <2 x i32> %a, <i32 5, i32 5>
1939 ret <2 x i32> %shift
1940 }
1942 define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
1943 ; SSE-LABEL: splatconstant_shift_v4i16:
1945 ; SSE-NEXT: psrlw $3, %xmm0
1948 ; AVX-LABEL: splatconstant_shift_v4i16:
1950 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
1953 ; XOP-LABEL: splatconstant_shift_v4i16:
1955 ; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
1958 ; AVX512-LABEL: splatconstant_shift_v4i16:
1960 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
1963 ; AVX512VL-LABEL: splatconstant_shift_v4i16:
1964 ; AVX512VL: # %bb.0:
1965 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
1966 ; AVX512VL-NEXT: retq
1968 ; X86-SSE-LABEL: splatconstant_shift_v4i16:
1970 ; X86-SSE-NEXT: psrlw $3, %xmm0
1971 ; X86-SSE-NEXT: retl
1972 %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
1973 ret <4 x i16> %shift
1974 }
1976 define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
1977 ; SSE-LABEL: splatconstant_shift_v2i16:
1979 ; SSE-NEXT: psrlw $3, %xmm0
1982 ; AVX-LABEL: splatconstant_shift_v2i16:
1984 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
1987 ; XOP-LABEL: splatconstant_shift_v2i16:
1989 ; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
1992 ; AVX512-LABEL: splatconstant_shift_v2i16:
1994 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
1997 ; AVX512VL-LABEL: splatconstant_shift_v2i16:
1998 ; AVX512VL: # %bb.0:
1999 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
2000 ; AVX512VL-NEXT: retq
2002 ; X86-SSE-LABEL: splatconstant_shift_v2i16:
2004 ; X86-SSE-NEXT: psrlw $3, %xmm0
2005 ; X86-SSE-NEXT: retl
2006 %shift = lshr <2 x i16> %a, <i16 3, i16 3>
2007 ret <2 x i16> %shift
2008 }
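; Uniform (splat-constant) byte shifts are expected to use a word shift plus an AND with a constant
; mask that clears the bits shifted in from the adjacent byte; XOP can use vpshlb directly.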
2010 define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
2011 ; SSE-LABEL: splatconstant_shift_v8i8:
2013 ; SSE-NEXT: psrlw $3, %xmm0
2014 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2017 ; AVX-LABEL: splatconstant_shift_v8i8:
2019 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
2020 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2023 ; XOP-LABEL: splatconstant_shift_v8i8:
2025 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2028 ; AVX512-LABEL: splatconstant_shift_v8i8:
2030 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
2031 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2034 ; AVX512VL-LABEL: splatconstant_shift_v8i8:
2035 ; AVX512VL: # %bb.0:
2036 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
2037 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
2038 ; AVX512VL-NEXT: retq
2040 ; X86-SSE-LABEL: splatconstant_shift_v8i8:
2042 ; X86-SSE-NEXT: psrlw $3, %xmm0
2043 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2044 ; X86-SSE-NEXT: retl
2045 %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
2046 ret <8 x i8> %shift
2047 }
2049 define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
2050 ; SSE-LABEL: splatconstant_shift_v4i8:
2052 ; SSE-NEXT: psrlw $3, %xmm0
2053 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2056 ; AVX-LABEL: splatconstant_shift_v4i8:
2058 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
2059 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2062 ; XOP-LABEL: splatconstant_shift_v4i8:
2064 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2067 ; AVX512-LABEL: splatconstant_shift_v4i8:
2069 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
2070 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2073 ; AVX512VL-LABEL: splatconstant_shift_v4i8:
2074 ; AVX512VL: # %bb.0:
2075 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
2076 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
2077 ; AVX512VL-NEXT: retq
2079 ; X86-SSE-LABEL: splatconstant_shift_v4i8:
2081 ; X86-SSE-NEXT: psrlw $3, %xmm0
2082 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2083 ; X86-SSE-NEXT: retl
2084 %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
2085 ret <4 x i8> %shift
2086 }
2088 define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
2089 ; SSE-LABEL: splatconstant_shift_v2i8:
2091 ; SSE-NEXT: psrlw $3, %xmm0
2092 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2095 ; AVX-LABEL: splatconstant_shift_v2i8:
2097 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
2098 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2101 ; XOP-LABEL: splatconstant_shift_v2i8:
2103 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2106 ; AVX512-LABEL: splatconstant_shift_v2i8:
2108 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
2109 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2112 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
2113 ; AVX512VL: # %bb.0:
2114 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
2115 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
2116 ; AVX512VL-NEXT: retq
2118 ; X86-SSE-LABEL: splatconstant_shift_v2i8:
2120 ; X86-SSE-NEXT: psrlw $3, %xmm0
2121 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2122 ; X86-SSE-NEXT: retl
2123 %shift = lshr <2 x i8> %a, <i8 3, i8 3>
2124 ret <2 x i8> %shift
2125 }