; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

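; All of these sub-128-bit vector types are widened to 128-bit registers, so
; the interesting part of each lowering below is how the shift amounts (and,
; since this is a logical right shift, the shifted elements themselves) are
; masked back down to the narrow element width before a full-width shift
; instruction is reused.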
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: xorps %xmm3, %xmm3
; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; X32-SSE-NEXT: psrlq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, %b
  ret <2 x i32> %shift
}

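; Without a variable-count i32 shift (pre-AVX2), v2i32 is emulated in the two
; i64 lanes: both operands are masked down to 32 bits, each lane is shifted
; separately with psrlq, and the halves are recombined with a move/blend.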
define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, %b
  ret <4 x i16> %shift
}

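; v4i16 sits in the i32 lanes, and pre-AVX2 targets have no per-element i32
; shift, so SSE2/SSE4.1 extract each amount, issue four uniform psrld shifts,
; and reassemble the lanes with shuffles/blends.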
define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, %b
  ret <2 x i16> %shift
}

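; v2i16 is widened so that each element sits in an i64 lane, which lets the
; lowering reuse the v2i32 approach: mask to 16 bits, shift each lane with
; psrlq, and blend the two results back together.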
define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, %b
  ret <8 x i8> %shift
}

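; v8i8 occupies the i16 lanes, so SSE targets run a shift ladder: the amounts
; are moved into the sign bits with psllw $12, psraw $15 turns them into lane
; masks, and conditional psrlw $4/$2/$1 steps apply the shift bit-group by
; bit-group. AVX512BW instead masks the elements to 255 and uses its native
; variable i16 shift (vpsrlvw).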
define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, %b
  ret <4 x i8> %shift
}

define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, %b
  ret <2 x i8> %shift
}

;
; Uniform Variable Shifts
;

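; In the splatvar tests every element is shifted by element 0 of %b; note
; that the broadcast shift amount still has to be re-masked to the narrow
; element width after the splat.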
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i32> %a, %splat
  ret <2 x i32> %shift
}

define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm1, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld %xmm4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm1, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld %xmm4, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i16> %a, %splat
  ret <4 x i16> %shift
}

define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i16> %a, %splat
  ret <2 x i16> %shift
}

define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i8> %a, %splat
  ret <8 x i8> %shift
}

define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm1, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld %xmm4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm1, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld %xmm4, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i8> %a, %splat
  ret <4 x i8> %shift
}

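; Note: <2 x i8> appears to be promoted to <2 x i64>, and psrlq/vpsrlq take
; their count from the low 64 bits of the source, so the splatted amount is
; duplicated into both quadwords, each half is shifted separately, and the
; two results are recombined with movsd/pblendw. The vmovddup constant
; 1.2598673968951787E-321 is again the 255 mask, this time printed as a
; denormal double.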
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i8> %a, %splat
  ret <2 x i8> %shift
}

;
; Constant Shifts
;

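; Note: with constant amounts only the value operand still needs its widened
; lanes cleared; the amounts become immediates. For <2 x i32> <4, 5> below
; that means pand (or a blend with zero), psrlq $4 and psrlq $5, and a
; movsd/pblendw taking lane 0 from one result and lane 1 from the other.
; AVX2 and later instead load the amounts from the constant pool for a
; single vpsrlvq.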
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: psrlq $5, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq $5, %xmm0
; SSE41-NEXT: psrlq $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $4, %xmm1
; X32-SSE-NEXT: psrlq $5, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, <i32 4, i32 5>
  ret <2 x i32> %shift
}

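; Note: for four distinct i32 amounts, SSE2 appears to shift the whole
; vector once per amount (psrld $1/$2/$3 plus the unshifted input) and then
; stitch one lane from each copy back together with punpcklqdq/punpckhqdq
; and a final shufps; SSE4.1 uses cheaper pblendw merges for the same job,
; and AVX2 and later collapse it all into one vpsrlvd.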
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: psrld $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $2, %xmm2
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $1, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
  ret <4 x i16> %shift
}

define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq $3, %xmm0
; SSE41-NEXT: psrlq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $2, %xmm1
; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, <i16 2, i16 3>
  ret <2 x i16> %shift
}

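; Note: constant i16 shifts can be folded into a single unsigned
; multiply-high, since for 1 <= n <= 15:
;   x >> n == (x * (1 << (16 - n))) >> 16
; which is what pmulhuw against <u,32768,16384,8192,4096,2048,1024,512>
; computes below. The shift-by-0 lane has no in-range multiplier, so it is
; merged back in with pblendw (SSE4.1 and later) or a pand/pandn/por select
; (SSE2).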
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  ret <8 x i8> %shift
}

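; Note: after promotion this presumably lowers exactly like
; constant_shift_v4i16 above: the instruction sequences match, and only the
; (not checked) pand mask constant should differ, 255 rather than 65535.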
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrld $3, %xmm0
; SSE41-NEXT: psrld $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $2, %xmm2
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $1, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
  ret <4 x i8> %shift
}

define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $3, %xmm1
; SSE41-NEXT: psrlq $2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $2, %xmm1
; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, <i8 2, i8 3>
  ret <2 x i8> %shift
}

;
; Uniform Constant Shifts
;

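; Note: when every lane shifts by the same immediate, only the input mask
; should survive legalization: each test below is expected to collapse to a
; mask of the widened lanes (pand, or a blend with zero on SSE4.1 and
; later) plus one immediate shift of the promoted type, which the X32-SSE
; runs show in their plainest pand-plus-shift form.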
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlq $5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: psrlq $5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOPAVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; XOPAVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlq $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, <i32 5, i32 5>
  ret <2 x i32> %shift
}

define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrld $3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: psrld $3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
  ret <4 x i16> %shift
}

define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: psrlq $3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, <i16 3, i16 3>
  ret <2 x i16> %shift
}

define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <8 x i8> %shift
}

define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
  ret <4 x i8> %shift
}

define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlq $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, <i8 3, i8 3>
  ret <2 x i8> %shift
}