; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL

; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: xorps %xmm3, %xmm3
; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; X32-SSE-NEXT: psrlq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, %b
  ret <2 x i32> %shift
}
define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, %b
  ret <4 x i16> %shift
}
define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, %b
  ret <2 x i16> %shift
}
define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, %b
  ret <8 x i8> %shift
}
define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, %b
  ret <4 x i8> %shift
}
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, %b
  ret <2 x i8> %shift
}

;
; Uniform Variable Shifts
;
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i32> %a, %splat
  ret <2 x i32> %shift
}
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm1, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld %xmm4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm1, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld %xmm4, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i16> %a, %splat
  ret <4 x i16> %shift
}
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i16> %a, %splat
  ret <2 x i16> %shift
}
define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i8> %a, %splat
  ret <8 x i8> %shift
}
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm1, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld %xmm4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm1, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld %xmm4, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i8> %a, %splat
  ret <4 x i8> %shift
}

define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i8> %a, %splat
  ret <2 x i8> %shift
}

;
; Constant Shifts
;
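; Even with constant shift amounts the inputs are still pre-masked (pand
; against 255/65535/etc.): the high bits of each widened lane are undefined,
; and a logical right shift would otherwise pull them into the live bits.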
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: psrlq $5, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq $5, %xmm0
; SSE41-NEXT: psrlq $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $4, %xmm1
; X32-SSE-NEXT: psrlq $5, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, <i32 4, i32 5>
  ret <2 x i32> %shift
}

define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: psrld $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $2, %xmm2
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $1, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
  ret <4 x i16> %shift
}

define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq $3, %xmm0
; SSE41-NEXT: psrlq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlq {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, <i16 2, i16 3>
  ret <2 x i16> %shift
}

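; The v8i8 constant shift below uses a multiply-high trick: for a 16-bit lane,
; lshr by N equals pmulhuw by 2^(16-N) (32768 for N=1, 16384 for N=2, ...),
; with the unshifted lane 0 blended back in afterwards.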
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  ret <8 x i8> %shift
}

define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrld $3, %xmm0
; SSE41-NEXT: psrld $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $2, %xmm2
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $1, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
  ret <4 x i8> %shift
}

define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $3, %xmm1
; SSE41-NEXT: psrlq $2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, <i8 2, i8 3>
  ret <2 x i8> %shift
}

;
; Uniform Constant Shifts
;
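; With a splatted constant amount the shift itself folds to a single
; psrlw/psrld/psrlq after the usual lane masking; only the 32-bit v2i16 case
; still loads its amount from the constant pool.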
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlq $5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: psrlq $5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOPAVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v2i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; XOPAVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512VL-NEXT: vpsrlq $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlq $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i32> %a, <i32 5, i32 5>
  ret <2 x i32> %shift
}

define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrld $3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: psrld $3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
  ret <4 x i16> %shift
}

define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlq $3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: psrlq $3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlq {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i16> %a, <i16 3, i16 3>
  ret <2 x i16> %shift
}

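; No post-shift byte mask is needed in the vXi8 cases below: the pre-shift
; pand has already zeroed the upper bits of every widened lane, so the shift
; can only bring zeros into each byte.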
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <8 x i8> %shift
}

define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
  ret <4 x i8> %shift
}

define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlq $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i8> %a, <i8 3, i8 3>
  ret <2 x i8> %shift
}