1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
11 ; 32-bit runs to make sure we do reasonable things for i64 shifts.
12 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
13 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
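; NOTE: If codegen changes, the CHECK lines below are intended to be regenerated
; with the script named above rather than edited by hand; a typical invocation
; (assuming a built llc is available on PATH, and substituting the real path of
; this file for the placeholder) looks like:
;   utils/update_llc_test_checks.py <path/to/this/test.ll>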
19 define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
20 ; AVX1-LABEL: var_shift_v4i64:
22 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
23 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
24 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
25 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
26 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
27 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
28 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
29 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
30 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
32 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
35 ; AVX2-LABEL: var_shift_v4i64:
37 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
40 ; XOPAVX1-LABEL: var_shift_v4i64:
42 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
43 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
44 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
45 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
46 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm4, %xmm2
47 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
48 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
49 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
52 ; XOPAVX2-LABEL: var_shift_v4i64:
54 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
57 ; AVX512-LABEL: var_shift_v4i64:
59 ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
62 ; AVX512VL-LABEL: var_shift_v4i64:
64 ; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
67 ; X32-AVX1-LABEL: var_shift_v4i64:
69 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
70 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
71 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
72 ; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
73 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
74 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
75 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
76 ; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
77 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
78 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
79 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
82 ; X32-AVX2-LABEL: var_shift_v4i64:
84 ; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}
90 define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
91 ; AVX1-LABEL: var_shift_v8i32:
93 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
94 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
95 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
96 ; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
97 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
98 ; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
99 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
100 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
101 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
102 ; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
103 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
104 ; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
105 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
106 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
107 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
108 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
109 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
110 ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
111 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
112 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
113 ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
114 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
115 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
116 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
117 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
118 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
121 ; AVX2-LABEL: var_shift_v8i32:
123 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
126 ; XOPAVX1-LABEL: var_shift_v8i32:
128 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
129 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
130 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
131 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
132 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm4, %xmm2
133 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
134 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
135 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
138 ; XOPAVX2-LABEL: var_shift_v8i32:
140 ; XOPAVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
143 ; AVX512-LABEL: var_shift_v8i32:
145 ; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
148 ; AVX512VL-LABEL: var_shift_v8i32:
150 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
151 ; AVX512VL-NEXT: retq
153 ; X32-AVX1-LABEL: var_shift_v8i32:
155 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
156 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
157 ; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
158 ; X32-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
159 ; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
160 ; X32-AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
161 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
162 ; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
163 ; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
164 ; X32-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
165 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
166 ; X32-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
167 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
168 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
169 ; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
170 ; X32-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
171 ; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
172 ; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
173 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
174 ; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
175 ; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
176 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
177 ; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
178 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
179 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
180 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
181 ; X32-AVX1-NEXT: retl
183 ; X32-AVX2-LABEL: var_shift_v8i32:
185 ; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
186 ; X32-AVX2-NEXT: retl
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}
191 define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
192 ; AVX1-LABEL: var_shift_v16i16:
194 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
195 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
196 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
197 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
198 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
199 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
200 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
201 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
202 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
203 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
204 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
205 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
206 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
207 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
208 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
209 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
210 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
211 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
212 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
213 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
214 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
215 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
216 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
217 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
218 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
219 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
220 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
221 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
222 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
223 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
224 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
227 ; AVX2-LABEL: var_shift_v16i16:
229 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
230 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
231 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
232 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
233 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
234 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
235 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
236 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
237 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
238 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
241 ; XOPAVX1-LABEL: var_shift_v16i16:
243 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
244 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
245 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
246 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
247 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm4, %xmm2
248 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
249 ; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
250 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
253 ; XOPAVX2-LABEL: var_shift_v16i16:
255 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
256 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
257 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
258 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
259 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm4, %xmm2
260 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
261 ; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
262 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
265 ; AVX512DQ-LABEL: var_shift_v16i16:
267 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
268 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
269 ; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
270 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
271 ; AVX512DQ-NEXT: retq
273 ; AVX512BW-LABEL: var_shift_v16i16:
275 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
276 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
277 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
278 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
279 ; AVX512BW-NEXT: retq
281 ; AVX512DQVL-LABEL: var_shift_v16i16:
282 ; AVX512DQVL: # %bb.0:
283 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
284 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
285 ; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
286 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
287 ; AVX512DQVL-NEXT: retq
289 ; AVX512BWVL-LABEL: var_shift_v16i16:
290 ; AVX512BWVL: # %bb.0:
291 ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
292 ; AVX512BWVL-NEXT: retq
294 ; X32-AVX1-LABEL: var_shift_v16i16:
296 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
297 ; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
298 ; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
299 ; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
300 ; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
301 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
302 ; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
303 ; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
304 ; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
305 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
306 ; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
307 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
308 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
309 ; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
310 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
311 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
312 ; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
313 ; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
314 ; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
315 ; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
316 ; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
317 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
318 ; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
319 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
320 ; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
321 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
322 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
323 ; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
324 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
325 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
326 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
327 ; X32-AVX1-NEXT: retl
329 ; X32-AVX2-LABEL: var_shift_v16i16:
331 ; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
332 ; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
333 ; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
334 ; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
335 ; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
336 ; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
337 ; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
338 ; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
339 ; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
340 ; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
341 ; X32-AVX2-NEXT: retl
342 %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}
346 define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
347 ; AVX1-LABEL: var_shift_v32i8:
349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
350 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
351 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
352 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
353 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
354 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
355 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
356 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
357 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
358 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
359 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
360 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
361 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
362 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
363 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
364 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
365 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
366 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
367 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
368 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
369 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
370 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
371 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
372 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
373 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
374 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
375 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
376 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
377 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
378 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
381 ; AVX2-LABEL: var_shift_v32i8:
383 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
384 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
385 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
386 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
387 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
388 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
389 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
390 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
391 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
392 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
393 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
394 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
397 ; XOPAVX1-LABEL: var_shift_v32i8:
399 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
400 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
401 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
402 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
403 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm4, %xmm2
404 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
405 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
406 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
409 ; XOPAVX2-LABEL: var_shift_v32i8:
411 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
412 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
413 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
414 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
415 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm4, %xmm2
416 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
417 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
418 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
421 ; AVX512DQ-LABEL: var_shift_v32i8:
423 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
424 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
425 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
426 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
427 ; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
428 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
429 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
430 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
431 ; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2
432 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
433 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
434 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
435 ; AVX512DQ-NEXT: retq
437 ; AVX512BW-LABEL: var_shift_v32i8:
439 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
440 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
441 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
442 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
443 ; AVX512BW-NEXT: retq
445 ; AVX512DQVL-LABEL: var_shift_v32i8:
446 ; AVX512DQVL: # %bb.0:
447 ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
448 ; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
449 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
450 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
451 ; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
452 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
453 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
454 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
455 ; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
456 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
457 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
458 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
459 ; AVX512DQVL-NEXT: retq
461 ; AVX512BWVL-LABEL: var_shift_v32i8:
462 ; AVX512BWVL: # %bb.0:
463 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
464 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
465 ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
466 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
467 ; AVX512BWVL-NEXT: retq
469 ; X32-AVX1-LABEL: var_shift_v32i8:
471 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
472 ; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
473 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
474 ; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
475 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
476 ; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
477 ; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
478 ; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
479 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
480 ; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
481 ; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
482 ; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
483 ; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
484 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
485 ; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
486 ; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
487 ; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
488 ; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
489 ; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
490 ; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
491 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
492 ; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
493 ; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
494 ; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
495 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
496 ; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
497 ; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
498 ; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
499 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
500 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
501 ; X32-AVX1-NEXT: retl
503 ; X32-AVX2-LABEL: var_shift_v32i8:
505 ; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
506 ; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
507 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
508 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
509 ; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
510 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
511 ; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
512 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
513 ; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
514 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
515 ; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
516 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
517 ; X32-AVX2-NEXT: retl
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}
523 ; Uniform Variable Shifts
526 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
527 ; AVX1-LABEL: splatvar_shift_v4i64:
529 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
530 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
531 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
532 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
535 ; AVX2-LABEL: splatvar_shift_v4i64:
537 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
540 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
542 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
543 ; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
544 ; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
545 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
548 ; XOPAVX2-LABEL: splatvar_shift_v4i64:
550 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
553 ; AVX512-LABEL: splatvar_shift_v4i64:
555 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
558 ; AVX512VL-LABEL: splatvar_shift_v4i64:
560 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
561 ; AVX512VL-NEXT: retq
563 ; X32-AVX1-LABEL: splatvar_shift_v4i64:
565 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
566 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
567 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
568 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
569 ; X32-AVX1-NEXT: retl
571 ; X32-AVX2-LABEL: splatvar_shift_v4i64:
573 ; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
574 ; X32-AVX2-NEXT: retl
575 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}
580 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
581 ; AVX1-LABEL: splatvar_shift_v8i32:
583 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
584 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
585 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
586 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
587 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
590 ; AVX2-LABEL: splatvar_shift_v8i32:
592 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
593 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
596 ; XOPAVX1-LABEL: splatvar_shift_v8i32:
598 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
599 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
600 ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
601 ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
602 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
605 ; XOPAVX2-LABEL: splatvar_shift_v8i32:
607 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
608 ; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
611 ; AVX512-LABEL: splatvar_shift_v8i32:
613 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
614 ; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
617 ; AVX512VL-LABEL: splatvar_shift_v8i32:
619 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
620 ; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
621 ; AVX512VL-NEXT: retq
623 ; X32-AVX1-LABEL: splatvar_shift_v8i32:
625 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
626 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
627 ; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
628 ; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
629 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
630 ; X32-AVX1-NEXT: retl
632 ; X32-AVX2-LABEL: splatvar_shift_v8i32:
634 ; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
635 ; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
636 ; X32-AVX2-NEXT: retl
637 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}
642 define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
643 ; AVX1-LABEL: splatvar_shift_v16i16:
645 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
646 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
647 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
648 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
649 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
652 ; AVX2-LABEL: splatvar_shift_v16i16:
654 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
655 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
658 ; XOPAVX1-LABEL: splatvar_shift_v16i16:
660 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
661 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
662 ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
663 ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
664 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
667 ; XOPAVX2-LABEL: splatvar_shift_v16i16:
669 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
670 ; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
673 ; AVX512-LABEL: splatvar_shift_v16i16:
675 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
676 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
679 ; AVX512VL-LABEL: splatvar_shift_v16i16:
681 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
682 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
683 ; AVX512VL-NEXT: retq
685 ; X32-AVX1-LABEL: splatvar_shift_v16i16:
687 ; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
688 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
689 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
690 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
691 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
692 ; X32-AVX1-NEXT: retl
694 ; X32-AVX2-LABEL: splatvar_shift_v16i16:
696 ; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
697 ; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
698 ; X32-AVX2-NEXT: retl
699 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
700 %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}
704 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
705 ; AVX1-LABEL: splatvar_shift_v32i8:
707 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
708 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
709 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
710 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
711 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
712 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
713 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
714 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
715 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
716 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
719 ; AVX2-LABEL: splatvar_shift_v32i8:
721 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
722 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
723 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
724 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
725 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
726 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
727 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
730 ; XOPAVX1-LABEL: splatvar_shift_v32i8:
732 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
733 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
734 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
735 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
736 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
737 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
738 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
741 ; XOPAVX2-LABEL: splatvar_shift_v32i8:
743 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
744 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
745 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
746 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
747 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
748 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm4, %xmm2
749 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
750 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
751 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
754 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
756 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
757 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
758 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
759 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
760 ; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
761 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
762 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
763 ; AVX512DQ-NEXT: retq
765 ; AVX512BW-LABEL: splatvar_shift_v32i8:
767 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
768 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
769 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
770 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
771 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
772 ; AVX512BW-NEXT: retq
774 ; AVX512DQVL-LABEL: splatvar_shift_v32i8:
775 ; AVX512DQVL: # %bb.0:
776 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
777 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
778 ; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
779 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
780 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
781 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
782 ; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
783 ; AVX512DQVL-NEXT: retq
785 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
786 ; AVX512BWVL: # %bb.0:
787 ; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
788 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
789 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
790 ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
791 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
792 ; AVX512BWVL-NEXT: retq
794 ; X32-AVX1-LABEL: splatvar_shift_v32i8:
796 ; X32-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
797 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
798 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
799 ; X32-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
800 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
801 ; X32-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
802 ; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
803 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
804 ; X32-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
805 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
806 ; X32-AVX1-NEXT: retl
808 ; X32-AVX2-LABEL: splatvar_shift_v32i8:
810 ; X32-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
811 ; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
812 ; X32-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
813 ; X32-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
814 ; X32-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
815 ; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
816 ; X32-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
817 ; X32-AVX2-NEXT: retl
818 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}
; Constant Shifts
827 define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
828 ; AVX1-LABEL: constant_shift_v4i64:
830 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
831 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
832 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
833 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
834 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
835 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
836 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
837 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
840 ; AVX2-LABEL: constant_shift_v4i64:
842 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
845 ; XOPAVX1-LABEL: constant_shift_v4i64:
847 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1
848 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
849 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
850 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
853 ; XOPAVX2-LABEL: constant_shift_v4i64:
855 ; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
858 ; AVX512-LABEL: constant_shift_v4i64:
860 ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
863 ; AVX512VL-LABEL: constant_shift_v4i64:
865 ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
866 ; AVX512VL-NEXT: retq
868 ; X32-AVX1-LABEL: constant_shift_v4i64:
870 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
871 ; X32-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
872 ; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
873 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
874 ; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
875 ; X32-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
876 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
877 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
878 ; X32-AVX1-NEXT: retl
880 ; X32-AVX2-LABEL: constant_shift_v4i64:
882 ; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0
883 ; X32-AVX2-NEXT: retl
  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}
888 define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
889 ; AVX1-LABEL: constant_shift_v8i32:
891 ; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
892 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
893 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
894 ; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
895 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
896 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
897 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
898 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
899 ; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
900 ; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
901 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
902 ; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
903 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
904 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
907 ; AVX2-LABEL: constant_shift_v8i32:
909 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
912 ; XOPAVX1-LABEL: constant_shift_v8i32:
914 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
915 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
916 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
917 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
920 ; XOPAVX2-LABEL: constant_shift_v8i32:
922 ; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
925 ; AVX512-LABEL: constant_shift_v8i32:
927 ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
930 ; AVX512VL-LABEL: constant_shift_v8i32:
932 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
933 ; AVX512VL-NEXT: retq
935 ; X32-AVX1-LABEL: constant_shift_v8i32:
937 ; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
938 ; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
939 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
940 ; X32-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
941 ; X32-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
942 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
943 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
944 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
945 ; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
946 ; X32-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
947 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
948 ; X32-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
949 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
950 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
951 ; X32-AVX1-NEXT: retl
953 ; X32-AVX2-LABEL: constant_shift_v8i32:
955 ; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0
956 ; X32-AVX2-NEXT: retl
  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}
961 define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
962 ; AVX1-LABEL: constant_shift_v16i16:
964 ; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
965 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
966 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
967 ; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
968 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
971 ; AVX2-LABEL: constant_shift_v16i16:
973 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
974 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
975 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
978 ; XOPAVX1-LABEL: constant_shift_v16i16:
980 ; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
981 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
982 ; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
983 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
986 ; XOPAVX2-LABEL: constant_shift_v16i16:
988 ; XOPAVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
989 ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
990 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
993 ; AVX512DQ-LABEL: constant_shift_v16i16:
995 ; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
996 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
997 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
998 ; AVX512DQ-NEXT: retq
1000 ; AVX512BW-LABEL: constant_shift_v16i16:
1001 ; AVX512BW: # %bb.0:
1002 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1003 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1004 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
1005 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1006 ; AVX512BW-NEXT: retq
1008 ; AVX512DQVL-LABEL: constant_shift_v16i16:
1009 ; AVX512DQVL: # %bb.0:
1010 ; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
1011 ; AVX512DQVL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1012 ; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1013 ; AVX512DQVL-NEXT: retq
1015 ; AVX512BWVL-LABEL: constant_shift_v16i16:
1016 ; AVX512BWVL: # %bb.0:
1017 ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
1018 ; AVX512BWVL-NEXT: retq
1020 ; X32-AVX1-LABEL: constant_shift_v16i16:
1021 ; X32-AVX1: # %bb.0:
1022 ; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm1
1023 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1024 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1025 ; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm0
1026 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1027 ; X32-AVX1-NEXT: retl
1029 ; X32-AVX2-LABEL: constant_shift_v16i16:
1030 ; X32-AVX2: # %bb.0:
1031 ; X32-AVX2-NEXT: vpmulhuw {{\.LCPI.*}}, %ymm0, %ymm1
1032 ; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1033 ; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1034 ; X32-AVX2-NEXT: retl
1035 %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}
1039 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
1040 ; AVX1-LABEL: constant_shift_v32i8:
1042 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1043 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1044 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1045 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1046 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
1047 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1048 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1049 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1050 ; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
1051 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1052 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1053 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1054 ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
1055 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1056 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1057 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
1058 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1059 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1060 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1063 ; AVX2-LABEL: constant_shift_v32i8:
1065 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1066 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1067 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1068 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1069 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1070 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1071 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1072 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1075 ; XOPAVX1-LABEL: constant_shift_v32i8:
1077 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1078 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1079 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1080 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1081 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1082 ; XOPAVX1-NEXT: retq
1084 ; XOPAVX2-LABEL: constant_shift_v32i8:
1086 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1087 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1088 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1089 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1090 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1091 ; XOPAVX2-NEXT: retq
1093 ; AVX512DQ-LABEL: constant_shift_v32i8:
1094 ; AVX512DQ: # %bb.0:
1095 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
1096 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1097 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1098 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
1099 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1100 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1101 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
1102 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1103 ; AVX512DQ-NEXT: retq
1105 ; AVX512BW-LABEL: constant_shift_v32i8:
1106 ; AVX512BW: # %bb.0:
1107 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1108 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1109 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1110 ; AVX512BW-NEXT: retq
1112 ; AVX512DQVL-LABEL: constant_shift_v32i8:
1113 ; AVX512DQVL: # %bb.0:
1114 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1115 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
1116 ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
1117 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
1118 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1119 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
1120 ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1121 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
1122 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1123 ; AVX512DQVL-NEXT: retq
1125 ; AVX512BWVL-LABEL: constant_shift_v32i8:
1126 ; AVX512BWVL: # %bb.0:
1127 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1128 ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1129 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
1130 ; AVX512BWVL-NEXT: retq
1132 ; X32-AVX1-LABEL: constant_shift_v32i8:
1133 ; X32-AVX1: # %bb.0:
1134 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1135 ; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1136 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1137 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1138 ; X32-AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
1139 ; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1140 ; X32-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1141 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1142 ; X32-AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
1143 ; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1144 ; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1145 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1146 ; X32-AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
1147 ; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1148 ; X32-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1149 ; X32-AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
1150 ; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1151 ; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1152 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1153 ; X32-AVX1-NEXT: retl
1155 ; X32-AVX2-LABEL: constant_shift_v32i8:
1156 ; X32-AVX2: # %bb.0:
1157 ; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1158 ; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1159 ; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm2, %ymm2
1160 ; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1161 ; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1162 ; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
1163 ; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1164 ; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1165 ; X32-AVX2-NEXT: retl
1166 %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}
1171 ; Uniform Constant Shifts
1174 define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
1175 ; AVX1-LABEL: splatconstant_shift_v4i64:
1177 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
1178 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1179 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
1180 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1183 ; AVX2-LABEL: splatconstant_shift_v4i64:
1185 ; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1188 ; XOPAVX1-LABEL: splatconstant_shift_v4i64:
1190 ; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
1191 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1192 ; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
1193 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1194 ; XOPAVX1-NEXT: retq
1196 ; XOPAVX2-LABEL: splatconstant_shift_v4i64:
1198 ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1199 ; XOPAVX2-NEXT: retq
1201 ; AVX512-LABEL: splatconstant_shift_v4i64:
1203 ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
1206 ; AVX512VL-LABEL: splatconstant_shift_v4i64:
1207 ; AVX512VL: # %bb.0:
1208 ; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
1209 ; AVX512VL-NEXT: retq
1211 ; X32-AVX1-LABEL: splatconstant_shift_v4i64:
1212 ; X32-AVX1: # %bb.0:
1213 ; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
1214 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1215 ; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
1216 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1217 ; X32-AVX1-NEXT: retl
1219 ; X32-AVX2-LABEL: splatconstant_shift_v4i64:
1220 ; X32-AVX2: # %bb.0:
1221 ; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1222 ; X32-AVX2-NEXT: retl
1223 %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}
1227 define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
1228 ; AVX1-LABEL: splatconstant_shift_v8i32:
1230 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
1231 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1232 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
1233 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1236 ; AVX2-LABEL: splatconstant_shift_v8i32:
1238 ; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
1241 ; XOPAVX1-LABEL: splatconstant_shift_v8i32:
1243 ; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm1
1244 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1245 ; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm0
1246 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1247 ; XOPAVX1-NEXT: retq
1249 ; XOPAVX2-LABEL: splatconstant_shift_v8i32:
1251 ; XOPAVX2-NEXT: vpsrld $5, %ymm0, %ymm0
1252 ; XOPAVX2-NEXT: retq
1254 ; AVX512-LABEL: splatconstant_shift_v8i32:
1256 ; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
1259 ; AVX512VL-LABEL: splatconstant_shift_v8i32:
1260 ; AVX512VL: # %bb.0:
1261 ; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
1262 ; AVX512VL-NEXT: retq
1264 ; X32-AVX1-LABEL: splatconstant_shift_v8i32:
1265 ; X32-AVX1: # %bb.0:
1266 ; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
1267 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1268 ; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
1269 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1270 ; X32-AVX1-NEXT: retl
1272 ; X32-AVX2-LABEL: splatconstant_shift_v8i32:
1273 ; X32-AVX2: # %bb.0:
1274 ; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
1275 ; X32-AVX2-NEXT: retl
1276 %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}
1280 define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
1281 ; AVX1-LABEL: splatconstant_shift_v16i16:
1283 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
1284 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1285 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1286 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1289 ; AVX2-LABEL: splatconstant_shift_v16i16:
1291 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1294 ; XOPAVX1-LABEL: splatconstant_shift_v16i16:
1296 ; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
1297 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1298 ; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1299 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1300 ; XOPAVX1-NEXT: retq
1302 ; XOPAVX2-LABEL: splatconstant_shift_v16i16:
1304 ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1305 ; XOPAVX2-NEXT: retq
1307 ; AVX512-LABEL: splatconstant_shift_v16i16:
1309 ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
1312 ; AVX512VL-LABEL: splatconstant_shift_v16i16:
1313 ; AVX512VL: # %bb.0:
1314 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
1315 ; AVX512VL-NEXT: retq
1317 ; X32-AVX1-LABEL: splatconstant_shift_v16i16:
1318 ; X32-AVX1: # %bb.0:
1319 ; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
1320 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1321 ; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1322 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1323 ; X32-AVX1-NEXT: retl
1325 ; X32-AVX2-LABEL: splatconstant_shift_v16i16:
1326 ; X32-AVX2: # %bb.0:
1327 ; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1328 ; X32-AVX2-NEXT: retl
1329 %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
1333 define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
1334 ; AVX1-LABEL: splatconstant_shift_v32i8:
1336 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1337 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
1338 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1339 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1340 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1341 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1342 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1345 ; AVX2-LABEL: splatconstant_shift_v32i8:
1347 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1348 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1351 ; XOPAVX1-LABEL: splatconstant_shift_v32i8:
1353 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1354 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
1355 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1356 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1357 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1358 ; XOPAVX1-NEXT: retq
1360 ; XOPAVX2-LABEL: splatconstant_shift_v32i8:
1362 ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1363 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1364 ; XOPAVX2-NEXT: retq
1366 ; AVX512-LABEL: splatconstant_shift_v32i8:
1368 ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
1369 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1372 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
1373 ; AVX512VL: # %bb.0:
1374 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
1375 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1376 ; AVX512VL-NEXT: retq
1378 ; X32-AVX1-LABEL: splatconstant_shift_v32i8:
1379 ; X32-AVX1: # %bb.0:
1380 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1381 ; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
1382 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1383 ; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1384 ; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1385 ; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1386 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1387 ; X32-AVX1-NEXT: retl
1389 ; X32-AVX2-LABEL: splatconstant_shift_v32i8:
1390 ; X32-AVX2: # %bb.0:
1391 ; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1392 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
1393 ; X32-AVX2-NEXT: retl
1394 %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}