; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
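
; These tests cover arithmetic right shifts (ashr) of 256-bit vectors, with
; per-element variable, uniform (splatted) and constant shift amounts, across
; the AVX1, AVX2, XOP and AVX-512 lowerings exercised by the RUN lines above.
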
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
20 ; AVX1-LABEL: var_shift_v4i64:
22 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
23 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
24 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
25 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
26 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
27 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
29 ; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
30 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
32 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
33 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
34 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
35 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
36 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
37 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
38 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
39 ; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
40 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
41 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
42 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
43 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
46 ; AVX2-LABEL: var_shift_v4i64:
48 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
49 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
50 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
51 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
52 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
55 ; XOPAVX1-LABEL: var_shift_v4i64:
57 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
58 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
59 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
60 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
61 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm4, %xmm2
62 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
63 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
64 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
67 ; XOPAVX2-LABEL: var_shift_v4i64:
69 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
70 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
71 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
72 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
73 ; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
76 ; AVX512-LABEL: var_shift_v4i64:
78 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
79 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
80 ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
81 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
84 ; AVX512VL-LABEL: var_shift_v4i64:
86 ; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
89 ; X32-AVX1-LABEL: var_shift_v4i64:
91 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
92 ; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
93 ; X32-AVX1-NEXT: # xmm3 = mem[0,0]
94 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
95 ; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
96 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
97 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
98 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
99 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
100 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
101 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
102 ; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
103 ; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
104 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
105 ; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
106 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
107 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
108 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
109 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
110 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
111 ; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
112 ; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
113 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
114 ; X32-AVX1-NEXT: retl
116 ; X32-AVX2-LABEL: var_shift_v4i64:
118 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
119 ; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
120 ; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
121 ; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
122 ; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
123 ; X32-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
129 ; AVX1-LABEL: var_shift_v8i32:
131 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
132 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
133 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
134 ; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
135 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
136 ; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
137 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
138 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
139 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
140 ; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
141 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
142 ; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
143 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
144 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
145 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
146 ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
147 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
148 ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
149 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
150 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
151 ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
152 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
153 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
154 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
155 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
156 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
159 ; AVX2-LABEL: var_shift_v8i32:
161 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
164 ; XOPAVX1-LABEL: var_shift_v8i32:
166 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
167 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
168 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
169 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
170 ; XOPAVX1-NEXT: vpshad %xmm2, %xmm4, %xmm2
171 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
172 ; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
173 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
176 ; XOPAVX2-LABEL: var_shift_v8i32:
178 ; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
181 ; AVX512-LABEL: var_shift_v8i32:
183 ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
186 ; AVX512VL-LABEL: var_shift_v8i32:
188 ; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
189 ; AVX512VL-NEXT: retq
191 ; X32-AVX1-LABEL: var_shift_v8i32:
193 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
194 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
195 ; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
196 ; X32-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
197 ; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
198 ; X32-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
199 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
200 ; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
201 ; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
202 ; X32-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
203 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
204 ; X32-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
205 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
206 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
207 ; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
208 ; X32-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
209 ; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
210 ; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
211 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
212 ; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
213 ; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
214 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
215 ; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
216 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
217 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
218 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
219 ; X32-AVX1-NEXT: retl
221 ; X32-AVX2-LABEL: var_shift_v8i32:
223 ; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
224 ; X32-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
230 ; AVX1-LABEL: var_shift_v16i16:
232 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
233 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
234 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
235 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
236 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
237 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
238 ; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
239 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
240 ; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
241 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
242 ; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
243 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
244 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
245 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
246 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
247 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
248 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
249 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
250 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
251 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
252 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
253 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
254 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
255 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
256 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
257 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
258 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
259 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
260 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
261 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
262 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
265 ; AVX2-LABEL: var_shift_v16i16:
267 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
268 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
269 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
270 ; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
271 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
272 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
273 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
274 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
275 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
276 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
279 ; XOPAVX1-LABEL: var_shift_v16i16:
281 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
282 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
283 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
284 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
285 ; XOPAVX1-NEXT: vpshaw %xmm2, %xmm4, %xmm2
286 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
287 ; XOPAVX1-NEXT: vpshaw %xmm1, %xmm0, %xmm0
288 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
291 ; XOPAVX2-LABEL: var_shift_v16i16:
293 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
294 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
295 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
296 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
297 ; XOPAVX2-NEXT: vpshaw %xmm2, %xmm4, %xmm2
298 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
299 ; XOPAVX2-NEXT: vpshaw %xmm1, %xmm0, %xmm0
300 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
303 ; AVX512DQ-LABEL: var_shift_v16i16:
305 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
306 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
307 ; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
308 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
309 ; AVX512DQ-NEXT: retq
311 ; AVX512BW-LABEL: var_shift_v16i16:
313 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
314 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
315 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
316 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
317 ; AVX512BW-NEXT: retq
319 ; AVX512DQVL-LABEL: var_shift_v16i16:
320 ; AVX512DQVL: # %bb.0:
321 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
322 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
323 ; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
324 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
325 ; AVX512DQVL-NEXT: retq
327 ; AVX512BWVL-LABEL: var_shift_v16i16:
328 ; AVX512BWVL: # %bb.0:
329 ; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
330 ; AVX512BWVL-NEXT: retq
332 ; X32-AVX1-LABEL: var_shift_v16i16:
334 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
335 ; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
336 ; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
337 ; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
338 ; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
339 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
340 ; X32-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
341 ; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
342 ; X32-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
343 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
344 ; X32-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
345 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
346 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
347 ; X32-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
348 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
349 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
350 ; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
351 ; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
352 ; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
353 ; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
354 ; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
355 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
356 ; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
357 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
358 ; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
359 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
360 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
361 ; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
362 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
363 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
364 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
365 ; X32-AVX1-NEXT: retl
367 ; X32-AVX2-LABEL: var_shift_v16i16:
369 ; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
370 ; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
371 ; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
372 ; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
373 ; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
374 ; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
375 ; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
376 ; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
377 ; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
378 ; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
379 ; X32-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
385 ; AVX1-LABEL: var_shift_v32i8:
387 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
388 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
389 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
390 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
391 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
392 ; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
393 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
394 ; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
395 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
396 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
397 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
398 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
399 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
400 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
401 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
402 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
403 ; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
404 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
405 ; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
406 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
407 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
408 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
409 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
410 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
411 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
412 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
413 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
414 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
415 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
416 ; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
417 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
418 ; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
419 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
420 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
421 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
422 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
423 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
424 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
425 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
426 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
427 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
428 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
429 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
430 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
431 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
432 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
433 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
434 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
435 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
436 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
437 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
440 ; AVX2-LABEL: var_shift_v32i8:
442 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
443 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
444 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
445 ; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
446 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
447 ; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
448 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
449 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
450 ; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
451 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
452 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
453 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
454 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
455 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
456 ; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
457 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
458 ; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
459 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
460 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
461 ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
462 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
463 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
464 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
465 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
468 ; XOPAVX1-LABEL: var_shift_v32i8:
470 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
471 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
472 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
473 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
474 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm4, %xmm2
475 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
476 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
477 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
480 ; XOPAVX2-LABEL: var_shift_v32i8:
482 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
483 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
484 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
485 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
486 ; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2
487 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
488 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
489 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
492 ; AVX512DQ-LABEL: var_shift_v32i8:
494 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
495 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
496 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
497 ; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
498 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
499 ; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
500 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
501 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
502 ; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
503 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
504 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
505 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
506 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
507 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
508 ; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
509 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
510 ; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
511 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
512 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
513 ; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
514 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
515 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
516 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
517 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
518 ; AVX512DQ-NEXT: retq
520 ; AVX512BW-LABEL: var_shift_v32i8:
522 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
523 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
524 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
525 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
526 ; AVX512BW-NEXT: retq
528 ; AVX512DQVL-LABEL: var_shift_v32i8:
529 ; AVX512DQVL: # %bb.0:
530 ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
531 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
532 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
533 ; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
534 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
535 ; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
536 ; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
537 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
538 ; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
539 ; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
540 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
541 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
542 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
543 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
544 ; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
545 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
546 ; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
547 ; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
548 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
549 ; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
550 ; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
551 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
552 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
553 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
554 ; AVX512DQVL-NEXT: retq
556 ; AVX512BWVL-LABEL: var_shift_v32i8:
557 ; AVX512BWVL: # %bb.0:
558 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
559 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
560 ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
561 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
562 ; AVX512BWVL-NEXT: retq
564 ; X32-AVX1-LABEL: var_shift_v32i8:
566 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
567 ; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
568 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
569 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
570 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
571 ; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
572 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
573 ; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
574 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
575 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
576 ; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
577 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
578 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
579 ; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
580 ; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
581 ; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
582 ; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
583 ; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
584 ; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
585 ; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
586 ; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
587 ; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
588 ; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
589 ; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
590 ; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
591 ; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
592 ; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
593 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
594 ; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
595 ; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
596 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
597 ; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
598 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
599 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
600 ; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
601 ; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
602 ; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
603 ; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
604 ; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
605 ; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
606 ; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
607 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
608 ; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
609 ; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
610 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
611 ; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
612 ; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
613 ; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
614 ; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
615 ; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
616 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
617 ; X32-AVX1-NEXT: retl
619 ; X32-AVX2-LABEL: var_shift_v32i8:
621 ; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
622 ; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
623 ; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
624 ; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
625 ; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
626 ; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
627 ; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
628 ; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
629 ; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
630 ; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
631 ; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
632 ; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
633 ; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
634 ; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
635 ; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
636 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
637 ; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
638 ; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
639 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
640 ; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
641 ; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
642 ; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
643 ; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
644 ; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
645 ; X32-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

; Uniform Variable Shifts

define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
655 ; AVX1-LABEL: splatvar_shift_v4i64:
657 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
658 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
659 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
660 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
661 ; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
662 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
663 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
664 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
665 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
666 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
669 ; AVX2-LABEL: splatvar_shift_v4i64:
671 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
672 ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
673 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
674 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
675 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
678 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
680 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
681 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
682 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
683 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
684 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
685 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
686 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
689 ; XOPAVX2-LABEL: splatvar_shift_v4i64:
691 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
692 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
693 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
694 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
695 ; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
698 ; AVX512-LABEL: splatvar_shift_v4i64:
700 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
701 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
702 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
705 ; AVX512VL-LABEL: splatvar_shift_v4i64:
707 ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
708 ; AVX512VL-NEXT: retq
710 ; X32-AVX1-LABEL: splatvar_shift_v4i64:
712 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
713 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
714 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
715 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
716 ; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
717 ; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
718 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
719 ; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
720 ; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
721 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
722 ; X32-AVX1-NEXT: retl
724 ; X32-AVX2-LABEL: splatvar_shift_v4i64:
726 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
727 ; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
728 ; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
729 ; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
730 ; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
731 ; X32-AVX2-NEXT: retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
738 ; AVX1-LABEL: splatvar_shift_v8i32:
740 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
741 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
742 ; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
743 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
744 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
747 ; AVX2-LABEL: splatvar_shift_v8i32:
749 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
750 ; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
753 ; XOPAVX1-LABEL: splatvar_shift_v8i32:
755 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
756 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
757 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
758 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
759 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
762 ; XOPAVX2-LABEL: splatvar_shift_v8i32:
764 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
765 ; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
768 ; AVX512-LABEL: splatvar_shift_v8i32:
770 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
771 ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
774 ; AVX512VL-LABEL: splatvar_shift_v8i32:
776 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
777 ; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
778 ; AVX512VL-NEXT: retq
780 ; X32-AVX1-LABEL: splatvar_shift_v8i32:
782 ; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
783 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
784 ; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
785 ; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
786 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
787 ; X32-AVX1-NEXT: retl
789 ; X32-AVX2-LABEL: splatvar_shift_v8i32:
791 ; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
792 ; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
793 ; X32-AVX2-NEXT: retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
800 ; AVX1-LABEL: splatvar_shift_v16i16:
802 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
803 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
804 ; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
805 ; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
806 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
809 ; AVX2-LABEL: splatvar_shift_v16i16:
811 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
812 ; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
815 ; XOPAVX1-LABEL: splatvar_shift_v16i16:
817 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
818 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
819 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
820 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
821 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
824 ; XOPAVX2-LABEL: splatvar_shift_v16i16:
826 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
827 ; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
830 ; AVX512-LABEL: splatvar_shift_v16i16:
832 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
833 ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
836 ; AVX512VL-LABEL: splatvar_shift_v16i16:
838 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
839 ; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
840 ; AVX512VL-NEXT: retq
842 ; X32-AVX1-LABEL: splatvar_shift_v16i16:
844 ; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
845 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
846 ; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
847 ; X32-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
848 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
849 ; X32-AVX1-NEXT: retl
851 ; X32-AVX2-LABEL: splatvar_shift_v16i16:
853 ; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
854 ; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
855 ; X32-AVX2-NEXT: retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
862 ; AVX1-LABEL: splatvar_shift_v32i8:
864 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
865 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
866 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
867 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
868 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
869 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
870 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
871 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
872 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
873 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
874 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
875 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
876 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
877 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
878 ; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
879 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
882 ; AVX2-LABEL: splatvar_shift_v32i8:
884 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
885 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
886 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
887 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
888 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
889 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
890 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
891 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
892 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
893 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
894 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
897 ; XOPAVX1-LABEL: splatvar_shift_v32i8:
899 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
900 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
901 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
902 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
903 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
904 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
905 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
908 ; XOPAVX2-LABEL: splatvar_shift_v32i8:
910 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
911 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
912 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
913 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
914 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
915 ; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2
916 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
917 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
918 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
921 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
923 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
924 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
925 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
926 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
927 ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
928 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
929 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
930 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
931 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
932 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
933 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
934 ; AVX512DQ-NEXT: retq
936 ; AVX512BW-LABEL: splatvar_shift_v32i8:
938 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
939 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
940 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
941 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
942 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
943 ; AVX512BW-NEXT: retq
945 ; AVX512DQVL-LABEL: splatvar_shift_v32i8:
946 ; AVX512DQVL: # %bb.0:
947 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
948 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
949 ; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
950 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
951 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm2, %xmm2
952 ; AVX512DQVL-NEXT: vpbroadcastb %xmm2, %ymm2
953 ; AVX512DQVL-NEXT: vpand %ymm2, %ymm0, %ymm0
954 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
955 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
956 ; AVX512DQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0
957 ; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
958 ; AVX512DQVL-NEXT: retq
960 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
961 ; AVX512BWVL: # %bb.0:
962 ; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
963 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
964 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
965 ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
966 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
967 ; AVX512BWVL-NEXT: retq
969 ; X32-AVX1-LABEL: splatvar_shift_v32i8:
971 ; X32-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
972 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
973 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
974 ; X32-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
975 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
976 ; X32-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
977 ; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
978 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
979 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
980 ; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
981 ; X32-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
982 ; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
983 ; X32-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
984 ; X32-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
985 ; X32-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
986 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
987 ; X32-AVX1-NEXT: retl
989 ; X32-AVX2-LABEL: splatvar_shift_v32i8:
991 ; X32-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
992 ; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
993 ; X32-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
994 ; X32-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
995 ; X32-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
996 ; X32-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
997 ; X32-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
998 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
999 ; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
1000 ; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1001 ; X32-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1002 ; X32-AVX2-NEXT: retl
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
1013 ; AVX1-LABEL: constant_shift_v4i64:
1015 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1016 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
1017 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
1018 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1019 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2]
1020 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
1021 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1022 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
1023 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
1024 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1025 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
1026 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1027 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1028 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1031 ; AVX2-LABEL: constant_shift_v4i64:
1033 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
1034 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1035 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1036 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
1039 ; XOPAVX1-LABEL: constant_shift_v4i64:
1041 ; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1
1042 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1043 ; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
1044 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1045 ; XOPAVX1-NEXT: retq
1047 ; XOPAVX2-LABEL: constant_shift_v4i64:
1049 ; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
1050 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1051 ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1052 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
1053 ; XOPAVX2-NEXT: retq
1055 ; AVX512-LABEL: constant_shift_v4i64:
1057 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1058 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62]
1059 ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
1060 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1063 ; AVX512VL-LABEL: constant_shift_v4i64:
1064 ; AVX512VL: # %bb.0:
1065 ; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0
1066 ; AVX512VL-NEXT: retq
1068 ; X32-AVX1-LABEL: constant_shift_v4i64:
1069 ; X32-AVX1: # %bb.0:
1070 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1071 ; X32-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
1072 ; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
1073 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1074 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0]
1075 ; X32-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
1076 ; X32-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1077 ; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
1078 ; X32-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
1079 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1080 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
1081 ; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1082 ; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1083 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1084 ; X32-AVX1-NEXT: retl
1086 ; X32-AVX2-LABEL: constant_shift_v4i64:
1087 ; X32-AVX2: # %bb.0:
1088 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
1089 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
1090 ; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
1091 ; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
1092 ; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
1093 ; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
1094 ; X32-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
1100 ; AVX1-LABEL: constant_shift_v8i32:
1102 ; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
1103 ; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
1104 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1105 ; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
1106 ; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
1107 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1108 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1109 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1110 ; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1111 ; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
1112 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1113 ; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
1114 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1115 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1118 ; AVX2-LABEL: constant_shift_v8i32:
1120 ; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
1123 ; XOPAVX1-LABEL: constant_shift_v8i32:
1125 ; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm1
1126 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1127 ; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
1128 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1129 ; XOPAVX1-NEXT: retq
1131 ; XOPAVX2-LABEL: constant_shift_v8i32:
1133 ; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
1134 ; XOPAVX2-NEXT: retq
1136 ; AVX512-LABEL: constant_shift_v8i32:
1138 ; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
1141 ; AVX512VL-LABEL: constant_shift_v8i32:
1142 ; AVX512VL: # %bb.0:
1143 ; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
1144 ; AVX512VL-NEXT: retq
1146 ; X32-AVX1-LABEL: constant_shift_v8i32:
1147 ; X32-AVX1: # %bb.0:
1148 ; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
1149 ; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
1150 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1151 ; X32-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
1152 ; X32-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
1153 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1154 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1155 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1156 ; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1157 ; X32-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
1158 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1159 ; X32-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
1160 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1161 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1162 ; X32-AVX1-NEXT: retl
1164 ; X32-AVX2-LABEL: constant_shift_v8i32:
1165 ; X32-AVX2: # %bb.0:
1166 ; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0
1167 ; X32-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}

define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
1173 ; AVX1-LABEL: constant_shift_v16i16:
1175 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
1176 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1177 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
1178 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
1179 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1180 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
1181 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1184 ; AVX2-LABEL: constant_shift_v16i16:
1186 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
1187 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1188 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
1189 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
1190 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1193 ; XOPAVX1-LABEL: constant_shift_v16i16:
1195 ; XOPAVX1-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm1
1196 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1197 ; XOPAVX1-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
1198 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1199 ; XOPAVX1-NEXT: retq
1201 ; XOPAVX2-LABEL: constant_shift_v16i16:
1203 ; XOPAVX2-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm1
1204 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1205 ; XOPAVX2-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
1206 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1207 ; XOPAVX2-NEXT: retq
1209 ; AVX512DQ-LABEL: constant_shift_v16i16:
1210 ; AVX512DQ: # %bb.0:
1211 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
1212 ; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
1213 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
1214 ; AVX512DQ-NEXT: retq
1216 ; AVX512BW-LABEL: constant_shift_v16i16:
1217 ; AVX512BW: # %bb.0:
1218 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1219 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1220 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
1221 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1222 ; AVX512BW-NEXT: retq
1224 ; AVX512DQVL-LABEL: constant_shift_v16i16:
1225 ; AVX512DQVL: # %bb.0:
1226 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
1227 ; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
1228 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
1229 ; AVX512DQVL-NEXT: retq
1231 ; AVX512BWVL-LABEL: constant_shift_v16i16:
1232 ; AVX512BWVL: # %bb.0:
1233 ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
1234 ; AVX512BWVL-NEXT: retq
1236 ; X32-AVX1-LABEL: constant_shift_v16i16:
1237 ; X32-AVX1: # %bb.0:
1238 ; X32-AVX1-NEXT: vpmulhw {{\.LCPI.*}}, %xmm0, %xmm1
1239 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1240 ; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
1241 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
1242 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1243 ; X32-AVX1-NEXT: vpmulhw {{\.LCPI.*}}, %xmm0, %xmm0
1244 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1245 ; X32-AVX1-NEXT: retl
1247 ; X32-AVX2-LABEL: constant_shift_v16i16:
1248 ; X32-AVX2: # %bb.0:
1249 ; X32-AVX2-NEXT: vpmulhw {{\.LCPI.*}}, %ymm0, %ymm1
1250 ; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1251 ; X32-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
1252 ; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
1253 ; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1254 ; X32-AVX2-NEXT: retl
1255 %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1256 ret <16 x i16> %shift
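
; For i8 shifts each half is sign-extended to i16 (unpack + vpsraw $8), shifted
; with the same power-of-two multiply trick, then narrowed again with
; vpsrlw $8 + vpackuswb. XOP can use vpshab directly with negated byte amounts.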
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX2-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v32i8:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X32-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; X32-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; X32-AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X32-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X32-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; X32-AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v32i8:
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm1, %ymm1
; X32-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}
;
; Uniform Constant Shifts
;
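
; i64 has no arithmetic right shift by immediate until AVX512 (vpsraq), so the
; splat v4i64 case below pairs vpsrad (which supplies the sign bits for the
; high dwords) with vpsrlq and blends the two results.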
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v4i64:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v4i64:
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X32-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}
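
; Splat i32 and i16 amounts map directly to the immediate forms of vpsrad and
; vpsraw; AVX1 only has to split the 256-bit vector into 128-bit halves.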
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v8i32:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v8i32:
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v16i16:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v16i16:
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
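
; There is no packed i8 shift instruction, so the splat i8 case below uses a
; logical vpsrlw + byte mask (0xff >> 3 = 31), then xor/sub with 16 (0x80 >> 3)
; to sign-extend the result; XOP instead uses vpshab with a splat of -3.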
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v32i8:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v32i8:
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift