1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
11 ; 32-bit runs to make sure we do reasonable things for i64 shifts.
12 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1
13 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2
; Variable (per-lane) arithmetic right shift of <4 x i64>: the body is a single
; `ashr` of %a by %b.
; Expected lowering, as recorded by the checks below:
;  - avx / avx2 (with or without xop on avx2, and on i686): logical right shifts
;    of both the value and a sign-bit constant (9223372036854775808), combined
;    with xor + subtract.
;  - xop without avx2: negate the shift amounts and use vpshaq on each 128-bit half.
;  - avx512: a single vpsravq (widened to zmm when avx512vl is not enabled).
; All check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
19 define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
20 ; AVX1-LABEL: var_shift_v4i64:
22 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
23 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
24 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
25 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
26 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
27 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
29 ; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
30 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
32 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
33 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
34 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
35 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
36 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
37 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
38 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
39 ; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
40 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
41 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
42 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
43 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
46 ; AVX2-LABEL: var_shift_v4i64:
48 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
49 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
50 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
51 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
52 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
55 ; XOPAVX1-LABEL: var_shift_v4i64:
57 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
58 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
59 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
60 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
61 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm4, %xmm2
62 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
63 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
64 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
67 ; XOPAVX2-LABEL: var_shift_v4i64:
69 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
70 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
71 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
72 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
73 ; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
76 ; AVX512-LABEL: var_shift_v4i64:
78 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
79 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
80 ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
81 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
84 ; AVX512VL-LABEL: var_shift_v4i64:
86 ; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
89 ; X86-AVX1-LABEL: var_shift_v4i64:
91 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
92 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
93 ; X86-AVX1-NEXT: # xmm3 = mem[0,0]
94 ; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
95 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
96 ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
97 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
98 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
99 ; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
100 ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
101 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
102 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
103 ; X86-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
104 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
105 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
106 ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
107 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
108 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
109 ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
110 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
111 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
112 ; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
113 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
114 ; X86-AVX1-NEXT: retl
116 ; X86-AVX2-LABEL: var_shift_v4i64:
118 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
119 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
120 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
121 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
122 ; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
123 ; X86-AVX2-NEXT: retl
124 %shift = ashr <4 x i64> %a, %b
; Variable (per-lane) arithmetic right shift of <8 x i32>: the body is a single
; `ashr` of %a by %b.
; Expected lowering, as recorded by the checks below:
;  - avx2 and avx512 configurations: a single vpsravd on ymm.
;  - avx without avx2 (x86_64 and i686): each 128-bit half is shifted four times
;    with vpsrad, once per isolated lane amount, and the results are merged
;    with vpblendw.
;  - xop without avx2: negate the shift amounts and use vpshad on each 128-bit half.
; All check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
128 define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
129 ; AVX1-LABEL: var_shift_v8i32:
131 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
132 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
133 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
134 ; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
135 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
136 ; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
137 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
138 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
139 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
140 ; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
141 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
142 ; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
143 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
144 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
145 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
146 ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
147 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
148 ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
149 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
150 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
151 ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
152 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
153 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
154 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
155 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
156 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
159 ; AVX2-LABEL: var_shift_v8i32:
161 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
164 ; XOPAVX1-LABEL: var_shift_v8i32:
166 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
167 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
168 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
169 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
170 ; XOPAVX1-NEXT: vpshad %xmm2, %xmm4, %xmm2
171 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
172 ; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
173 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
176 ; XOPAVX2-LABEL: var_shift_v8i32:
178 ; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
181 ; AVX512-LABEL: var_shift_v8i32:
183 ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
186 ; AVX512VL-LABEL: var_shift_v8i32:
188 ; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
189 ; AVX512VL-NEXT: retq
191 ; X86-AVX1-LABEL: var_shift_v8i32:
193 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
194 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
195 ; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
196 ; X86-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
197 ; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
198 ; X86-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
199 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
200 ; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
201 ; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
202 ; X86-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
203 ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
204 ; X86-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
205 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
206 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
207 ; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
208 ; X86-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
209 ; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
210 ; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
211 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
212 ; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
213 ; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
214 ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
215 ; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
216 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
217 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
218 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
219 ; X86-AVX1-NEXT: retl
221 ; X86-AVX2-LABEL: var_shift_v8i32:
223 ; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
224 ; X86-AVX2-NEXT: retl
225 %shift = ashr <8 x i32> %a, %b
; Variable (per-lane) arithmetic right shift of <16 x i16>: the body is a single
; `ashr` of %a by %b whose result is returned.
; Expected lowering, as recorded by the checks below:
;  - avx without avx2 (x86_64 and i686): per 128-bit half, the amount bits are
;    moved into the blend-mask position (vpsllw/vpor/vpaddw) and vpblendvb
;    selects between the value and its vpsraw by 8, 4, 2 and 1.
;  - avx2 (x86_64 and i686): words are interleaved with zero into 32-bit lanes,
;    shifted with vpsravd, then narrowed again (vpsrld $16 + vpackusdw).
;  - xop (with avx or avx2): negate the shift amounts and use vpshaw on each
;    128-bit half.
;  - avx512dq (with or without avx512vl): extend to 32-bit lanes in zmm,
;    vpsravd, then truncate with vpmovdw.
;  - avx512bw: a single vpsravw (zmm without avx512vl, ymm with it).
; All check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
229 define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
230 ; AVX1-LABEL: var_shift_v16i16:
232 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
233 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
234 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
235 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
236 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
237 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
238 ; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
239 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
240 ; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
241 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
242 ; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
243 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
244 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
245 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
246 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
247 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
248 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
249 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
250 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
251 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
252 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
253 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
254 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
255 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
256 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
257 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
258 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
259 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
260 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
261 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
262 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
265 ; AVX2-LABEL: var_shift_v16i16:
267 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
268 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
269 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
270 ; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
271 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
272 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
273 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
274 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
275 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
276 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
279 ; XOPAVX1-LABEL: var_shift_v16i16:
281 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
282 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
283 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
284 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
285 ; XOPAVX1-NEXT: vpshaw %xmm2, %xmm4, %xmm2
286 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
287 ; XOPAVX1-NEXT: vpshaw %xmm1, %xmm0, %xmm0
288 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
291 ; XOPAVX2-LABEL: var_shift_v16i16:
293 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
294 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
295 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
296 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
297 ; XOPAVX2-NEXT: vpshaw %xmm2, %xmm4, %xmm2
298 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
299 ; XOPAVX2-NEXT: vpshaw %xmm1, %xmm0, %xmm0
300 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
303 ; AVX512DQ-LABEL: var_shift_v16i16:
305 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
306 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
307 ; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
308 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
309 ; AVX512DQ-NEXT: retq
311 ; AVX512BW-LABEL: var_shift_v16i16:
313 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
314 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
315 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
316 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
317 ; AVX512BW-NEXT: retq
319 ; AVX512DQVL-LABEL: var_shift_v16i16:
320 ; AVX512DQVL: # %bb.0:
321 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
322 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
323 ; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
324 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
325 ; AVX512DQVL-NEXT: retq
327 ; AVX512BWVL-LABEL: var_shift_v16i16:
328 ; AVX512BWVL: # %bb.0:
329 ; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
330 ; AVX512BWVL-NEXT: retq
332 ; X86-AVX1-LABEL: var_shift_v16i16:
334 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
335 ; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
336 ; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
337 ; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
338 ; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
339 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
340 ; X86-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
341 ; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
342 ; X86-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
343 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
344 ; X86-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
345 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
346 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
347 ; X86-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
348 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
349 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
350 ; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
351 ; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
352 ; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
353 ; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
354 ; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
355 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
356 ; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
357 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
358 ; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
359 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
360 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
361 ; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
362 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
363 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
364 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
365 ; X86-AVX1-NEXT: retl
367 ; X86-AVX2-LABEL: var_shift_v16i16:
369 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
370 ; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
371 ; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
372 ; X86-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
373 ; X86-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
374 ; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
375 ; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
376 ; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
377 ; X86-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
378 ; X86-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
379 ; X86-AVX2-NEXT: retl
380 %shift = ashr <16 x i16> %a, %b
381 ret <16 x i16> %shift
; Variable (per-lane) arithmetic right shift of <32 x i8>: the body is a single
; `ashr` of %a by %b.
; Expected lowering, as recorded by the checks below:
;  - avx, avx2 and avx512dq configurations (x86_64 and i686): the shift amounts
;    are shifted left by 5 (vpsllw $5), bytes are unpacked into 16-bit lanes,
;    and vpblendvb selects between the value and its vpsraw by 4, 2 and 1;
;    the halves are then narrowed with vpsrlw $8 + vpackuswb. Without avx2 this
;    is done separately on each 128-bit half.
;  - xop (with avx or avx2): negate the shift amounts and use vpshab on each
;    128-bit half.
;  - avx512bw (with or without avx512vl): extend bytes to 16-bit lanes in zmm,
;    vpsravw, then truncate with vpmovwb.
; All check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
384 define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
385 ; AVX1-LABEL: var_shift_v32i8:
387 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
388 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
389 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
390 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
391 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
392 ; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
393 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
394 ; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
395 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
396 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
397 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
398 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
399 ; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
400 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
401 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
402 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
403 ; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
404 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
405 ; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
406 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
407 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
408 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
409 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
410 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
411 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
412 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
413 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
414 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
415 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
416 ; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
417 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
418 ; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
419 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
420 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
421 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
422 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
423 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
424 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
425 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
426 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
427 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
428 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
429 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
430 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
431 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
432 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
433 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
434 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
435 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
436 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
437 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
440 ; AVX2-LABEL: var_shift_v32i8:
442 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
443 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
444 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
445 ; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
446 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
447 ; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
448 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
449 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
450 ; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
451 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
452 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
453 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
454 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
455 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
456 ; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
457 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
458 ; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
459 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
460 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
461 ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
462 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
463 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
464 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
465 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
468 ; XOPAVX1-LABEL: var_shift_v32i8:
470 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
471 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
472 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
473 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
474 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm4, %xmm2
475 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
476 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
477 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
480 ; XOPAVX2-LABEL: var_shift_v32i8:
482 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
483 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
484 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
485 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
486 ; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2
487 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
488 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
489 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
492 ; AVX512DQ-LABEL: var_shift_v32i8:
494 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
495 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
496 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
497 ; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
498 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
499 ; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
500 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
501 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
502 ; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
503 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
504 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
505 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
506 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
507 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
508 ; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
509 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
510 ; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
511 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
512 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
513 ; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
514 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
515 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
516 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
517 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
518 ; AVX512DQ-NEXT: retq
520 ; AVX512BW-LABEL: var_shift_v32i8:
522 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
523 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
524 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
525 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
526 ; AVX512BW-NEXT: retq
528 ; AVX512DQVL-LABEL: var_shift_v32i8:
529 ; AVX512DQVL: # %bb.0:
530 ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
531 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
532 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
533 ; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
534 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
535 ; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
536 ; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
537 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
538 ; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
539 ; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
540 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
541 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
542 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
543 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
544 ; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
545 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
546 ; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
547 ; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
548 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
549 ; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
550 ; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
551 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
552 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
553 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
554 ; AVX512DQVL-NEXT: retq
556 ; AVX512BWVL-LABEL: var_shift_v32i8:
557 ; AVX512BWVL: # %bb.0:
558 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
559 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
560 ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
561 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
562 ; AVX512BWVL-NEXT: retq
564 ; X86-AVX1-LABEL: var_shift_v32i8:
566 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
567 ; X86-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
568 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
569 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
570 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
571 ; X86-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
572 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
573 ; X86-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
574 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
575 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
576 ; X86-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
577 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
578 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
579 ; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
580 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
581 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
582 ; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
583 ; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
584 ; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
585 ; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
586 ; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
587 ; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
588 ; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
589 ; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
590 ; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
591 ; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
592 ; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
593 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
594 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
595 ; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
596 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
597 ; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
598 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
599 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
600 ; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
601 ; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
602 ; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
603 ; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
604 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
605 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
606 ; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
607 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
608 ; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
609 ; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
610 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
611 ; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
612 ; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
613 ; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
614 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
615 ; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
616 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
617 ; X86-AVX1-NEXT: retl
619 ; X86-AVX2-LABEL: var_shift_v32i8:
621 ; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
622 ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
623 ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
624 ; X86-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
625 ; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
626 ; X86-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
627 ; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
628 ; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
629 ; X86-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
630 ; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
631 ; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
632 ; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
633 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
634 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
635 ; X86-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
636 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
637 ; X86-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
638 ; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
639 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
640 ; X86-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
641 ; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
642 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
643 ; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
644 ; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
645 ; X86-AVX2-NEXT: retl
646 %shift = ashr <32 x i8> %a, %b
651 ; Uniform Variable Shifts
; Uniform variable shift, <4 x i64>: every lane of %a is arithmetically shifted
; right by the same run-time amount, taken from lane 0 of %b.
; The expectations below show three lowering shapes:
;   - AVX1, AVX2, XOPAVX2 and the X86 prefixes: logical shift (vpsrlq) followed
;     by a vpxor/vpsubq fix-up against an equally shifted sign-bit constant.
;   - XOPAVX1: vpshaq with the amount negated (vpsubq from a zeroed register).
;   - AVX512 / AVX512VL: a single vpsraq.
; The check lines are autogenerated; regenerate them instead of hand-editing.
654 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
655 ; AVX1-LABEL: splatvar_shift_v4i64:
657 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
658 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
659 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
660 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
661 ; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
662 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
663 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
664 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
665 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
666 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
669 ; AVX2-LABEL: splatvar_shift_v4i64:
671 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
672 ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
673 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
674 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
675 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
678 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
680 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
681 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
682 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
683 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
684 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
685 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
686 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
689 ; XOPAVX2-LABEL: splatvar_shift_v4i64:
691 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
692 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
693 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
694 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
695 ; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
698 ; AVX512-LABEL: splatvar_shift_v4i64:
700 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
701 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
702 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
705 ; AVX512VL-LABEL: splatvar_shift_v4i64:
707 ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
708 ; AVX512VL-NEXT: retq
710 ; X86-AVX1-LABEL: splatvar_shift_v4i64:
712 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
713 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
714 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
715 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
716 ; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
717 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
718 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
719 ; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
720 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
721 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
722 ; X86-AVX1-NEXT: retl
724 ; X86-AVX2-LABEL: splatvar_shift_v4i64:
726 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
727 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
728 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
729 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
730 ; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
731 ; X86-AVX2-NEXT: retl
; An all-zero shuffle mask copies lane 0 of %b into every lane of %splat.
732 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
; Signed (sign-filling) right shift of each lane by the broadcast amount.
733 %shift = ashr <4 x i64> %a, %splat
; Uniform variable shift, <8 x i32>: every lane of %a is arithmetically shifted
; right by the same run-time amount, taken from lane 0 of %b.
; Every prefix below expects the same idea: vpmovzxdq zero-extends the low
; element of the amount register, then vpsrad takes the count from that xmm
; register. The AVX1-class prefixes (AVX1, XOPAVX1, X86-AVX1) shift the two
; 128-bit halves separately and rejoin them with vinsertf128.
; The check lines are autogenerated; regenerate them instead of hand-editing.
737 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
738 ; AVX1-LABEL: splatvar_shift_v8i32:
740 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
741 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
742 ; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
743 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
744 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
747 ; AVX2-LABEL: splatvar_shift_v8i32:
749 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
750 ; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
753 ; XOPAVX1-LABEL: splatvar_shift_v8i32:
755 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
756 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
757 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
758 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
759 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
762 ; XOPAVX2-LABEL: splatvar_shift_v8i32:
764 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
765 ; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
768 ; AVX512-LABEL: splatvar_shift_v8i32:
770 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
771 ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
774 ; AVX512VL-LABEL: splatvar_shift_v8i32:
776 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
777 ; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
778 ; AVX512VL-NEXT: retq
780 ; X86-AVX1-LABEL: splatvar_shift_v8i32:
782 ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
783 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
784 ; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
785 ; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
786 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
787 ; X86-AVX1-NEXT: retl
789 ; X86-AVX2-LABEL: splatvar_shift_v8i32:
791 ; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
792 ; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
793 ; X86-AVX2-NEXT: retl
; An all-zero shuffle mask copies lane 0 of %b into every lane of %splat.
794 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
; Signed (sign-filling) right shift of each lane by the broadcast amount.
795 %shift = ashr <8 x i32> %a, %splat
; Uniform variable shift, <16 x i16>: every lane of %a is arithmetically
; shifted right by the same run-time amount, taken from lane 0 of %b.
; Every prefix below expects vpmovzxwq to zero-extend the low element of the
; amount register, followed by vpsraw taking its count from that xmm register.
; The AVX1-class prefixes (AVX1, XOPAVX1, X86-AVX1) shift the two 128-bit
; halves separately and rejoin them with vinsertf128.
; The check lines are autogenerated; regenerate them instead of hand-editing.
799 define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
800 ; AVX1-LABEL: splatvar_shift_v16i16:
802 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
803 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
804 ; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
805 ; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
806 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
809 ; AVX2-LABEL: splatvar_shift_v16i16:
811 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
812 ; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
815 ; XOPAVX1-LABEL: splatvar_shift_v16i16:
817 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
818 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
819 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
820 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
821 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
824 ; XOPAVX2-LABEL: splatvar_shift_v16i16:
826 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
827 ; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
830 ; AVX512-LABEL: splatvar_shift_v16i16:
832 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
833 ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
836 ; AVX512VL-LABEL: splatvar_shift_v16i16:
838 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
839 ; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
840 ; AVX512VL-NEXT: retq
842 ; X86-AVX1-LABEL: splatvar_shift_v16i16:
844 ; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
845 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
846 ; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
847 ; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
848 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
849 ; X86-AVX1-NEXT: retl
851 ; X86-AVX2-LABEL: splatvar_shift_v16i16:
853 ; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
854 ; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
855 ; X86-AVX2-NEXT: retl
; An all-zero shuffle mask copies lane 0 of %b into every lane of %splat.
856 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
; Signed (sign-filling) right shift of each lane by the broadcast amount.
857 %shift = ashr <16 x i16> %a, %splat
858 ret <16 x i16> %shift
; Uniform variable shift, <32 x i8>: every byte lane of %a is arithmetically
; shifted right by the same run-time amount, taken from lane 0 of %b.
; None of the expected sequences uses a direct per-byte arithmetic shift with a
; scalar count; the expectations below show three lowering shapes:
;   - AVX1, AVX2, AVX512DQ(VL) and the X86 prefixes: 16-bit logical shift
;     (vpsrlw), a byte mask built by shifting an all-ones register, and a
;     xor/subtract fix-up against an equally shifted 32896 (0x8080) constant.
;     AVX512DQVL folds the and/xor step into one vpternlogq.
;   - XOPAVX1 / XOPAVX2: vpshab with the broadcast amount negated.
;   - AVX512BW(VL): sign-extend bytes to words (vpmovsxbw), vpsraw, then
;     truncate back with vpmovwb.
; The check lines are autogenerated; regenerate them instead of hand-editing.
861 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
862 ; AVX1-LABEL: splatvar_shift_v32i8:
864 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
865 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
866 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
867 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
868 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
869 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
870 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
871 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
872 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
873 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
874 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
875 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
876 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
877 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
878 ; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
879 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
882 ; AVX2-LABEL: splatvar_shift_v32i8:
884 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
885 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
886 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
887 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
888 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
889 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
890 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
891 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
892 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
893 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
894 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
897 ; XOPAVX1-LABEL: splatvar_shift_v32i8:
899 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
900 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
901 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
902 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
903 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
904 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
905 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
908 ; XOPAVX2-LABEL: splatvar_shift_v32i8:
910 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
911 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
912 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
913 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
914 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2
915 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
916 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
919 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
921 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
922 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
923 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
924 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
925 ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
926 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
927 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
928 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
929 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
930 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
931 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
932 ; AVX512DQ-NEXT: retq
934 ; AVX512BW-LABEL: splatvar_shift_v32i8:
936 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
937 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
938 ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
939 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
940 ; AVX512BW-NEXT: retq
942 ; AVX512DQVL-LABEL: splatvar_shift_v32i8:
943 ; AVX512DQVL: # %bb.0:
944 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
945 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
946 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
947 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
948 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
949 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
950 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
951 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
952 ; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1
953 ; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
954 ; AVX512DQVL-NEXT: retq
956 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
957 ; AVX512BWVL: # %bb.0:
958 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
959 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
960 ; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
961 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
962 ; AVX512BWVL-NEXT: retq
964 ; X86-AVX1-LABEL: splatvar_shift_v32i8:
966 ; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
967 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
968 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
969 ; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
970 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
971 ; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
972 ; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
973 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
974 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
975 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
976 ; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
977 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
978 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
979 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
980 ; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
981 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
982 ; X86-AVX1-NEXT: retl
984 ; X86-AVX2-LABEL: splatvar_shift_v32i8:
986 ; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
987 ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
988 ; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
989 ; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
990 ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
991 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
992 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
993 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
994 ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
995 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
996 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
997 ; X86-AVX2-NEXT: retl
; An all-zero shuffle mask copies lane 0 of %b into every lane of %splat.
998 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
; Signed (sign-filling) right shift of each lane by the broadcast amount.
999 %shift = ashr <32 x i8> %a, %splat
1000 ret <32 x i8> %shift
; Non-uniform constant shift, <4 x i64>: lane i of %a is arithmetically shifted
; right by the compile-time amounts <1, 7, 31, 62>.
; The expectations below show these lowering shapes:
;   - AVX1 / X86-AVX1: immediate vpsrlq per amount, vpblendw to pick lanes,
;     then a vpxor/vpsubq fix-up against a per-lane constant.
;   - AVX2 / XOPAVX2 / X86-AVX2: per-lane vpsrlvq plus the same xor/sub fix-up
;     (X86-AVX2 also shifts the sign-bit constant with vpsrlvq).
;   - XOPAVX1: vpshaq with amounts loaded from the constant pool.
;   - AVX512 / AVX512VL: a single vpsravq.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1007 define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
1008 ; AVX1-LABEL: constant_shift_v4i64:
1010 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1011 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
1012 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
1013 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1014 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2]
1015 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
1016 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1017 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
1018 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
1019 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1020 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
1021 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1022 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1023 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1026 ; AVX2-LABEL: constant_shift_v4i64:
1028 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1029 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1030 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1031 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
1034 ; XOPAVX1-LABEL: constant_shift_v4i64:
1036 ; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1037 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1038 ; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1039 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1040 ; XOPAVX1-NEXT: retq
1042 ; XOPAVX2-LABEL: constant_shift_v4i64:
1044 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1045 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1046 ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1047 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
1048 ; XOPAVX2-NEXT: retq
1050 ; AVX512-LABEL: constant_shift_v4i64:
1052 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1053 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62]
1054 ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
1055 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1058 ; AVX512VL-LABEL: constant_shift_v4i64:
1059 ; AVX512VL: # %bb.0:
1060 ; AVX512VL-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1061 ; AVX512VL-NEXT: retq
1063 ; X86-AVX1-LABEL: constant_shift_v4i64:
1064 ; X86-AVX1: # %bb.0:
1065 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1066 ; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
1067 ; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
1068 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1069 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0]
1070 ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
1071 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1072 ; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
1073 ; X86-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
1074 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1075 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
1076 ; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1077 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1078 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1079 ; X86-AVX1-NEXT: retl
1081 ; X86-AVX2-LABEL: constant_shift_v4i64:
1082 ; X86-AVX2: # %bb.0:
1083 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
1084 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
1085 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
1086 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
1087 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
1088 ; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
1089 ; X86-AVX2-NEXT: retl
; Each lane gets its own constant amount; all amounts are below the 64-bit lane width.
1090 %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
1091 ret <4 x i64> %shift
; Non-uniform constant shift, <8 x i32>: lane i of %a is arithmetically shifted
; right by the compile-time amounts <4, 5, 6, 7, 8, 9, 8, 7>.
; The expectations below show these lowering shapes:
;   - AVX1 / X86-AVX1: one immediate vpsrad per distinct amount in each 128-bit
;     half, with vpblendw selecting the right result for every lane.
;   - XOPAVX1: vpshad with amounts loaded from the constant pool, per half.
;   - AVX2, XOPAVX2, AVX512, AVX512VL, X86-AVX2: a single vpsravd with a
;     constant-pool operand.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1094 define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
1095 ; AVX1-LABEL: constant_shift_v8i32:
1097 ; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
1098 ; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
1099 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1100 ; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
1101 ; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
1102 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1104 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1105 ; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1106 ; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
1107 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1108 ; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
1109 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1110 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1113 ; AVX2-LABEL: constant_shift_v8i32:
1115 ; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1118 ; XOPAVX1-LABEL: constant_shift_v8i32:
1120 ; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1121 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1122 ; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1123 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1124 ; XOPAVX1-NEXT: retq
1126 ; XOPAVX2-LABEL: constant_shift_v8i32:
1128 ; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1129 ; XOPAVX2-NEXT: retq
1131 ; AVX512-LABEL: constant_shift_v8i32:
1133 ; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1136 ; AVX512VL-LABEL: constant_shift_v8i32:
1137 ; AVX512VL: # %bb.0:
1138 ; AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1139 ; AVX512VL-NEXT: retq
1141 ; X86-AVX1-LABEL: constant_shift_v8i32:
1142 ; X86-AVX1: # %bb.0:
1143 ; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
1144 ; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
1145 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1146 ; X86-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
1147 ; X86-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
1148 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1149 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1150 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1151 ; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1152 ; X86-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
1153 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1154 ; X86-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
1155 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1156 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1157 ; X86-AVX1-NEXT: retl
1159 ; X86-AVX2-LABEL: constant_shift_v8i32:
1160 ; X86-AVX2: # %bb.0:
1161 ; X86-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1162 ; X86-AVX2-NEXT: retl
; Each lane gets its own constant amount; the upper half (8, 9, 8, 7) repeats two values.
1163 %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
1164 ret <8 x i32> %shift
; Non-uniform constant shift, <16 x i16>: lane i of %a is arithmetically
; shifted right by i (amounts 0 through 15).
; The expectations below show these lowering shapes:
;   - AVX1, AVX2 and the X86 prefixes: vpmulhw against a constant-pool vector,
;     with vpblendw patching lane 0 from the unshifted input and lane 1 from an
;     explicit vpsraw $1.
;   - XOPAVX1 / XOPAVX2: vpshaw with constant-pool amounts on each 128-bit half.
;   - AVX512DQ(VL): sign-extend to 32-bit lanes (vpmovsxwd), vpsravd, then
;     truncate back with vpmovdw.
;   - AVX512BW(VL): vpsravw (on a widened zmm register without VL).
; The check lines are autogenerated; regenerate them instead of hand-editing.
1167 define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
1168 ; AVX1-LABEL: constant_shift_v16i16:
1170 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1171 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1172 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
1173 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
1174 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1175 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1176 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1179 ; AVX2-LABEL: constant_shift_v16i16:
1181 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1182 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1183 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
1184 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
1185 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1188 ; XOPAVX1-LABEL: constant_shift_v16i16:
1190 ; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1191 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1192 ; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1193 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1194 ; XOPAVX1-NEXT: retq
1196 ; XOPAVX2-LABEL: constant_shift_v16i16:
1198 ; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1199 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1200 ; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1201 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1202 ; XOPAVX2-NEXT: retq
1204 ; AVX512DQ-LABEL: constant_shift_v16i16:
1205 ; AVX512DQ: # %bb.0:
1206 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
1207 ; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1208 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
1209 ; AVX512DQ-NEXT: retq
1211 ; AVX512BW-LABEL: constant_shift_v16i16:
1212 ; AVX512BW: # %bb.0:
1213 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1214 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1215 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
1216 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1217 ; AVX512BW-NEXT: retq
1219 ; AVX512DQVL-LABEL: constant_shift_v16i16:
1220 ; AVX512DQVL: # %bb.0:
1221 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
1222 ; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1223 ; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
1224 ; AVX512DQVL-NEXT: retq
1226 ; AVX512BWVL-LABEL: constant_shift_v16i16:
1227 ; AVX512BWVL: # %bb.0:
1228 ; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1229 ; AVX512BWVL-NEXT: retq
1231 ; X86-AVX1-LABEL: constant_shift_v16i16:
1232 ; X86-AVX1: # %bb.0:
1233 ; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
1234 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1235 ; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
1236 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
1237 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1238 ; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1239 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1240 ; X86-AVX1-NEXT: retl
1242 ; X86-AVX2-LABEL: constant_shift_v16i16:
1243 ; X86-AVX2: # %bb.0:
1244 ; X86-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
1245 ; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1246 ; X86-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
1247 ; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
1248 ; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1249 ; X86-AVX2-NEXT: retl
; Lane i is shifted by i, covering every in-range amount for a 16-bit lane.
1250 %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1251 ret <16 x i16> %shift
; Non-uniform constant shift, <32 x i8>: each byte lane of %a is arithmetically
; shifted right by a compile-time amount. The amounts run 0..7 then 7..0, and
; that 16-byte pattern is repeated for the upper 16 lanes.
; The expectations below show these lowering shapes:
;   - AVX1, AVX2, AVX512DQ(VL) and the X86 prefixes: unpack bytes into 16-bit
;     lanes (vpunpckhbw/vpunpcklbw), vpsraw $8, multiply by per-lane powers of
;     two (vpmullw), vpsrlw $8, then repack with vpackuswb.
;   - XOPAVX1 / XOPAVX2: vpshab with the amount vector
;     [0,255,254,...,249,249,...,255,0] applied to each 128-bit half.
;   - AVX512BW(VL): sign-extend bytes to words (vpmovsxbw), vpsravw, then
;     truncate back with vpmovwb.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1254 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
1255 ; AVX1-LABEL: constant_shift_v32i8:
1257 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1258 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1259 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
1260 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
1261 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1262 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1263 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1264 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
1265 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
1266 ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
1267 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1268 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1269 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1270 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
1271 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1272 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1273 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1274 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
1275 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
1276 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1277 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1278 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1281 ; AVX2-LABEL: constant_shift_v32i8:
1283 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1284 ; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
1285 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1286 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1287 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1288 ; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
1289 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1290 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1291 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1294 ; XOPAVX1-LABEL: constant_shift_v32i8:
1296 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1297 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1298 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
1299 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
1300 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1301 ; XOPAVX1-NEXT: retq
1303 ; XOPAVX2-LABEL: constant_shift_v32i8:
1305 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1306 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1307 ; XOPAVX2-NEXT: vpshab %xmm2, %xmm1, %xmm1
1308 ; XOPAVX2-NEXT: vpshab %xmm2, %xmm0, %xmm0
1309 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1310 ; XOPAVX2-NEXT: retq
1312 ; AVX512DQ-LABEL: constant_shift_v32i8:
1313 ; AVX512DQ: # %bb.0:
1314 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1315 ; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1
1316 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1317 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
1318 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1319 ; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
1320 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1321 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
1322 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1323 ; AVX512DQ-NEXT: retq
1325 ; AVX512BW-LABEL: constant_shift_v32i8:
1326 ; AVX512BW: # %bb.0:
1327 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
1328 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1329 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1330 ; AVX512BW-NEXT: retq
1332 ; AVX512DQVL-LABEL: constant_shift_v32i8:
1333 ; AVX512DQVL: # %bb.0:
1334 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1335 ; AVX512DQVL-NEXT: vpsraw $8, %ymm1, %ymm1
1336 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1337 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
1338 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1339 ; AVX512DQVL-NEXT: vpsraw $8, %ymm0, %ymm0
1340 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1341 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
1342 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1343 ; AVX512DQVL-NEXT: retq
1345 ; AVX512BWVL-LABEL: constant_shift_v32i8:
1346 ; AVX512BWVL: # %bb.0:
1347 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
1348 ; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1349 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
1350 ; AVX512BWVL-NEXT: retq
1352 ; X86-AVX1-LABEL: constant_shift_v32i8:
1353 ; X86-AVX1: # %bb.0:
1354 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1355 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1356 ; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
1357 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
1358 ; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1359 ; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1360 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1361 ; X86-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
1362 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
1363 ; X86-AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
1364 ; X86-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1365 ; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1366 ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1367 ; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
1368 ; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1369 ; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1370 ; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1371 ; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
1372 ; X86-AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
1373 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1374 ; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1375 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1376 ; X86-AVX1-NEXT: retl
1378 ; X86-AVX2-LABEL: constant_shift_v32i8:
1379 ; X86-AVX2: # %bb.0:
1380 ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1381 ; X86-AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
1382 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
1383 ; X86-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1384 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1385 ; X86-AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
1386 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1387 ; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1388 ; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1389 ; X86-AVX2-NEXT: retl
; Amounts ascend 0..7 then descend 7..0; the same 16 amounts are used for both 128-bit halves.
1390 %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1391 ret <32 x i8> %shift
1395 ; Uniform Constant Shifts
; Test case - arithmetic shift right of every i64 lane of a <4 x i64> by the
; uniform constant 7 (the ashr at the end of this function).
; The assertions below are autogenerated (update_llc_test_checks.py, per the
; file header); regenerate them instead of editing them by hand.
; What the expected output shows for each run configuration -
;  * AVX1 and X86-AVX1 runs split the vector into 128-bit halves; each half is
;    a blend of vpsrad $7 (odd dwords) with vpsrlq $7 (even dwords).
;  * AVX2, XOPAVX2 and X86-AVX2 runs perform the same blend on the whole ymm.
;  * The XOPAVX1 run uses vpshaq with a splat of 18446744073709551609, which is
;    2^64 - 7.
;  * AVX512 runs without VL widen to zmm for vpsraq $7; the VL runs use
;    vpsraq $7 directly on ymm.
1398 define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
1399 ; AVX1-LABEL: splatconstant_shift_v4i64:
1401 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1402 ; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
1403 ; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
1404 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1405 ; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1406 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
1407 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1408 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1411 ; AVX2-LABEL: splatconstant_shift_v4i64:
1413 ; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
1414 ; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1415 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1418 ; XOPAVX1-LABEL: splatconstant_shift_v4i64:
1420 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1421 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
1422 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
1423 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
1424 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1425 ; XOPAVX1-NEXT: retq
1427 ; XOPAVX2-LABEL: splatconstant_shift_v4i64:
1429 ; XOPAVX2-NEXT: vpsrad $7, %ymm0, %ymm1
1430 ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1431 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1432 ; XOPAVX2-NEXT: retq
1434 ; AVX512-LABEL: splatconstant_shift_v4i64:
1436 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1437 ; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
1438 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1441 ; AVX512VL-LABEL: splatconstant_shift_v4i64:
1442 ; AVX512VL: # %bb.0:
1443 ; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
1444 ; AVX512VL-NEXT: retq
1446 ; X86-AVX1-LABEL: splatconstant_shift_v4i64:
1447 ; X86-AVX1: # %bb.0:
1448 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1449 ; X86-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
1450 ; X86-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
1451 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1452 ; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
1453 ; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
1454 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1455 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1456 ; X86-AVX1-NEXT: retl
1458 ; X86-AVX2-LABEL: splatconstant_shift_v4i64:
1459 ; X86-AVX2: # %bb.0:
1460 ; X86-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
1461 ; X86-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
1462 ; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1463 ; X86-AVX2-NEXT: retl
1464 %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
1465 ret <4 x i64> %shift
; Test case - arithmetic shift right of every i32 lane of an <8 x i32> by the
; uniform constant 5 (the ashr at the end of this function).
; The assertions below are autogenerated (update_llc_test_checks.py, per the
; file header); regenerate them instead of editing them by hand.
; What the expected output shows for each run configuration -
;  * AVX1, XOPAVX1 and X86-AVX1 runs apply vpsrad $5 to each 128-bit half and
;    rejoin the halves with vinsertf128.
;  * All other runs use a single vpsrad $5 on the ymm register.
1468 define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
1469 ; AVX1-LABEL: splatconstant_shift_v8i32:
1471 ; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
1472 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1473 ; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
1474 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1477 ; AVX2-LABEL: splatconstant_shift_v8i32:
1479 ; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
1482 ; XOPAVX1-LABEL: splatconstant_shift_v8i32:
1484 ; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm1
1485 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1486 ; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm0
1487 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1488 ; XOPAVX1-NEXT: retq
1490 ; XOPAVX2-LABEL: splatconstant_shift_v8i32:
1492 ; XOPAVX2-NEXT: vpsrad $5, %ymm0, %ymm0
1493 ; XOPAVX2-NEXT: retq
1495 ; AVX512-LABEL: splatconstant_shift_v8i32:
1497 ; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
1500 ; AVX512VL-LABEL: splatconstant_shift_v8i32:
1501 ; AVX512VL: # %bb.0:
1502 ; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
1503 ; AVX512VL-NEXT: retq
1505 ; X86-AVX1-LABEL: splatconstant_shift_v8i32:
1506 ; X86-AVX1: # %bb.0:
1507 ; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
1508 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1509 ; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
1510 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1511 ; X86-AVX1-NEXT: retl
1513 ; X86-AVX2-LABEL: splatconstant_shift_v8i32:
1514 ; X86-AVX2: # %bb.0:
1515 ; X86-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
1516 ; X86-AVX2-NEXT: retl
1517 %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
1518 ret <8 x i32> %shift
; Test case - arithmetic shift right of every i16 lane of a <16 x i16> by the
; uniform constant 3 (the ashr at the end of this function).
; The assertions below are autogenerated (update_llc_test_checks.py, per the
; file header); regenerate them instead of editing them by hand.
; What the expected output shows for each run configuration -
;  * AVX1, XOPAVX1 and X86-AVX1 runs apply vpsraw $3 to each 128-bit half and
;    rejoin the halves with vinsertf128.
;  * All other runs use a single vpsraw $3 on the ymm register.
1521 define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
1522 ; AVX1-LABEL: splatconstant_shift_v16i16:
1524 ; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
1525 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1526 ; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
1527 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1530 ; AVX2-LABEL: splatconstant_shift_v16i16:
1532 ; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
1535 ; XOPAVX1-LABEL: splatconstant_shift_v16i16:
1537 ; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm1
1538 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1539 ; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm0
1540 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1541 ; XOPAVX1-NEXT: retq
1543 ; XOPAVX2-LABEL: splatconstant_shift_v16i16:
1545 ; XOPAVX2-NEXT: vpsraw $3, %ymm0, %ymm0
1546 ; XOPAVX2-NEXT: retq
1548 ; AVX512-LABEL: splatconstant_shift_v16i16:
1550 ; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
1553 ; AVX512VL-LABEL: splatconstant_shift_v16i16:
1554 ; AVX512VL: # %bb.0:
1555 ; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
1556 ; AVX512VL-NEXT: retq
1558 ; X86-AVX1-LABEL: splatconstant_shift_v16i16:
1559 ; X86-AVX1: # %bb.0:
1560 ; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
1561 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1562 ; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
1563 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1564 ; X86-AVX1-NEXT: retl
1566 ; X86-AVX2-LABEL: splatconstant_shift_v16i16:
1567 ; X86-AVX2: # %bb.0:
1568 ; X86-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
1569 ; X86-AVX2-NEXT: retl
1570 %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1571 ret <16 x i16> %shift
; Test case - arithmetic shift right of every i8 lane of a <32 x i8> by the
; uniform constant 3 (the ashr at the end of this function).
; The assertions below are autogenerated (update_llc_test_checks.py, per the
; file header); regenerate them instead of editing them by hand.
; What the expected output shows for each run configuration -
;  * Most runs emit vpsrlw $3 (a word-granular logical shift), a vpand mask,
;    then vpxor and vpsubb with a splat of 16. The mask is visible as a splat
;    of 31 on the AVX1 runs and is a constant-pool operand elsewhere.
;  * AVX1 and X86-AVX1 runs do this per 128-bit half; the AVX2-style runs do
;    it once on ymm.
;  * The XOPAVX1 run uses vpshab with a splat of 253 (256 - 3) on each half.
;  * The AVX512VL runs replace the separate vpand/vpxor with one
;    vpternlogq $108 that takes the constant-pool operand.
1574 define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
1575 ; AVX1-LABEL: splatconstant_shift_v32i8:
1577 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1578 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
1579 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1580 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1581 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1582 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1583 ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
1584 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1585 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1586 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
1587 ; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
1588 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1591 ; AVX2-LABEL: splatconstant_shift_v32i8:
1593 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1594 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1595 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1596 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1597 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1600 ; XOPAVX1-LABEL: splatconstant_shift_v32i8:
1602 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1603 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
1604 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
1605 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
1606 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1607 ; XOPAVX1-NEXT: retq
1609 ; XOPAVX2-LABEL: splatconstant_shift_v32i8:
1611 ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1612 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1613 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1614 ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1615 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1616 ; XOPAVX2-NEXT: retq
1618 ; AVX512-LABEL: splatconstant_shift_v32i8:
1620 ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
1621 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1622 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1623 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
1624 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1627 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
1628 ; AVX512VL: # %bb.0:
1629 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
1630 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1631 ; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1632 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1633 ; AVX512VL-NEXT: retq
1635 ; X86-AVX1-LABEL: splatconstant_shift_v32i8:
1636 ; X86-AVX1: # %bb.0:
1637 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1638 ; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
1639 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1640 ; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1641 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1642 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1643 ; X86-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
1644 ; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
1645 ; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1646 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
1647 ; X86-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
1648 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1649 ; X86-AVX1-NEXT: retl
1651 ; X86-AVX2-LABEL: splatconstant_shift_v32i8:
1652 ; X86-AVX2: # %bb.0:
1653 ; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
1654 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1655 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1656 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
1657 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
1658 ; X86-AVX2-NEXT: retl
1659 %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1660 ret <32 x i8> %shift
; Test case - arithmetic shift right of every i64 lane of a <4 x i64> by the
; constant 32, i.e. exactly half the element width (the ashr at the end of
; this function).
; The assertions below are autogenerated (update_llc_test_checks.py, per the
; file header); regenerate them instead of editing them by hand.
; What the expected output shows for each run configuration -
;  * AVX1 and X86-AVX1 runs work per 128-bit half; vpshufd [1,1,3,3] copies
;    each odd dword into the even slot below it, and vpsrad $31 supplies the
;    odd dwords of the blended result.
;  * AVX2, XOPAVX2 and X86-AVX2 runs do the same shuffle/blend on the whole ymm.
;  * The XOPAVX1 run uses vpshaq with a splat of 18446744073709551584, which is
;    2^64 - 32.
;  * AVX512 runs without VL widen to zmm for vpsraq $32; the VL runs use
;    vpsraq $32 directly on ymm.
1667 define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
1668 ; AVX1-LABEL: shift32_v4i64:
1670 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1671 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1672 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1673 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1674 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1675 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1676 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1677 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1680 ; AVX2-LABEL: shift32_v4i64:
1682 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
1683 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1684 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1687 ; XOPAVX1-LABEL: shift32_v4i64:
1689 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1690 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
1691 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
1692 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
1693 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1694 ; XOPAVX1-NEXT: retq
1696 ; XOPAVX2-LABEL: shift32_v4i64:
1698 ; XOPAVX2-NEXT: vpsrad $31, %ymm0, %ymm1
1699 ; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1700 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1701 ; XOPAVX2-NEXT: retq
1703 ; AVX512-LABEL: shift32_v4i64:
1705 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1706 ; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
1707 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1710 ; AVX512VL-LABEL: shift32_v4i64:
1711 ; AVX512VL: # %bb.0:
1712 ; AVX512VL-NEXT: vpsraq $32, %ymm0, %ymm0
1713 ; AVX512VL-NEXT: retq
1715 ; X86-AVX1-LABEL: shift32_v4i64:
1716 ; X86-AVX1: # %bb.0:
1717 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1718 ; X86-AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1719 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1720 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1721 ; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1722 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1723 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1724 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1725 ; X86-AVX1-NEXT: retl
1727 ; X86-AVX2-LABEL: shift32_v4i64:
1728 ; X86-AVX2: # %bb.0:
1729 ; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
1730 ; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1731 ; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1732 ; X86-AVX2-NEXT: retl
1733 %shift = ashr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
1734 ret <4 x i64> %shift