; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;

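; Editorial note (a sketch, not part of the autogenerated checks): targets
; without a variable arithmetic i64 shift build ashr from logical shifts via
; the sign-extension identity
;   ashr(x, n) == (lshr(x, n) ^ m) - m,  where m = lshr(0x8000000000000000, n)
; Shifting the sign-bit mask by the same amount, xor'ing it in, and then
; subtracting it back out replicates the sign bit into the vacated high bits.
; The vpsrlq-of-[9223372036854775808,...] / vpxor / vpsubq sequences in
; var_shift_v4i64 below are instances of this pattern.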
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: # xmm3 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

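; Editorial note (a sketch, not part of the autogenerated checks): AVX1 has
; no per-element i32 shift, so for each 128-bit half the four shift amounts
; are isolated into the bottom 64 bits one at a time (vpsrldq $12, vpsrlq
; $32, vpunpckhdq with zero, vpmovzxdq), four uniform vpsrad shifts are
; issued, and vpblendw stitches the live element of each result back
; together. AVX2 and AVX-512 collapse the whole function into one vpsravd.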
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshad %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; X86-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; X86-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; X86-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

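; Editorial note (a sketch, not part of the autogenerated checks): with no
; variable i16 shift before AVX-512 BW, AVX1 uses a blend ladder: vpsllw
; $12/vpsllw $4/vpor move the 4-bit amount up to the mask's sign bits, then
; vpblendvb conditionally substitutes vpsraw-by-8/4/2/1 results, doubling
; the mask with vpaddw between steps so the next amount bit reaches the sign
; position. AVX2 widens each half to i32 lanes (vpunpckhwd/vpunpcklwd with
; zero), shifts with vpsravd, and repacks via vpsrld $16 + vpackusdw.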
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshaw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

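; Editorial note (a sketch, not part of the autogenerated checks): i8 shifts
; reuse the i16 blend ladder. vpsllw $5 moves the 3-bit byte amount to the
; top of each mask byte, the data bytes are duplicated into both halves of a
; word (vpunpckhbw/vpunpcklbw with themselves) so vpsraw by 4/2/1 shifts in
; genuine sign bits, and vpsrlw $8 + vpackuswb extract and repack the
; shifted high bytes. AVX512BW instead sign-extends to <32 x i16> in a zmm
; register and uses a single vpsravw.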
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshab %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; X86-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; X86-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

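; Editorial note (a sketch, not part of the autogenerated checks): when the
; shift amount is a splat, the lowering keeps it as a scalar in xmm1 and uses
; the shift-by-xmm instruction forms (vpsrlq/vpsrad/vpsraw %xmm1, ...), so a
; single shift covers the whole vector; the i64 case still needs the xor/sub
; sign fix, but with just one pre-shifted sign mask.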
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

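; Editorial note (a sketch, not part of the autogenerated checks): a uniform
; i8 shift is performed as a word shift. vpsrlw shifts 16-bit lanes; an
; all-ones register shifted by the same amount (vpcmpeqd + vpsrlw, then a
; byte splat) masks off the bits that leaked in from the neighbouring byte,
; and a right-shifted 0x80 pattern (the 32896 = 0x8080 words) restores the
; sign with the usual xor/sub identity.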
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

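; Editorial note (a sketch, not part of the autogenerated checks): these
; variants mask the amount with the element bit width minus one before
; splatting, so it is always in range. The lowerings therefore match the
; plain splatvar cases above, plus a single vpand of the scalar amount
; against a constant-pool mask.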
define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <4 x i64> %mod, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <8 x i32> %mod, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <16 x i16> %mod, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}


define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <32 x i8> %mod, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}
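
; NOTE: Reader aid, not an autogenerated assertion: x86 has no byte-granular
; shift, so the AVX1/AVX2 lowerings above shift 16-bit words logically, mask
; off the bits that crossed a byte boundary (the mask is produced by shifting
; all-ones), and then repair the sign with the identity
;   ashr(x, s) == (lshr(x, s) ^ m) - m, where m = lshr(0x80, s) per byte
; (the 32896 = 0x8080 constant supplies 0x80 in every byte). AVX512BW instead
; sign-extends to 32 x i16, shifts with vpsraw, and truncates back via vpmovwb.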

;
; Constant Shifts
;

define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62]
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1073741824,0,16777216,0,1,2,0]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}
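
; NOTE: Reader aid, not an autogenerated assertion: before AVX512 there is no
; vector vpsraq, so the constant i64 shifts above are logical shifts plus a
; sign fixup, ashr(x, s) == (lshr(x, s) ^ m) - m with m = lshr(2^63, s). The
; xor/sub constants [4611686018427387904,72057594037927936,4294967296,2] are
; exactly 2^63 shifted right by 1, 7, 31 and 62.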

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}
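
; NOTE: Reader aid, not an autogenerated assertion: lacking AVX2's vpsravd, the
; AVX1 lowering above materializes each distinct dword amount with an immediate
; vpsrad and stitches the lanes back together with vpblendw, per 128-bit half.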

define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}
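
; NOTE: Reader aid, not an autogenerated assertion: a constant i16 ashr by s
; can be done as a signed high multiply, since vpmulhw computes (x * c) >> 16
; and c = 2^(16-s) therefore yields x >> s. Amounts 0 and 1 are patched in with
; blends because 2^16 and 2^15 do not fit in a positive i16 multiplier.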

define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX2-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}
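
; NOTE: Reader aid, not an autogenerated assertion: for per-byte constant
; amounts, each byte is unpacked into the high half of a word, vpsraw $8
; sign-extends it, vpmullw by 2^(8-s) performs the shift in the word domain,
; and vpsrlw $8 + vpackuswb move the results back to bytes; the [2,4,...,256]
; and [256,...,4,2] vectors are those 2^(8-s) multipliers.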

;
; Uniform Constant Shifts
;

define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; X86-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; X86-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}
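
; NOTE: Reader aid, not an autogenerated assertion: for a uniform i64 ashr by
; 7, AVX combines vpsrad $7 (which gives the correct, sign-filled high dwords)
; with vpsrlq $7 (which gives the correct low dwords) and blends the two,
; sidestepping the missing 256-bit vpsraq.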

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}
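
; NOTE: Reader aid, not an autogenerated assertion: uniform dword (and, below,
; word) amounts map directly onto vpsrad/vpsraw immediates; on AVX1 the only
; extra work is splitting the ymm into halves with vextractf128 and rejoining
; them with vinsertf128.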

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}
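
; NOTE: Reader aid, not an autogenerated assertion: the uniform byte shift
; above is the logical-shift-plus-fixup trick with its constants folded: the
; AND mask 31 is 0xff >> 3 and the sign constant 16 is 0x80 >> 3, giving
; ashr(x, 3) == ((lshr(x, 3) & 31) ^ 16) - 16 per byte. AVX512VL folds the
; and/xor pair into a single vpternlogq.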

;
; Special Cases
;

define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: shift32_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: shift32_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $32, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
  ret <4 x i64> %shift
}
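
; NOTE: Reader aid, not an autogenerated assertion: an i64 ashr by exactly 32
; needs no 64-bit shift at all: the high dwords are moved into the low
; positions by the [1,1,3,3] shuffle, and the new high dwords are taken from
; vpsrad $31, i.e. pure sign bits.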

define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) {
; AVX1-LABEL: PR52719:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR52719:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: PR52719:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd %edi, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaq %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: PR52719:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovd %edi, %xmm1
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: PR52719:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vmovd %edi, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: PR52719:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovd %edi, %xmm1
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: PR52719:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: PR52719:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %vec = insertelement <4 x i32> poison, i32 %a1, i64 0
  %splat = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  %zext = zext <4 x i32> %splat to <4 x i64>
  %ashr = ashr <4 x i64> %a0, %zext
  ret <4 x i64> %ashr
}
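
; NOTE: Reader aid, not an autogenerated assertion: PR52719 shifts by a scalar
; i32 that is zero-extended and splatted, so the amount is uniform. AVX512 can
; feed it straight to vpsraq from an xmm; the AVX1/AVX2 fallbacks reuse the
; lshr-xor-sub sign fixup with m = lshr(2^63, s).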