; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx  | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;

define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; X86-AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

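; AVX2's variable shifts only come in 32- and 64-bit element widths
; (vpsrlvd/vpsrlvq); a per-element 16-bit shift (vpsrlvw) requires AVX-512BW.
; So for v16i16 the pre-BW lowerings below either widen to 32-bit lanes and
; use vpsrlvd, or (AVX1) select among fixed shifts with a vpblendvb ladder.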
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

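; Without XOP's vpshlb there is no per-byte variable shift, so v32i8 uses a
; blend ladder: vpsllw $5 moves the top bit of each 3-bit amount into the
; byte's sign bit for vpblendvb to select x>>4, then the amounts are doubled
; (vpaddb) to test the next bit and select x>>2 and x>>1; the vpand masks
; clear the bits that vpsrlw lets cross byte boundaries.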
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; X86-AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

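; All lanes are shifted by the same amount, splatted from element 0 of %b.
; Targets can therefore use the shift-by-scalar instruction forms, which take
; the count from the low 64 bits of an xmm register (e.g. vpsrlq %xmm1, %ymm0).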
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

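; For a splatted v32i8 shift the input is shifted with the 16-bit vpsrlw and
; the bits that leak in from each byte's neighbor are masked off; the mask is
; produced by applying the same shift to an all-ones register (vpcmpeqd) and
; broadcasting the resulting byte.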
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; X86-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

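; As above, but the shift amount is first reduced with an 'and' against the
; element width minus one (63/31/15/7), so the splatted count is always in
; range for the element type.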
define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <4 x i64> %mod, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_modulo_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_modulo_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <8 x i32> %mod, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_modulo_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_modulo_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <16 x i16> %mod, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_modulo_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; X86-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; X86-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <32 x i8> %mod, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: constant_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
; X86-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: constant_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: constant_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: constant_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}

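; For per-lane constant amounts, lshr of a 16-bit lane by k is the high half
; of an unsigned multiply by 2^(16-k), so vpmulhuw with a constant-pool vector
; handles all lanes at once. A shift of 0 would need the out-of-range
; multiplier 65536, hence the 'u' lane and the vpblendw that re-inserts the
; unshifted element 0.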
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
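; Note on the AVX512BW block above: without AVX512VL, vpsrlvw exists only in
; the 512-bit form, so the ymm value is implicitly widened to zmm (the
; '# kill' comments track the register-class change) and the shift runs at
; 512 bits; the AVX512BWVL block below uses the native 256-bit vpsrlvw with a
; memory operand instead.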
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X86-AVX2-NEXT: retl
  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}
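
; The <32 x i8> case below uses the standard byte-shift emulation: x86 has no
; byte shift, so each half is interleaved with zero into 16-bit lanes and
; multiplied by 2^(8-k), which places the wanted bits in the high byte
; ((zext(x) * 2^(8-k)) >> 8 == x >>u k); a vpsrlw $8 and vpackuswb then
; repack the bytes. XOP targets shift bytes directly with vpshlb, encoding
; right shifts as negative counts.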
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;
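
; When every lane shifts by the same constant amount, the immediate-form
; shifts (vpsrlq/vpsrld/vpsrlw $k) are enough and no variable-shift
; instruction is needed; AVX1, which lacks 256-bit integer shifts, applies
; the immediate shift to each 128-bit half and recombines with vinsertf128.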
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
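
; Bytes again need extra work: with no byte-granular hardware shift, the
; splat lshr by 3 below is done as a 16-bit vpsrlw $3 followed by a vpand
; with 0xff >> 3 = 31 in every byte, clearing the bits shifted in from the
; neighbouring byte; XOP uses vpshlb with a splat count of -3 (253 as an
; unsigned byte).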
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

;
; Special Cases
;
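
; A logical right shift by exactly 32 just moves the high dword of each i64
; lane down, so AVX1 (which has no 256-bit vpsrlq) can lower it with vshufps
; alone: interleave the odd dwords with zero, then reorder, with no shift
; instruction at all. AVX2 and later use the immediate vpsrlq $32.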
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: shift32_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: shift32_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = lshr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
  ret <4 x i64> %shift
}
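
; The final test checks that a shift/trunc/shift sequence folds: lshr by 24
; on i64, trunc to i32, then lshr by 12 extracts bits [36,55] of the original
; i64, so a single vpsrlq $36, a truncating shuffle, and a mask of
; 2^20 - 1 = 1048575 suffice.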
define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
; AVX1-LABEL: sh_trunc_sh_vec:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sh_trunc_sh_vec:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: sh_trunc_sh_vec:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vzeroupper
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: sh_trunc_sh_vec:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; XOPAVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; XOPAVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: sh_trunc_sh_vec:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $36, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: sh_trunc_sh_vec:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $36, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: sh_trunc_sh_vec:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
; X86-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: sh_trunc_sh_vec:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; X86-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
  %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24>
  %t = trunc <4 x i64> %s to <4 x i32>
  %r = lshr <4 x i32> %t, <i32 12, i32 12, i32 12, i32 12>
  ret <4 x i32> %r
}