; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
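
; Variable Rotates
; Each test below writes a per-element rotate-left as (a << b) | (a >> (width - b))
; and checks the lowering for every subtarget covered by the RUN lines above.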
define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21 ; SSE2-LABEL: var_rotate_v2i64:
23 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
24 ; SSE2-NEXT: psubq %xmm1, %xmm2
25 ; SSE2-NEXT: movdqa %xmm0, %xmm3
26 ; SSE2-NEXT: psllq %xmm1, %xmm3
27 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
28 ; SSE2-NEXT: movdqa %xmm0, %xmm4
29 ; SSE2-NEXT: psllq %xmm1, %xmm4
30 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
31 ; SSE2-NEXT: movdqa %xmm0, %xmm1
32 ; SSE2-NEXT: psrlq %xmm2, %xmm1
33 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34 ; SSE2-NEXT: psrlq %xmm2, %xmm0
35 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
36 ; SSE2-NEXT: orpd %xmm4, %xmm0
39 ; SSE41-LABEL: var_rotate_v2i64:
41 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
42 ; SSE41-NEXT: psubq %xmm1, %xmm2
43 ; SSE41-NEXT: movdqa %xmm0, %xmm3
44 ; SSE41-NEXT: psllq %xmm1, %xmm3
45 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
46 ; SSE41-NEXT: movdqa %xmm0, %xmm4
47 ; SSE41-NEXT: psllq %xmm1, %xmm4
48 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
49 ; SSE41-NEXT: movdqa %xmm0, %xmm1
50 ; SSE41-NEXT: psrlq %xmm2, %xmm1
51 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
52 ; SSE41-NEXT: psrlq %xmm2, %xmm0
53 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
54 ; SSE41-NEXT: por %xmm4, %xmm0
57 ; AVX1-LABEL: var_rotate_v2i64:
59 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
60 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
61 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
62 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
63 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
64 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
65 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
66 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
67 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
68 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
69 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
72 ; AVX2-LABEL: var_rotate_v2i64:
74 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
75 ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
76 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
77 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
78 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
81 ; AVX512F-LABEL: var_rotate_v2i64:
83 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
84 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
85 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
86 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
87 ; AVX512F-NEXT: vzeroupper
90 ; AVX512VL-LABEL: var_rotate_v2i64:
92 ; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
95 ; AVX512BW-LABEL: var_rotate_v2i64:
97 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
98 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
99 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
100 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
101 ; AVX512BW-NEXT: vzeroupper
102 ; AVX512BW-NEXT: retq
104 ; AVX512VLBW-LABEL: var_rotate_v2i64:
105 ; AVX512VLBW: # %bb.0:
106 ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
107 ; AVX512VLBW-NEXT: retq
109 ; XOP-LABEL: var_rotate_v2i64:
111 ; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
114 ; X32-SSE-LABEL: var_rotate_v2i64:
116 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0]
117 ; X32-SSE-NEXT: psubq %xmm1, %xmm2
118 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
119 ; X32-SSE-NEXT: psllq %xmm1, %xmm3
120 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
121 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
122 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
123 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
124 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
125 ; X32-SSE-NEXT: psrlq %xmm2, %xmm1
126 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
127 ; X32-SSE-NEXT: psrlq %xmm2, %xmm0
128 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
129 ; X32-SSE-NEXT: orpd %xmm4, %xmm0
  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
  %shl = shl <2 x i64> %a, %b
  %lshr = lshr <2 x i64> %a, %b64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}
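
; Same pattern for <4 x i32>: rotate each 32-bit lane left by the matching lane of %b.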
define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
139 ; SSE2-LABEL: var_rotate_v4i32:
141 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
142 ; SSE2-NEXT: pslld $23, %xmm1
143 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
144 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
145 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
146 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
147 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
148 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
149 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
150 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
151 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
152 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
153 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
154 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
155 ; SSE2-NEXT: por %xmm3, %xmm0
158 ; SSE41-LABEL: var_rotate_v4i32:
160 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
161 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
162 ; SSE41-NEXT: pslld $23, %xmm1
163 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
164 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
165 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
166 ; SSE41-NEXT: pmuludq %xmm2, %xmm3
167 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
168 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
169 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
170 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
171 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
172 ; SSE41-NEXT: por %xmm1, %xmm0
175 ; AVX1-LABEL: var_rotate_v4i32:
177 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
178 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
179 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
180 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
181 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
182 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
183 ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
184 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
185 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
186 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
187 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
188 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
189 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
192 ; AVX2-LABEL: var_rotate_v4i32:
194 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
195 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
196 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
197 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
198 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
199 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
200 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
203 ; AVX512F-LABEL: var_rotate_v4i32:
205 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
206 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
207 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
208 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
209 ; AVX512F-NEXT: vzeroupper
212 ; AVX512VL-LABEL: var_rotate_v4i32:
214 ; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
215 ; AVX512VL-NEXT: retq
217 ; AVX512BW-LABEL: var_rotate_v4i32:
219 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
220 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
221 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
222 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
223 ; AVX512BW-NEXT: vzeroupper
224 ; AVX512BW-NEXT: retq
226 ; AVX512VLBW-LABEL: var_rotate_v4i32:
227 ; AVX512VLBW: # %bb.0:
228 ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
229 ; AVX512VLBW-NEXT: retq
231 ; XOP-LABEL: var_rotate_v4i32:
233 ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
236 ; X32-SSE-LABEL: var_rotate_v4i32:
238 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
239 ; X32-SSE-NEXT: pslld $23, %xmm1
240 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
241 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
242 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
243 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
244 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
245 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
246 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
247 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
248 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
249 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
250 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
251 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
252 ; X32-SSE-NEXT: por %xmm3, %xmm0
  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <4 x i32> %a, %b
  %lshr = lshr <4 x i32> %a, %b32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}
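
; <8 x i16> variable rotate: targets without 16-bit variable shifts build per-lane
; power-of-two multipliers and use pmullw/pmulhuw for the low and high halves.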
define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
262 ; SSE2-LABEL: var_rotate_v8i16:
264 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
265 ; SSE2-NEXT: pxor %xmm2, %xmm2
266 ; SSE2-NEXT: movdqa %xmm1, %xmm3
267 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
268 ; SSE2-NEXT: pslld $23, %xmm3
269 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
270 ; SSE2-NEXT: paddd %xmm4, %xmm3
271 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
272 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
273 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
274 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
275 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
276 ; SSE2-NEXT: pslld $23, %xmm1
277 ; SSE2-NEXT: paddd %xmm4, %xmm1
278 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
279 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
280 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
281 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
282 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
283 ; SSE2-NEXT: movdqa %xmm0, %xmm2
284 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2
285 ; SSE2-NEXT: pmullw %xmm1, %xmm0
286 ; SSE2-NEXT: por %xmm2, %xmm0
289 ; SSE41-LABEL: var_rotate_v8i16:
291 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
292 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
293 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
294 ; SSE41-NEXT: pslld $23, %xmm1
295 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
296 ; SSE41-NEXT: paddd %xmm3, %xmm1
297 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
298 ; SSE41-NEXT: pslld $23, %xmm2
299 ; SSE41-NEXT: paddd %xmm3, %xmm2
300 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
301 ; SSE41-NEXT: packusdw %xmm1, %xmm2
302 ; SSE41-NEXT: movdqa %xmm0, %xmm1
303 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1
304 ; SSE41-NEXT: pmullw %xmm2, %xmm0
305 ; SSE41-NEXT: por %xmm1, %xmm0
308 ; AVX1-LABEL: var_rotate_v8i16:
310 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
311 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
312 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
313 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
314 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
315 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
316 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
317 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
318 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
319 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
320 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
321 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
322 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
323 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
326 ; AVX2-LABEL: var_rotate_v8i16:
328 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
329 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
330 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
331 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
332 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
333 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
334 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
335 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
336 ; AVX2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
337 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
338 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
339 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
340 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
341 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
342 ; AVX2-NEXT: vzeroupper
345 ; AVX512F-LABEL: var_rotate_v8i16:
347 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
348 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
349 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
350 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
351 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
352 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
353 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
354 ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
355 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
356 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
357 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
358 ; AVX512F-NEXT: vzeroupper
361 ; AVX512VL-LABEL: var_rotate_v8i16:
363 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
364 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
365 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
366 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
367 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
368 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
369 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
370 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
371 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
372 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
373 ; AVX512VL-NEXT: vzeroupper
374 ; AVX512VL-NEXT: retq
376 ; AVX512BW-LABEL: var_rotate_v8i16:
378 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
379 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
380 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
381 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
382 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
383 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
384 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
385 ; AVX512BW-NEXT: vzeroupper
386 ; AVX512BW-NEXT: retq
388 ; AVX512VLBW-LABEL: var_rotate_v8i16:
389 ; AVX512VLBW: # %bb.0:
390 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
391 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
392 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
393 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
394 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
395 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
396 ; AVX512VLBW-NEXT: retq
398 ; XOP-LABEL: var_rotate_v8i16:
400 ; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
403 ; X32-SSE-LABEL: var_rotate_v8i16:
405 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
406 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
407 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
408 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
409 ; X32-SSE-NEXT: pslld $23, %xmm3
410 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
411 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
412 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
413 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
414 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
415 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
416 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
417 ; X32-SSE-NEXT: pslld $23, %xmm1
418 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
419 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
420 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
421 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
422 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
423 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
424 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
425 ; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2
426 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
427 ; X32-SSE-NEXT: por %xmm2, %xmm0
  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <8 x i16> %a, %b
  %lshr = lshr <8 x i16> %a, %b16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}
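
; <16 x i8> variable rotate: x86 has no 8-bit vector shifts, so non-XOP targets
; either widen the bytes or blend pre-shifted copies under pblendvb/pcmpgtb masks.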
define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
437 ; SSE2-LABEL: var_rotate_v16i8:
439 ; SSE2-NEXT: movdqa %xmm0, %xmm2
440 ; SSE2-NEXT: psllw $5, %xmm1
441 ; SSE2-NEXT: pxor %xmm0, %xmm0
442 ; SSE2-NEXT: pxor %xmm3, %xmm3
443 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
444 ; SSE2-NEXT: movdqa %xmm2, %xmm4
445 ; SSE2-NEXT: psrlw $4, %xmm4
446 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
447 ; SSE2-NEXT: movdqa %xmm2, %xmm5
448 ; SSE2-NEXT: psllw $4, %xmm5
449 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm5
450 ; SSE2-NEXT: por %xmm4, %xmm5
451 ; SSE2-NEXT: pand %xmm3, %xmm5
452 ; SSE2-NEXT: pandn %xmm2, %xmm3
453 ; SSE2-NEXT: por %xmm5, %xmm3
454 ; SSE2-NEXT: movdqa %xmm3, %xmm2
455 ; SSE2-NEXT: psrlw $6, %xmm2
456 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
457 ; SSE2-NEXT: movdqa %xmm3, %xmm4
458 ; SSE2-NEXT: psllw $2, %xmm4
459 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
460 ; SSE2-NEXT: por %xmm2, %xmm4
461 ; SSE2-NEXT: paddb %xmm1, %xmm1
462 ; SSE2-NEXT: pxor %xmm2, %xmm2
463 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
464 ; SSE2-NEXT: pand %xmm2, %xmm4
465 ; SSE2-NEXT: pandn %xmm3, %xmm2
466 ; SSE2-NEXT: por %xmm4, %xmm2
467 ; SSE2-NEXT: movdqa %xmm2, %xmm3
468 ; SSE2-NEXT: paddb %xmm2, %xmm3
469 ; SSE2-NEXT: movdqa %xmm2, %xmm4
470 ; SSE2-NEXT: psrlw $7, %xmm4
471 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
472 ; SSE2-NEXT: por %xmm3, %xmm4
473 ; SSE2-NEXT: paddb %xmm1, %xmm1
474 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
475 ; SSE2-NEXT: pand %xmm0, %xmm4
476 ; SSE2-NEXT: pandn %xmm2, %xmm0
477 ; SSE2-NEXT: por %xmm4, %xmm0
480 ; SSE41-LABEL: var_rotate_v16i8:
482 ; SSE41-NEXT: movdqa %xmm1, %xmm2
483 ; SSE41-NEXT: movdqa %xmm0, %xmm1
484 ; SSE41-NEXT: psrlw $4, %xmm0
485 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
486 ; SSE41-NEXT: movdqa %xmm1, %xmm3
487 ; SSE41-NEXT: psllw $4, %xmm3
488 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
489 ; SSE41-NEXT: por %xmm0, %xmm3
490 ; SSE41-NEXT: psllw $5, %xmm2
491 ; SSE41-NEXT: movdqa %xmm2, %xmm0
492 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
493 ; SSE41-NEXT: movdqa %xmm1, %xmm0
494 ; SSE41-NEXT: psrlw $6, %xmm0
495 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
496 ; SSE41-NEXT: movdqa %xmm1, %xmm3
497 ; SSE41-NEXT: psllw $2, %xmm3
498 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
499 ; SSE41-NEXT: por %xmm0, %xmm3
500 ; SSE41-NEXT: paddb %xmm2, %xmm2
501 ; SSE41-NEXT: movdqa %xmm2, %xmm0
502 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
503 ; SSE41-NEXT: movdqa %xmm1, %xmm0
504 ; SSE41-NEXT: paddb %xmm1, %xmm0
505 ; SSE41-NEXT: movdqa %xmm1, %xmm3
506 ; SSE41-NEXT: psrlw $7, %xmm3
507 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
508 ; SSE41-NEXT: por %xmm0, %xmm3
509 ; SSE41-NEXT: paddb %xmm2, %xmm2
510 ; SSE41-NEXT: movdqa %xmm2, %xmm0
511 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
512 ; SSE41-NEXT: movdqa %xmm1, %xmm0
515 ; AVX-LABEL: var_rotate_v16i8:
517 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
518 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
519 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
520 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
521 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
522 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
523 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
524 ; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
525 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
526 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
527 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
528 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
529 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
530 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
531 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
532 ; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
533 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
534 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
535 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
536 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
539 ; AVX512F-LABEL: var_rotate_v16i8:
541 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
542 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm2
543 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
544 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
545 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
546 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
547 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
548 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
549 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
550 ; AVX512F-NEXT: vzeroupper
553 ; AVX512VL-LABEL: var_rotate_v16i8:
555 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
556 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
557 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
558 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
559 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
560 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
561 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
562 ; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0
563 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
564 ; AVX512VL-NEXT: vzeroupper
565 ; AVX512VL-NEXT: retq
567 ; AVX512BW-LABEL: var_rotate_v16i8:
569 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
570 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
571 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
572 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
573 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
574 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
575 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
576 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
577 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
578 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
579 ; AVX512BW-NEXT: vzeroupper
580 ; AVX512BW-NEXT: retq
582 ; AVX512VLBW-LABEL: var_rotate_v16i8:
583 ; AVX512VLBW: # %bb.0:
584 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
585 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
586 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
587 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
588 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
589 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
590 ; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
591 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
592 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
593 ; AVX512VLBW-NEXT: vzeroupper
594 ; AVX512VLBW-NEXT: retq
596 ; XOP-LABEL: var_rotate_v16i8:
598 ; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
601 ; X32-SSE-LABEL: var_rotate_v16i8:
603 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
604 ; X32-SSE-NEXT: psllw $5, %xmm1
605 ; X32-SSE-NEXT: pxor %xmm0, %xmm0
606 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
607 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
608 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4
609 ; X32-SSE-NEXT: psrlw $4, %xmm4
610 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
611 ; X32-SSE-NEXT: movdqa %xmm2, %xmm5
612 ; X32-SSE-NEXT: psllw $4, %xmm5
613 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5
614 ; X32-SSE-NEXT: por %xmm4, %xmm5
615 ; X32-SSE-NEXT: pand %xmm3, %xmm5
616 ; X32-SSE-NEXT: pandn %xmm2, %xmm3
617 ; X32-SSE-NEXT: por %xmm5, %xmm3
618 ; X32-SSE-NEXT: movdqa %xmm3, %xmm2
619 ; X32-SSE-NEXT: psrlw $6, %xmm2
620 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
621 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
622 ; X32-SSE-NEXT: psllw $2, %xmm4
623 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
624 ; X32-SSE-NEXT: por %xmm2, %xmm4
625 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
626 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
627 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
628 ; X32-SSE-NEXT: pand %xmm2, %xmm4
629 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
630 ; X32-SSE-NEXT: por %xmm4, %xmm2
631 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
632 ; X32-SSE-NEXT: paddb %xmm2, %xmm3
633 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4
634 ; X32-SSE-NEXT: psrlw $7, %xmm4
635 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
636 ; X32-SSE-NEXT: por %xmm3, %xmm4
637 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
638 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm0
639 ; X32-SSE-NEXT: pand %xmm0, %xmm4
640 ; X32-SSE-NEXT: pandn %xmm2, %xmm0
641 ; X32-SSE-NEXT: por %xmm4, %xmm0
  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <16 x i8> %a, %b
  %lshr = lshr <16 x i8> %a, %b8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

; Uniform Variable Rotates
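; Every lane is rotated by the same amount: lane 0 of %b is splatted with a
; shufflevector before the usual shl/lshr/or expansion.
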
define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
655 ; SSE-LABEL: splatvar_rotate_v2i64:
657 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
658 ; SSE-NEXT: psubq %xmm1, %xmm2
659 ; SSE-NEXT: movdqa %xmm0, %xmm3
660 ; SSE-NEXT: psllq %xmm1, %xmm3
661 ; SSE-NEXT: psrlq %xmm2, %xmm0
662 ; SSE-NEXT: por %xmm3, %xmm0
665 ; AVX-LABEL: splatvar_rotate_v2i64:
667 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
668 ; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2
669 ; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1
670 ; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
671 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
674 ; AVX512F-LABEL: splatvar_rotate_v2i64:
676 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
677 ; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
678 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
679 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
680 ; AVX512F-NEXT: vzeroupper
683 ; AVX512VL-LABEL: splatvar_rotate_v2i64:
685 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
686 ; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
687 ; AVX512VL-NEXT: retq
689 ; AVX512BW-LABEL: splatvar_rotate_v2i64:
691 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
692 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1
693 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
694 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
695 ; AVX512BW-NEXT: vzeroupper
696 ; AVX512BW-NEXT: retq
698 ; AVX512VLBW-LABEL: splatvar_rotate_v2i64:
699 ; AVX512VLBW: # %bb.0:
700 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1
701 ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
702 ; AVX512VLBW-NEXT: retq
704 ; XOPAVX1-LABEL: splatvar_rotate_v2i64:
706 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
707 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
710 ; XOPAVX2-LABEL: splatvar_rotate_v2i64:
712 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
713 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
716 ; X32-SSE-LABEL: splatvar_rotate_v2i64:
718 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
719 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
720 ; X32-SSE-NEXT: psubq %xmm2, %xmm3
721 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
722 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
723 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
724 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
725 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
726 ; X32-SSE-NEXT: psrlq %xmm3, %xmm0
727 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
728 ; X32-SSE-NEXT: orpd %xmm2, %xmm0
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
  %shl = shl <2 x i64> %a, %splat
  %lshr = lshr <2 x i64> %a, %splat64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}
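
; With a uniform amount, <4 x i32> can use pslld/psrld with a single scalar count
; (or vprolvd/vprotd directly on AVX512VL/XOP).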
define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
739 ; SSE2-LABEL: splatvar_rotate_v4i32:
741 ; SSE2-NEXT: movd %xmm1, %eax
742 ; SSE2-NEXT: andl $31, %eax
743 ; SSE2-NEXT: movd %eax, %xmm1
744 ; SSE2-NEXT: movdqa %xmm0, %xmm2
745 ; SSE2-NEXT: pslld %xmm1, %xmm2
746 ; SSE2-NEXT: movl $32, %ecx
747 ; SSE2-NEXT: subl %eax, %ecx
748 ; SSE2-NEXT: movd %ecx, %xmm1
749 ; SSE2-NEXT: psrld %xmm1, %xmm0
750 ; SSE2-NEXT: por %xmm2, %xmm0
753 ; SSE41-LABEL: splatvar_rotate_v4i32:
755 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
756 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
757 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
758 ; SSE41-NEXT: movdqa %xmm0, %xmm3
759 ; SSE41-NEXT: pslld %xmm2, %xmm3
760 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
761 ; SSE41-NEXT: psubd %xmm1, %xmm2
762 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
763 ; SSE41-NEXT: psrld %xmm1, %xmm0
764 ; SSE41-NEXT: por %xmm3, %xmm0
767 ; AVX1-LABEL: splatvar_rotate_v4i32:
769 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
770 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
771 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
772 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
773 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
774 ; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
775 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
776 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
777 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
780 ; AVX2-LABEL: splatvar_rotate_v4i32:
782 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
783 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
784 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
785 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
786 ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
787 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
788 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
789 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
790 ; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
791 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
794 ; AVX512F-LABEL: splatvar_rotate_v4i32:
796 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
797 ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
798 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
799 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
800 ; AVX512F-NEXT: vzeroupper
803 ; AVX512VL-LABEL: splatvar_rotate_v4i32:
805 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
806 ; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
807 ; AVX512VL-NEXT: retq
809 ; AVX512BW-LABEL: splatvar_rotate_v4i32:
811 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
812 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
813 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
814 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
815 ; AVX512BW-NEXT: vzeroupper
816 ; AVX512BW-NEXT: retq
818 ; AVX512VLBW-LABEL: splatvar_rotate_v4i32:
819 ; AVX512VLBW: # %bb.0:
820 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1
821 ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
822 ; AVX512VLBW-NEXT: retq
824 ; XOPAVX1-LABEL: splatvar_rotate_v4i32:
826 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
827 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
830 ; XOPAVX2-LABEL: splatvar_rotate_v4i32:
832 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
833 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
836 ; X32-SSE-LABEL: splatvar_rotate_v4i32:
838 ; X32-SSE-NEXT: movd %xmm1, %eax
839 ; X32-SSE-NEXT: andl $31, %eax
840 ; X32-SSE-NEXT: movd %eax, %xmm1
841 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
842 ; X32-SSE-NEXT: pslld %xmm1, %xmm2
843 ; X32-SSE-NEXT: movl $32, %ecx
844 ; X32-SSE-NEXT: subl %eax, %ecx
845 ; X32-SSE-NEXT: movd %ecx, %xmm1
846 ; X32-SSE-NEXT: psrld %xmm1, %xmm0
847 ; X32-SSE-NEXT: por %xmm2, %xmm0
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <4 x i32> %a, %splat
  %lshr = lshr <4 x i32> %a, %splat32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}
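
; Uniform <8 x i16> rotate: the splatted amount feeds psllw/psrlw with a scalar count.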
define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
858 ; SSE2-LABEL: splatvar_rotate_v8i16:
860 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
861 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
862 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
863 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
864 ; SSE2-NEXT: psubw %xmm1, %xmm2
865 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
866 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
867 ; SSE2-NEXT: movdqa %xmm0, %xmm3
868 ; SSE2-NEXT: psllw %xmm1, %xmm3
869 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
870 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
871 ; SSE2-NEXT: psrlw %xmm2, %xmm0
872 ; SSE2-NEXT: por %xmm3, %xmm0
875 ; SSE41-LABEL: splatvar_rotate_v8i16:
877 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
878 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
879 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
880 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
881 ; SSE41-NEXT: movdqa %xmm0, %xmm3
882 ; SSE41-NEXT: psllw %xmm2, %xmm3
883 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
884 ; SSE41-NEXT: psubw %xmm1, %xmm2
885 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
886 ; SSE41-NEXT: psrlw %xmm1, %xmm0
887 ; SSE41-NEXT: por %xmm3, %xmm0
890 ; AVX1-LABEL: splatvar_rotate_v8i16:
892 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
893 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
894 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
895 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
896 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2
897 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
898 ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
899 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
900 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
901 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
904 ; AVX2-LABEL: splatvar_rotate_v8i16:
906 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
907 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
908 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
909 ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2
910 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
911 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
912 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
913 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
914 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
917 ; AVX512-LABEL: splatvar_rotate_v8i16:
919 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
920 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
921 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
922 ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2
923 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
924 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
925 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
926 ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
927 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
930 ; XOPAVX1-LABEL: splatvar_rotate_v8i16:
932 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
933 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
934 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
937 ; XOPAVX2-LABEL: splatvar_rotate_v8i16:
939 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
940 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
943 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
945 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
946 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
947 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
948 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
949 ; X32-SSE-NEXT: psubw %xmm1, %xmm2
950 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
951 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
952 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
953 ; X32-SSE-NEXT: psllw %xmm1, %xmm3
954 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
955 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
956 ; X32-SSE-NEXT: psrlw %xmm2, %xmm0
957 ; X32-SSE-NEXT: por %xmm3, %xmm0
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <8 x i16> %a, %splat
  %lshr = lshr <8 x i16> %a, %splat16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}
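
; Uniform <16 x i8> rotate: psllw/psrlw shift 16-bit lanes, so the results are
; masked to keep only the bits that belong to each byte.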
define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
968 ; SSE2-LABEL: splatvar_rotate_v16i8:
970 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
971 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
972 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
973 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
974 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
975 ; SSE2-NEXT: psubb %xmm1, %xmm2
976 ; SSE2-NEXT: movdqa %xmm1, %xmm3
977 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
978 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
979 ; SSE2-NEXT: movdqa %xmm0, %xmm1
980 ; SSE2-NEXT: psllw %xmm3, %xmm1
981 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
982 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
983 ; SSE2-NEXT: psllw %xmm3, %xmm5
984 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
985 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7]
986 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
987 ; SSE2-NEXT: pand %xmm3, %xmm1
988 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
989 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
990 ; SSE2-NEXT: psrlw %xmm2, %xmm0
991 ; SSE2-NEXT: psrlw %xmm2, %xmm4
992 ; SSE2-NEXT: psrlw $8, %xmm4
993 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
994 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7]
995 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
996 ; SSE2-NEXT: pand %xmm0, %xmm2
997 ; SSE2-NEXT: por %xmm2, %xmm1
998 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1001 ; SSE41-LABEL: splatvar_rotate_v16i8:
1003 ; SSE41-NEXT: pxor %xmm3, %xmm3
1004 ; SSE41-NEXT: pshufb %xmm3, %xmm1
1005 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
1006 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1007 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1008 ; SSE41-NEXT: psllw %xmm4, %xmm2
1009 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
1010 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1011 ; SSE41-NEXT: psllw %xmm4, %xmm6
1012 ; SSE41-NEXT: pshufb %xmm3, %xmm6
1013 ; SSE41-NEXT: pand %xmm6, %xmm2
1014 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1015 ; SSE41-NEXT: psubb %xmm1, %xmm3
1016 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1017 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1018 ; SSE41-NEXT: psrlw %xmm1, %xmm5
1019 ; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1020 ; SSE41-NEXT: pand %xmm0, %xmm5
1021 ; SSE41-NEXT: por %xmm5, %xmm2
1022 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1025 ; AVX1-LABEL: splatvar_rotate_v16i8:
1027 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1028 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1029 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1030 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1031 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4
1032 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1033 ; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3
1034 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1035 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1036 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1037 ; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1038 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1039 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1040 ; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
1041 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1042 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1043 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1046 ; AVX2-LABEL: splatvar_rotate_v16i8:
1048 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1049 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1050 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1051 ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3
1052 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
1053 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
1054 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1055 ; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2
1056 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1057 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1058 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1059 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1060 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
1061 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
1062 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1063 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1064 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
1067 ; AVX512F-LABEL: splatvar_rotate_v16i8:
1069 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
1070 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1071 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm2
1072 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1073 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1074 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
1075 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1076 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
1077 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
1078 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1079 ; AVX512F-NEXT: vzeroupper
1080 ; AVX512F-NEXT: retq
1082 ; AVX512VL-LABEL: splatvar_rotate_v16i8:
1083 ; AVX512VL: # %bb.0:
1084 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
1085 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1086 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
1087 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1088 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1089 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
1090 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1091 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
1092 ; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0
1093 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1094 ; AVX512VL-NEXT: vzeroupper
1095 ; AVX512VL-NEXT: retq
1097 ; AVX512BW-LABEL: splatvar_rotate_v16i8:
1098 ; AVX512BW: # %bb.0:
1099 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
1100 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1101 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
1102 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1103 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1104 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
1105 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1106 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
1107 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1108 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1109 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1110 ; AVX512BW-NEXT: vzeroupper
1111 ; AVX512BW-NEXT: retq
1113 ; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
1114 ; AVX512VLBW: # %bb.0:
1115 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
1116 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1117 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
1118 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1119 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1120 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
1121 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1122 ; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
1123 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1124 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1125 ; AVX512VLBW-NEXT: vzeroupper
1126 ; AVX512VLBW-NEXT: retq
1128 ; XOPAVX1-LABEL: splatvar_rotate_v16i8:
1130 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1131 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1132 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
1133 ; XOPAVX1-NEXT: retq
1135 ; XOPAVX2-LABEL: splatvar_rotate_v16i8:
1137 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1138 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
1139 ; XOPAVX2-NEXT: retq
1141 ; X32-SSE-LABEL: splatvar_rotate_v16i8:
1143 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1144 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
1145 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1146 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
1147 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1148 ; X32-SSE-NEXT: psubb %xmm1, %xmm2
1149 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
1150 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1151 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1152 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1153 ; X32-SSE-NEXT: psllw %xmm3, %xmm1
1154 ; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm4
1155 ; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5
1156 ; X32-SSE-NEXT: psllw %xmm3, %xmm5
1157 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1158 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7]
1159 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1160 ; X32-SSE-NEXT: pand %xmm3, %xmm1
1161 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
1162 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1163 ; X32-SSE-NEXT: psrlw %xmm2, %xmm0
1164 ; X32-SSE-NEXT: psrlw %xmm2, %xmm4
1165 ; X32-SSE-NEXT: psrlw $8, %xmm4
1166 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1167 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7]
1168 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1169 ; X32-SSE-NEXT: pand %xmm0, %xmm2
1170 ; X32-SSE-NEXT: por %xmm2, %xmm1
1171 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
1172 ; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <16 x i8> %a, %splat
  %lshr = lshr <16 x i8> %a, %splat8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}
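
; Constant Rotates
; The rotate amounts are immediate constants, so each lane's shift counts are known at compile time.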
define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
1186 ; SSE2-LABEL: constant_rotate_v2i64:
1188 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1189 ; SSE2-NEXT: psllq $4, %xmm1
1190 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1191 ; SSE2-NEXT: psllq $14, %xmm2
1192 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1193 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1194 ; SSE2-NEXT: psrlq $60, %xmm1
1195 ; SSE2-NEXT: psrlq $50, %xmm0
1196 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1197 ; SSE2-NEXT: orpd %xmm2, %xmm0
1200 ; SSE41-LABEL: constant_rotate_v2i64:
1202 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1203 ; SSE41-NEXT: psllq $14, %xmm1
1204 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1205 ; SSE41-NEXT: psllq $4, %xmm2
1206 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1207 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1208 ; SSE41-NEXT: psrlq $50, %xmm1
1209 ; SSE41-NEXT: psrlq $60, %xmm0
1210 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1211 ; SSE41-NEXT: por %xmm2, %xmm0
1214 ; AVX1-LABEL: constant_rotate_v2i64:
1216 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
1217 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2
1218 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1219 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2
1220 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
1221 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1222 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
1225 ; AVX2-LABEL: constant_rotate_v2i64:
1227 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
1228 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
1229 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
1232 ; AVX512F-LABEL: constant_rotate_v2i64:
1234 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1235 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
1236 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1237 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1238 ; AVX512F-NEXT: vzeroupper
1239 ; AVX512F-NEXT: retq
1241 ; AVX512VL-LABEL: constant_rotate_v2i64:
1242 ; AVX512VL: # %bb.0:
1243 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
1244 ; AVX512VL-NEXT: retq
1246 ; AVX512BW-LABEL: constant_rotate_v2i64:
1247 ; AVX512BW: # %bb.0:
1248 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1249 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
1250 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1251 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1252 ; AVX512BW-NEXT: vzeroupper
1253 ; AVX512BW-NEXT: retq
1255 ; AVX512VLBW-LABEL: constant_rotate_v2i64:
1256 ; AVX512VLBW: # %bb.0:
1257 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
1258 ; AVX512VLBW-NEXT: retq
1260 ; XOP-LABEL: constant_rotate_v2i64:
1262 ; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
1265 ; X32-SSE-LABEL: constant_rotate_v2i64:
1267 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1268 ; X32-SSE-NEXT: psllq $4, %xmm1
1269 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1270 ; X32-SSE-NEXT: psllq $14, %xmm2
1271 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1272 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1273 ; X32-SSE-NEXT: psrlq $60, %xmm1
1274 ; X32-SSE-NEXT: psrlq $50, %xmm0
1275 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1276 ; X32-SSE-NEXT: orpd %xmm2, %xmm0
1277 ; X32-SSE-NEXT: retl
1278 %shl = shl <2 x i64> %a, <i64 4, i64 14>
1279 %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
1280 %or = or <2 x i64> %shl, %lshr
ret <2 x i64> %or
}
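; Rotate the four i32 lanes left by 4, 5, 6 and 7. The SSE/AVX1 lowering
; multiplies by [16,32,64,128] (pmuludq) and stitches the odd/even results
; back together; AVX512 can use a variable vprolvd instead.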
1284 define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
1285 ; SSE2-LABEL: constant_rotate_v4i32:
1287 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
1288 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1289 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
1290 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1291 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1292 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
1293 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1294 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1295 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1296 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1297 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1298 ; SSE2-NEXT: por %xmm3, %xmm0
1301 ; SSE41-LABEL: constant_rotate_v4i32:
1303 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
1304 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1305 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1306 ; SSE41-NEXT: pmuludq %xmm2, %xmm3
1307 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
1308 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1309 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1310 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1311 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1312 ; SSE41-NEXT: por %xmm1, %xmm0
1315 ; AVX1-LABEL: constant_rotate_v4i32:
1317 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,64,128]
1318 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1319 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1320 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
1321 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1322 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1323 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1324 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1325 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1326 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1329 ; AVX2-LABEL: constant_rotate_v4i32:
1331 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
1332 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1333 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1336 ; AVX512F-LABEL: constant_rotate_v4i32:
1338 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1339 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1340 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1341 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1342 ; AVX512F-NEXT: vzeroupper
1343 ; AVX512F-NEXT: retq
1345 ; AVX512VL-LABEL: constant_rotate_v4i32:
1346 ; AVX512VL: # %bb.0:
1347 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
1348 ; AVX512VL-NEXT: retq
1350 ; AVX512BW-LABEL: constant_rotate_v4i32:
1351 ; AVX512BW: # %bb.0:
1352 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1353 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1354 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1355 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1356 ; AVX512BW-NEXT: vzeroupper
1357 ; AVX512BW-NEXT: retq
1359 ; AVX512VLBW-LABEL: constant_rotate_v4i32:
1360 ; AVX512VLBW: # %bb.0:
1361 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
1362 ; AVX512VLBW-NEXT: retq
1364 ; XOP-LABEL: constant_rotate_v4i32:
1366 ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1369 ; X32-SSE-LABEL: constant_rotate_v4i32:
1371 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
1372 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1373 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
1374 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1375 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1376 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
1377 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1378 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1379 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1380 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1381 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1382 ; X32-SSE-NEXT: por %xmm3, %xmm0
1383 ; X32-SSE-NEXT: retl
1384 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1385 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
1386 %or = or <4 x i32> %shl, %lshr
ret <4 x i32> %or
}
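; Rotate the eight i16 lanes left by 0..7. Without AVX512BW variable i16
; shifts, this is matched as a pmullw/pmulhuw pair against [1,2,4,...,128].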
1390 define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
1391 ; SSE-LABEL: constant_rotate_v8i16:
1393 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1394 ; SSE-NEXT: movdqa %xmm0, %xmm2
1395 ; SSE-NEXT: pmulhuw %xmm1, %xmm2
1396 ; SSE-NEXT: pmullw %xmm1, %xmm0
1397 ; SSE-NEXT: por %xmm2, %xmm0
1400 ; AVX-LABEL: constant_rotate_v8i16:
1402 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1403 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1404 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1405 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
1408 ; AVX512F-LABEL: constant_rotate_v8i16:
1410 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1411 ; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1412 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1413 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
1414 ; AVX512F-NEXT: retq
1416 ; AVX512VL-LABEL: constant_rotate_v8i16:
1417 ; AVX512VL: # %bb.0:
1418 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1419 ; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1420 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1421 ; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0
1422 ; AVX512VL-NEXT: retq
1424 ; AVX512BW-LABEL: constant_rotate_v8i16:
1425 ; AVX512BW: # %bb.0:
1426 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1427 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1428 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
1429 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
1430 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1431 ; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
1432 ; AVX512BW-NEXT: vzeroupper
1433 ; AVX512BW-NEXT: retq
1435 ; AVX512VLBW-LABEL: constant_rotate_v8i16:
1436 ; AVX512VLBW: # %bb.0:
1437 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
1438 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
1439 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1440 ; AVX512VLBW-NEXT: retq
1442 ; XOP-LABEL: constant_rotate_v8i16:
1444 ; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1447 ; X32-SSE-LABEL: constant_rotate_v8i16:
1449 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1450 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1451 ; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2
1452 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
1453 ; X32-SSE-NEXT: por %xmm2, %xmm0
1454 ; X32-SSE-NEXT: retl
1455 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1456 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
1457 %or = or <8 x i16> %shl, %lshr
ret <8 x i16> %or
}
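; Per-byte rotate amounts running 0..8 and back down to 1. x86 has no 8-bit
; vector shifts, so the bytes are unpacked or zero-extended to wider lanes,
; multiplied/shifted there, and packed back down.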
1461 define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
1462 ; SSE2-LABEL: constant_rotate_v16i8:
1464 ; SSE2-NEXT: pxor %xmm1, %xmm1
1465 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1466 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1467 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
1468 ; SSE2-NEXT: psrlw $8, %xmm2
1469 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1470 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1471 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
1472 ; SSE2-NEXT: psrlw $8, %xmm3
1473 ; SSE2-NEXT: packuswb %xmm2, %xmm3
1474 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1475 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1476 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
1477 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1478 ; SSE2-NEXT: pand %xmm2, %xmm1
1479 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1480 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
1481 ; SSE2-NEXT: pand %xmm2, %xmm0
1482 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1483 ; SSE2-NEXT: por %xmm3, %xmm0
1486 ; SSE41-LABEL: constant_rotate_v16i8:
1488 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1489 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1490 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
1491 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1492 ; SSE41-NEXT: pand %xmm3, %xmm2
1493 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1494 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1495 ; SSE41-NEXT: pmullw %xmm1, %xmm4
1496 ; SSE41-NEXT: pand %xmm3, %xmm4
1497 ; SSE41-NEXT: packuswb %xmm2, %xmm4
1498 ; SSE41-NEXT: pxor %xmm2, %xmm2
1499 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1500 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
1501 ; SSE41-NEXT: psrlw $8, %xmm0
1502 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
1503 ; SSE41-NEXT: psrlw $8, %xmm1
1504 ; SSE41-NEXT: packuswb %xmm0, %xmm1
1505 ; SSE41-NEXT: por %xmm4, %xmm1
1506 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1509 ; AVX1-LABEL: constant_rotate_v16i8:
1511 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1512 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
1513 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1514 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1515 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1516 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm4
1517 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1518 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1519 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1520 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1521 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
1522 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1523 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm2
1524 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1525 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1526 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
1529 ; AVX2-LABEL: constant_rotate_v16i8:
1531 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1532 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
1533 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1534 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1535 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1536 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1537 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1538 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1539 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1540 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1541 ; AVX2-NEXT: vzeroupper
1544 ; AVX512F-LABEL: constant_rotate_v16i8:
1546 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1547 ; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
1548 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
1549 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
1550 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1551 ; AVX512F-NEXT: vzeroupper
1552 ; AVX512F-NEXT: retq
1554 ; AVX512VL-LABEL: constant_rotate_v16i8:
1555 ; AVX512VL: # %bb.0:
1556 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1557 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
1558 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
1559 ; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0
1560 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1561 ; AVX512VL-NEXT: vzeroupper
1562 ; AVX512VL-NEXT: retq
1564 ; AVX512BW-LABEL: constant_rotate_v16i8:
1565 ; AVX512BW: # %bb.0:
1566 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1567 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1568 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
1569 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1570 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
1571 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1572 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1573 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1574 ; AVX512BW-NEXT: vzeroupper
1575 ; AVX512BW-NEXT: retq
1577 ; AVX512VLBW-LABEL: constant_rotate_v16i8:
1578 ; AVX512VLBW: # %bb.0:
1579 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1580 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
1581 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
1582 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1583 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1584 ; AVX512VLBW-NEXT: vzeroupper
1585 ; AVX512VLBW-NEXT: retq
1587 ; XOP-LABEL: constant_rotate_v16i8:
1589 ; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
1592 ; X32-SSE-LABEL: constant_rotate_v16i8:
1594 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
1595 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
1596 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1597 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
1598 ; X32-SSE-NEXT: psrlw $8, %xmm2
1599 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
1600 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1601 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
1602 ; X32-SSE-NEXT: psrlw $8, %xmm3
1603 ; X32-SSE-NEXT: packuswb %xmm2, %xmm3
1604 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1605 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1606 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm1
1607 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1608 ; X32-SSE-NEXT: pand %xmm2, %xmm1
1609 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1610 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1611 ; X32-SSE-NEXT: pand %xmm2, %xmm0
1612 ; X32-SSE-NEXT: packuswb %xmm1, %xmm0
1613 ; X32-SSE-NEXT: por %xmm3, %xmm0
1614 ; X32-SSE-NEXT: retl
1615 %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1616 %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1617 %or = or <16 x i8> %shl, %lshr
ret <16 x i8> %or
}
1622 ; Uniform Constant Rotates
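; Every lane is rotated by the same constant, so targets with native rotates
; (XOP vprot*, AVX512 vprolq/vprold) should select a single rotate instruction,
; while SSE/AVX fall back to a shift/shift/or sequence.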
1625 define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
1626 ; SSE-LABEL: splatconstant_rotate_v2i64:
1628 ; SSE-NEXT: movdqa %xmm0, %xmm1
1629 ; SSE-NEXT: psllq $14, %xmm1
1630 ; SSE-NEXT: psrlq $50, %xmm0
1631 ; SSE-NEXT: por %xmm1, %xmm0
1634 ; AVX-LABEL: splatconstant_rotate_v2i64:
1636 ; AVX-NEXT: vpsllq $14, %xmm0, %xmm1
1637 ; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0
1638 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
1641 ; AVX512F-LABEL: splatconstant_rotate_v2i64:
1643 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1644 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1645 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1646 ; AVX512F-NEXT: vzeroupper
1647 ; AVX512F-NEXT: retq
1649 ; AVX512VL-LABEL: splatconstant_rotate_v2i64:
1650 ; AVX512VL: # %bb.0:
1651 ; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0
1652 ; AVX512VL-NEXT: retq
1654 ; AVX512BW-LABEL: splatconstant_rotate_v2i64:
1655 ; AVX512BW: # %bb.0:
1656 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1657 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1658 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1659 ; AVX512BW-NEXT: vzeroupper
1660 ; AVX512BW-NEXT: retq
1662 ; AVX512VLBW-LABEL: splatconstant_rotate_v2i64:
1663 ; AVX512VLBW: # %bb.0:
1664 ; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0
1665 ; AVX512VLBW-NEXT: retq
1667 ; XOP-LABEL: splatconstant_rotate_v2i64:
1669 ; XOP-NEXT: vprotq $14, %xmm0, %xmm0
1672 ; X32-SSE-LABEL: splatconstant_rotate_v2i64:
1674 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1675 ; X32-SSE-NEXT: psllq $14, %xmm1
1676 ; X32-SSE-NEXT: psrlq $50, %xmm0
1677 ; X32-SSE-NEXT: por %xmm1, %xmm0
1678 ; X32-SSE-NEXT: retl
1679 %shl = shl <2 x i64> %a, <i64 14, i64 14>
1680 %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
1681 %or = or <2 x i64> %shl, %lshr
ret <2 x i64> %or
}
1685 define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
1686 ; SSE-LABEL: splatconstant_rotate_v4i32:
1688 ; SSE-NEXT: movdqa %xmm0, %xmm1
1689 ; SSE-NEXT: psrld $28, %xmm1
1690 ; SSE-NEXT: pslld $4, %xmm0
1691 ; SSE-NEXT: por %xmm1, %xmm0
1694 ; AVX-LABEL: splatconstant_rotate_v4i32:
1696 ; AVX-NEXT: vpsrld $28, %xmm0, %xmm1
1697 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0
1698 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1701 ; AVX512F-LABEL: splatconstant_rotate_v4i32:
1703 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1704 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1705 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1706 ; AVX512F-NEXT: vzeroupper
1707 ; AVX512F-NEXT: retq
1709 ; AVX512VL-LABEL: splatconstant_rotate_v4i32:
1710 ; AVX512VL: # %bb.0:
1711 ; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
1712 ; AVX512VL-NEXT: retq
1714 ; AVX512BW-LABEL: splatconstant_rotate_v4i32:
1715 ; AVX512BW: # %bb.0:
1716 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1717 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1718 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1719 ; AVX512BW-NEXT: vzeroupper
1720 ; AVX512BW-NEXT: retq
1722 ; AVX512VLBW-LABEL: splatconstant_rotate_v4i32:
1723 ; AVX512VLBW: # %bb.0:
1724 ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
1725 ; AVX512VLBW-NEXT: retq
1727 ; XOP-LABEL: splatconstant_rotate_v4i32:
1729 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0
1732 ; X32-SSE-LABEL: splatconstant_rotate_v4i32:
1734 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1735 ; X32-SSE-NEXT: psrld $28, %xmm1
1736 ; X32-SSE-NEXT: pslld $4, %xmm0
1737 ; X32-SSE-NEXT: por %xmm1, %xmm0
1738 ; X32-SSE-NEXT: retl
1739 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1740 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1741 %or = or <4 x i32> %shl, %lshr
ret <4 x i32> %or
}
1745 define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
1746 ; SSE-LABEL: splatconstant_rotate_v8i16:
1748 ; SSE-NEXT: movdqa %xmm0, %xmm1
1749 ; SSE-NEXT: psrlw $9, %xmm1
1750 ; SSE-NEXT: psllw $7, %xmm0
1751 ; SSE-NEXT: por %xmm1, %xmm0
1754 ; AVX-LABEL: splatconstant_rotate_v8i16:
1756 ; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1
1757 ; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
1758 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1761 ; AVX512-LABEL: splatconstant_rotate_v8i16:
1763 ; AVX512-NEXT: vpsrlw $9, %xmm0, %xmm1
1764 ; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
1765 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1768 ; XOP-LABEL: splatconstant_rotate_v8i16:
1770 ; XOP-NEXT: vprotw $7, %xmm0, %xmm0
1773 ; X32-SSE-LABEL: splatconstant_rotate_v8i16:
1775 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1776 ; X32-SSE-NEXT: psrlw $9, %xmm1
1777 ; X32-SSE-NEXT: psllw $7, %xmm0
1778 ; X32-SSE-NEXT: por %xmm1, %xmm0
1779 ; X32-SSE-NEXT: retl
1780 %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1781 %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1782 %or = or <8 x i16> %shl, %lshr
ret <8 x i16> %or
}
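; Uniform byte rotate by 4: with no 8-bit shifts available, the expansion
; shifts 16-bit words by 4 and masks off the bits that crossed byte
; boundaries, whereas XOP can use a single vprotb.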
1786 define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
1787 ; SSE-LABEL: splatconstant_rotate_v16i8:
1789 ; SSE-NEXT: movdqa %xmm0, %xmm1
1790 ; SSE-NEXT: psrlw $4, %xmm1
1791 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
1792 ; SSE-NEXT: psllw $4, %xmm0
1793 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1794 ; SSE-NEXT: por %xmm1, %xmm0
1797 ; AVX-LABEL: splatconstant_rotate_v16i8:
1799 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
1800 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1801 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
1802 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1803 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1806 ; AVX512-LABEL: splatconstant_rotate_v16i8:
1808 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
1809 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1810 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
1811 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1812 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
1815 ; XOP-LABEL: splatconstant_rotate_v16i8:
1817 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
1820 ; X32-SSE-LABEL: splatconstant_rotate_v16i8:
1822 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1823 ; X32-SSE-NEXT: psrlw $4, %xmm1
1824 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
1825 ; X32-SSE-NEXT: psllw $4, %xmm0
1826 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1827 ; X32-SSE-NEXT: por %xmm1, %xmm0
1828 ; X32-SSE-NEXT: retl
1829 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1830 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1831 %or = or <16 x i8> %shl, %lshr
ret <16 x i8> %or
}
1836 ; Masked Uniform Constant Rotates
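; The rotated halves are ANDed with constant masks before the final OR; the
; rotate should still be formed and the masks combined where possible, e.g.
; XOP/AVX512VL emit vprot*/vprol* followed by a single vpand.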
1839 define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
1840 ; SSE-LABEL: splatconstant_rotate_mask_v2i64:
1842 ; SSE-NEXT: psrlq $49, %xmm0
1843 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1846 ; AVX-LABEL: splatconstant_rotate_mask_v2i64:
1848 ; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0
1849 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1852 ; AVX512F-LABEL: splatconstant_rotate_mask_v2i64:
1854 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1855 ; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
1856 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1857 ; AVX512F-NEXT: vzeroupper
1858 ; AVX512F-NEXT: retq
1860 ; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
1861 ; AVX512VL: # %bb.0:
1862 ; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0
1863 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1864 ; AVX512VL-NEXT: retq
1866 ; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
1867 ; AVX512BW: # %bb.0:
1868 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1869 ; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
1870 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1871 ; AVX512BW-NEXT: vzeroupper
1872 ; AVX512BW-NEXT: retq
1874 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64:
1875 ; AVX512VLBW: # %bb.0:
1876 ; AVX512VLBW-NEXT: vprolq $15, %xmm0, %xmm0
1877 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1878 ; AVX512VLBW-NEXT: retq
1880 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
1882 ; XOP-NEXT: vprotq $15, %xmm0, %xmm0
1883 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1886 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
1888 ; X32-SSE-NEXT: psrlq $49, %xmm0
1889 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1890 ; X32-SSE-NEXT: retl
1891 %shl = shl <2 x i64> %a, <i64 15, i64 15>
1892 %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
1893 %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
1894 %lmask = and <2 x i64> %shl, <i64 65, i64 33>
1895 %or = or <2 x i64> %lmask, %rmask
ret <2 x i64> %or
}
1899 define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
1900 ; SSE-LABEL: splatconstant_rotate_mask_v4i32:
1902 ; SSE-NEXT: movdqa %xmm0, %xmm1
1903 ; SSE-NEXT: psrld $28, %xmm1
1904 ; SSE-NEXT: pslld $4, %xmm0
1905 ; SSE-NEXT: por %xmm1, %xmm0
1906 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1909 ; AVX-LABEL: splatconstant_rotate_mask_v4i32:
1911 ; AVX-NEXT: vpsrld $28, %xmm0, %xmm1
1912 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0
1913 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1914 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1917 ; AVX512F-LABEL: splatconstant_rotate_mask_v4i32:
1919 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1920 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1921 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1922 ; AVX512F-NEXT: vzeroupper
1923 ; AVX512F-NEXT: retq
1925 ; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
1926 ; AVX512VL: # %bb.0:
1927 ; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
1928 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1929 ; AVX512VL-NEXT: retq
1931 ; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
1932 ; AVX512BW: # %bb.0:
1933 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1934 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1935 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1936 ; AVX512BW-NEXT: vzeroupper
1937 ; AVX512BW-NEXT: retq
1939 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32:
1940 ; AVX512VLBW: # %bb.0:
1941 ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
1942 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1943 ; AVX512VLBW-NEXT: retq
1945 ; XOP-LABEL: splatconstant_rotate_mask_v4i32:
1947 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0
1948 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1951 ; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
1953 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
1954 ; X32-SSE-NEXT: psrld $28, %xmm1
1955 ; X32-SSE-NEXT: pslld $4, %xmm0
1956 ; X32-SSE-NEXT: por %xmm1, %xmm0
1957 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1958 ; X32-SSE-NEXT: retl
1959 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1960 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1961 %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
1962 %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
1963 %or = or <4 x i32> %lmask, %rmask
ret <4 x i32> %or
}
1967 define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
1968 ; SSE-LABEL: splatconstant_rotate_mask_v8i16:
1970 ; SSE-NEXT: movdqa %xmm0, %xmm1
1971 ; SSE-NEXT: psrlw $11, %xmm1
1972 ; SSE-NEXT: psllw $5, %xmm0
1973 ; SSE-NEXT: por %xmm1, %xmm0
1974 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1977 ; AVX-LABEL: splatconstant_rotate_mask_v8i16:
1979 ; AVX-NEXT: vpsrlw $11, %xmm0, %xmm1
1980 ; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
1981 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1982 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1985 ; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
1987 ; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1
1988 ; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
1989 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1990 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1993 ; XOP-LABEL: splatconstant_rotate_mask_v8i16:
1995 ; XOP-NEXT: vprotw $5, %xmm0, %xmm0
1996 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1999 ; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
2001 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
2002 ; X32-SSE-NEXT: psrlw $11, %xmm1
2003 ; X32-SSE-NEXT: psllw $5, %xmm0
2004 ; X32-SSE-NEXT: por %xmm1, %xmm0
2005 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
2006 ; X32-SSE-NEXT: retl
2007 %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
2008 %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
2009 %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
2010 %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
2011 %or = or <8 x i16> %lmask, %rmask
ret <8 x i16> %or
}
2015 define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
2016 ; SSE-LABEL: splatconstant_rotate_mask_v16i8:
2018 ; SSE-NEXT: movdqa %xmm0, %xmm1
2019 ; SSE-NEXT: psrlw $4, %xmm1
2020 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
2021 ; SSE-NEXT: psllw $4, %xmm0
2022 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
2023 ; SSE-NEXT: por %xmm1, %xmm0
2024 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
2027 ; AVX-LABEL: splatconstant_rotate_mask_v16i8:
2029 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
2030 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2031 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2032 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2033 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2034 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2037 ; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
2039 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
2040 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
2041 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2042 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2043 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
2046 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
2048 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
2049 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2052 ; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
2054 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
2055 ; X32-SSE-NEXT: psrlw $4, %xmm1
2056 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
2057 ; X32-SSE-NEXT: psllw $4, %xmm0
2058 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
2059 ; X32-SSE-NEXT: por %xmm1, %xmm0
2060 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
2061 ; X32-SSE-NEXT: retl
2062 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2063 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2064 %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
2065 %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
2066 %or = or <16 x i8> %lmask, %rmask
ret <16 x i8> %or
}