1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
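;
; Each test below expresses a rotate as two shifts and an or, i.e. for an
; element width of w bits: rot(a, b) = (a shl b) | (a lshr (w - b)). The
; checks verify how each target lowers that pattern, using native rotate
; instructions where they exist (vprolvq/vprolvd on AVX512, vprot* on XOP)
; and shift/multiply/blend expansions elsewhere.
;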
15 define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
16 ; AVX1-LABEL: var_rotate_v4i64:
18 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
19 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
20 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
21 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
22 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
23 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
24 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
25 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
26 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
27 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
28 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
29 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
30 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
31 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
32 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
36 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
37 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
38 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
39 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
40 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
41 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
44 ; AVX2-LABEL: var_rotate_v4i64:
46 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
47 ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
48 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
49 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
50 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
53 ; AVX512F-LABEL: var_rotate_v4i64:
55 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
56 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
57 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
58 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
61 ; AVX512VL-LABEL: var_rotate_v4i64:
63 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
66 ; AVX512BW-LABEL: var_rotate_v4i64:
68 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
69 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
70 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
71 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
74 ; AVX512VLBW-LABEL: var_rotate_v4i64:
75 ; AVX512VLBW: # %bb.0:
76 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
77 ; AVX512VLBW-NEXT: retq
79 ; XOPAVX1-LABEL: var_rotate_v4i64:
81 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
82 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
83 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
84 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
85 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
88 ; XOPAVX2-LABEL: var_rotate_v4i64:
90 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
91 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
92 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
93 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
94 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
96 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
97 %shl = shl <4 x i64> %a, %b
98 %lshr = lshr <4 x i64> %a, %b64
99 %or = or <4 x i64> %shl, %lshr
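; AVX1 has no per-element variable 32-bit shifts, so the v8i32 lowering below
; materialises 2^amt in each lane (vpslld $23, add the float exponent bias,
; vcvttps2dq) and multiplies with vpmuludq; the low halves of the 64-bit
; products give the left-shifted part and the high halves the right-shifted
; part.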
103 define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
104 ; AVX1-LABEL: var_rotate_v8i32:
106 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
107 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
108 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
109 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
110 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
111 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
112 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
113 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
114 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
115 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
116 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
117 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
118 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
119 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
120 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
121 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
122 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
123 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
124 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
125 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
126 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
127 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
128 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
129 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
130 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
131 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
132 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
133 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
134 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
135 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
136 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
139 ; AVX2-LABEL: var_rotate_v8i32:
141 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
142 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
143 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
144 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
145 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
146 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
147 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
150 ; AVX512F-LABEL: var_rotate_v8i32:
152 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
153 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
154 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
155 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
158 ; AVX512VL-LABEL: var_rotate_v8i32:
160 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
161 ; AVX512VL-NEXT: retq
163 ; AVX512BW-LABEL: var_rotate_v8i32:
165 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
166 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
167 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
168 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
169 ; AVX512BW-NEXT: retq
171 ; AVX512VLBW-LABEL: var_rotate_v8i32:
172 ; AVX512VLBW: # %bb.0:
173 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
174 ; AVX512VLBW-NEXT: retq
176 ; XOPAVX1-LABEL: var_rotate_v8i32:
178 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
179 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
180 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
181 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
182 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
185 ; XOPAVX2-LABEL: var_rotate_v8i32:
187 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
188 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
189 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
190 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
191 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
193 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
194 %shl = shl <8 x i32> %a, %b
195 %lshr = lshr <8 x i32> %a, %b32
196 %or = or <8 x i32> %shl, %lshr
200 define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
201 ; AVX1-LABEL: var_rotate_v16i16:
203 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
204 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
205 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
206 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
207 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
208 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
209 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
210 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
211 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
212 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
213 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
214 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
215 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
216 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
217 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
218 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
219 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
220 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
221 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
222 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
223 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
224 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
225 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
226 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
227 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
228 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
229 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
230 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
231 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
232 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
233 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
236 ; AVX2-LABEL: var_rotate_v16i16:
238 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
239 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
240 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
241 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
242 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
243 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
244 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
245 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
246 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
247 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
248 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
249 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
250 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
251 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
252 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
253 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
254 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
255 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
256 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
257 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
258 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
261 ; AVX512F-LABEL: var_rotate_v16i16:
263 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
264 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
265 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
266 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
267 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
268 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
269 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
270 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
271 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
272 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
275 ; AVX512VL-LABEL: var_rotate_v16i16:
277 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
278 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
279 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
280 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
281 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
282 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
283 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
284 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
285 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
286 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
287 ; AVX512VL-NEXT: retq
289 ; AVX512BW-LABEL: var_rotate_v16i16:
291 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
292 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
293 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
294 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
295 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
296 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
297 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
298 ; AVX512BW-NEXT: retq
300 ; AVX512VLBW-LABEL: var_rotate_v16i16:
301 ; AVX512VLBW: # %bb.0:
302 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
303 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
304 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
305 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
306 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
307 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
308 ; AVX512VLBW-NEXT: retq
310 ; XOPAVX1-LABEL: var_rotate_v16i16:
312 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
313 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
314 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
315 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
316 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
319 ; XOPAVX2-LABEL: var_rotate_v16i16:
321 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
322 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
323 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
324 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
325 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
327 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
328 %shl = shl <16 x i16> %a, %b
329 %lshr = lshr <16 x i16> %a, %b16
330 %or = or <16 x i16> %shl, %lshr
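; There are no byte-sized vector shifts, so the v32i8 expansions below build
; shift-by-4, -2 and -1 results from word-sized shifts (masking the bits that
; cross byte boundaries) and select between them with vpblendvb, driven by the
; rotate-amount bits moved into the sign-bit position with vpsllw $5.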
334 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
335 ; AVX1-LABEL: var_rotate_v32i8:
337 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
338 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
339 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
340 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
341 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
342 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
343 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
344 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
345 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
346 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
347 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
348 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
349 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
350 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
351 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
352 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
353 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
354 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
355 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
356 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
357 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
358 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
359 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
360 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
361 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
362 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
363 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
364 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
365 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
366 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
367 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
368 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
369 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
370 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
371 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
372 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
373 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
374 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
375 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
376 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
377 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
378 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
379 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
380 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
381 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
382 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
385 ; AVX2-LABEL: var_rotate_v32i8:
387 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
388 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
389 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
390 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
391 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
392 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
393 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
394 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
395 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
396 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
397 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
398 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
399 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
400 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
401 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
402 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
403 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
404 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
405 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
406 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
409 ; AVX512F-LABEL: var_rotate_v32i8:
411 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
412 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
413 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
414 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
415 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
416 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
417 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
418 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
419 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
420 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
421 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
422 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
423 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
424 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
425 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
426 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
427 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
428 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
429 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
430 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
433 ; AVX512VL-LABEL: var_rotate_v32i8:
435 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
436 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
437 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
438 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
439 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
440 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
441 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
442 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
443 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
444 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
445 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
446 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
447 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
448 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
449 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
450 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
451 ; AVX512VL-NEXT: retq
453 ; AVX512BW-LABEL: var_rotate_v32i8:
455 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
456 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
457 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
458 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
459 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
460 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
461 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
462 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
463 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
464 ; AVX512BW-NEXT: retq
466 ; AVX512VLBW-LABEL: var_rotate_v32i8:
467 ; AVX512VLBW: # %bb.0:
468 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
469 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
470 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
471 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
472 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
473 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
474 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
475 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
476 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
477 ; AVX512VLBW-NEXT: retq
479 ; XOPAVX1-LABEL: var_rotate_v32i8:
481 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
482 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
483 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
484 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
485 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
488 ; XOPAVX2-LABEL: var_rotate_v32i8:
490 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
491 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
492 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
493 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
494 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
496 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
497 %shl = shl <32 x i8> %a, %b
498 %lshr = lshr <32 x i8> %a, %b8
499 %or = or <32 x i8> %shl, %lshr
504 ; Uniform Variable Rotates
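; The rotate amount is splatted from element 0 of %b, so every lane rotates by
; the same runtime amount and most expansions can use the single-count (xmm)
; forms of the shifts (e.g. vpsllq/vpsrlq) instead of per-lane variable shifts.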
507 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
508 ; AVX1-LABEL: splatvar_rotate_v4i64:
510 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
511 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
512 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
513 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
514 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
515 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
516 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
517 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
518 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
519 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
520 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
523 ; AVX2-LABEL: splatvar_rotate_v4i64:
525 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
526 ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
527 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm1
528 ; AVX2-NEXT: vpsrlq %xmm2, %ymm0, %ymm0
529 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
532 ; AVX512F-LABEL: splatvar_rotate_v4i64:
534 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
535 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
536 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
537 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
540 ; AVX512VL-LABEL: splatvar_rotate_v4i64:
542 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
543 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
544 ; AVX512VL-NEXT: retq
546 ; AVX512BW-LABEL: splatvar_rotate_v4i64:
548 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
549 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
550 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
551 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
552 ; AVX512BW-NEXT: retq
554 ; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
555 ; AVX512VLBW: # %bb.0:
556 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
557 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
558 ; AVX512VLBW-NEXT: retq
560 ; XOPAVX1-LABEL: splatvar_rotate_v4i64:
562 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
563 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
564 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
565 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
566 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
569 ; XOPAVX2-LABEL: splatvar_rotate_v4i64:
571 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1
572 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
573 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
574 ; XOPAVX2-NEXT: vprotq %xmm3, %xmm2, %xmm2
575 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
576 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
578 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
579 %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
580 %shl = shl <4 x i64> %a, %splat
581 %lshr = lshr <4 x i64> %a, %splat64
582 %or = or <4 x i64> %shl, %lshr
586 define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
587 ; AVX1-LABEL: splatvar_rotate_v8i32:
589 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
590 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
591 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
592 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
593 ; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
594 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
595 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
596 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
597 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
598 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
599 ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
600 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
601 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
602 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
605 ; AVX2-LABEL: splatvar_rotate_v8i32:
607 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
608 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
609 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
610 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
611 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
612 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
613 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
614 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
615 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
616 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
619 ; AVX512F-LABEL: splatvar_rotate_v8i32:
621 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
622 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
623 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
624 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
627 ; AVX512VL-LABEL: splatvar_rotate_v8i32:
629 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
630 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
631 ; AVX512VL-NEXT: retq
633 ; AVX512BW-LABEL: splatvar_rotate_v8i32:
635 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
636 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
637 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
638 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
639 ; AVX512BW-NEXT: retq
641 ; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
642 ; AVX512VLBW: # %bb.0:
643 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
644 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
645 ; AVX512VLBW-NEXT: retq
647 ; XOPAVX1-LABEL: splatvar_rotate_v8i32:
649 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
650 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
651 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
652 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
653 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
656 ; XOPAVX2-LABEL: splatvar_rotate_v8i32:
658 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
659 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
660 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
661 ; XOPAVX2-NEXT: vprotd %xmm3, %xmm2, %xmm2
662 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
663 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
665 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
666 %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
667 %shl = shl <8 x i32> %a, %splat
668 %lshr = lshr <8 x i32> %a, %splat32
669 %or = or <8 x i32> %shl, %lshr
673 define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
674 ; AVX1-LABEL: splatvar_rotate_v16i16:
676 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
677 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
678 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
679 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
680 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
681 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
682 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
683 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
684 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
685 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
686 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
687 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
688 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
689 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
690 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
693 ; AVX2-LABEL: splatvar_rotate_v16i16:
695 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
696 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
697 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
698 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
699 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
700 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
701 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
702 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
703 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
706 ; AVX512-LABEL: splatvar_rotate_v16i16:
708 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
709 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
710 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
711 ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
712 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
713 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
714 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
715 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
716 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
719 ; XOPAVX1-LABEL: splatvar_rotate_v16i16:
721 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
722 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
723 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
724 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
725 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
726 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
729 ; XOPAVX2-LABEL: splatvar_rotate_v16i16:
731 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1
732 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
733 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
734 ; XOPAVX2-NEXT: vprotw %xmm3, %xmm2, %xmm2
735 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
736 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
738 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
739 %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
740 %shl = shl <16 x i16> %a, %splat
741 %lshr = lshr <16 x i16> %a, %splat16
742 %or = or <16 x i16> %shl, %lshr
746 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
747 ; AVX1-LABEL: splatvar_rotate_v32i8:
749 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
750 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
751 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
752 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
753 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
754 ; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5
755 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
756 ; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7
757 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
758 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
759 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
760 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
761 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
762 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
763 ; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6
764 ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
765 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
766 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
767 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
768 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
769 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
770 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
771 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
772 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
775 ; AVX2-LABEL: splatvar_rotate_v32i8:
777 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
778 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
779 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
780 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
781 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
782 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
783 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
784 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
785 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
786 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
787 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
788 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
789 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
790 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
791 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
792 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
793 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
796 ; AVX512F-LABEL: splatvar_rotate_v32i8:
798 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
799 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
800 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
801 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
802 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
803 ; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
804 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
805 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
806 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
807 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
808 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
809 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
810 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
811 ; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
812 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
813 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
814 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
817 ; AVX512VL-LABEL: splatvar_rotate_v32i8:
819 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
820 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
821 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
822 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
823 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
824 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
825 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
826 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
827 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
828 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
829 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
830 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
831 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
832 ; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
833 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
834 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
835 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
836 ; AVX512VL-NEXT: retq
838 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
840 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
841 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
842 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
843 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
844 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
845 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
846 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
847 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
848 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
849 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
850 ; AVX512BW-NEXT: retq
852 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
853 ; AVX512VLBW: # %bb.0:
854 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
855 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
856 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
857 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
858 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
859 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
860 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
861 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
862 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
863 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
864 ; AVX512VLBW-NEXT: retq
866 ; XOPAVX1-LABEL: splatvar_rotate_v32i8:
868 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
869 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
870 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
871 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
872 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
873 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
876 ; XOPAVX2-LABEL: splatvar_rotate_v32i8:
878 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
879 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
880 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
881 ; XOPAVX2-NEXT: vprotb %xmm3, %xmm2, %xmm2
882 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
883 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
885 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
886 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
887 %shl = shl <32 x i8> %a, %splat
888 %lshr = lshr <32 x i8> %a, %splat8
889 %or = or <32 x i8> %shl, %lshr
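;
; The remaining tests rotate by per-element constant amounts; the shl and lshr
; amounts are complementary constants (b and w - b), so targets can fold them
; into immediate shifts, vprolv* with a constant vector, or constant multiplies
; (vpmullw/vpmulhuw).
;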
897 define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
898 ; AVX1-LABEL: constant_rotate_v4i64:
900 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
901 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
902 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
903 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
904 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
905 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4
906 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
907 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
908 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
909 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1
910 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
911 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
912 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
913 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
914 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
915 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
918 ; AVX2-LABEL: constant_rotate_v4i64:
920 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
921 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
922 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
925 ; AVX512F-LABEL: constant_rotate_v4i64:
927 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
928 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
929 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
930 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
933 ; AVX512VL-LABEL: constant_rotate_v4i64:
935 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
936 ; AVX512VL-NEXT: retq
938 ; AVX512BW-LABEL: constant_rotate_v4i64:
940 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
941 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
942 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
943 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
944 ; AVX512BW-NEXT: retq
946 ; AVX512VLBW-LABEL: constant_rotate_v4i64:
947 ; AVX512VLBW: # %bb.0:
948 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
949 ; AVX512VLBW-NEXT: retq
951 ; XOPAVX1-LABEL: constant_rotate_v4i64:
953 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
954 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
955 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
956 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
959 ; XOPAVX2-LABEL: constant_rotate_v4i64:
961 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
962 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
963 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
964 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
966 %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
967 %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
968 %or = or <4 x i64> %shl, %lshr
972 define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
973 ; AVX1-LABEL: constant_rotate_v8i32:
975 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
976 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
977 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
978 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
979 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
980 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
981 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
982 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
983 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
984 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
985 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
986 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
987 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
988 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
989 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
990 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
991 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
992 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
993 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
994 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
995 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
996 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
999 ; AVX2-LABEL: constant_rotate_v8i32:
1001 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1002 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1003 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1006 ; AVX512F-LABEL: constant_rotate_v8i32:
1008 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1009 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1010 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1011 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1012 ; AVX512F-NEXT: retq
1014 ; AVX512VL-LABEL: constant_rotate_v8i32:
1015 ; AVX512VL: # %bb.0:
1016 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1017 ; AVX512VL-NEXT: retq
1019 ; AVX512BW-LABEL: constant_rotate_v8i32:
1020 ; AVX512BW: # %bb.0:
1021 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1022 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1023 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1024 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1025 ; AVX512BW-NEXT: retq
1027 ; AVX512VLBW-LABEL: constant_rotate_v8i32:
1028 ; AVX512VLBW: # %bb.0:
1029 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1030 ; AVX512VLBW-NEXT: retq
1032 ; XOPAVX1-LABEL: constant_rotate_v8i32:
1034 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1035 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1036 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1037 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1038 ; XOPAVX1-NEXT: retq
1040 ; XOPAVX2-LABEL: constant_rotate_v8i32:
1042 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1043 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1044 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1045 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1046 ; XOPAVX2-NEXT: retq
1047 %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1048 %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
1049 %or = or <8 x i32> %shl, %lshr
1053 define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
1054 ; AVX1-LABEL: constant_rotate_v16i16:
1056 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1057 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1058 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1059 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1060 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1061 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1062 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1063 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1064 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1065 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1068 ; AVX2-LABEL: constant_rotate_v16i16:
1070 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1071 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1072 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1073 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1076 ; AVX512F-LABEL: constant_rotate_v16i16:
1078 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1079 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1080 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1081 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1082 ; AVX512F-NEXT: retq
1084 ; AVX512VL-LABEL: constant_rotate_v16i16:
1085 ; AVX512VL: # %bb.0:
1086 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1087 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1088 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1089 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1090 ; AVX512VL-NEXT: retq
1092 ; AVX512BW-LABEL: constant_rotate_v16i16:
1093 ; AVX512BW: # %bb.0:
1094 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1095 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1096 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1097 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
1098 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1099 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
1100 ; AVX512BW-NEXT: retq
1102 ; AVX512VLBW-LABEL: constant_rotate_v16i16:
1103 ; AVX512VLBW: # %bb.0:
1104 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1105 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1106 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1107 ; AVX512VLBW-NEXT: retq
1109 ; XOPAVX1-LABEL: constant_rotate_v16i16:
1111 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1112 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1113 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1114 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1115 ; XOPAVX1-NEXT: retq
1117 ; XOPAVX2-LABEL: constant_rotate_v16i16:
1119 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1120 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1121 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1122 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1123 ; XOPAVX2-NEXT: retq
1124 %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1125 %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
1126 %or = or <16 x i16> %shl, %lshr
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
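; constant_rotate_v32i8 exercises non-uniform per-byte rotate amounts. None of the plain
; AVX/AVX512 subsets above have a vector byte rotate, so AVX1/AVX2/AVX512F/AVX512VL expand
; the byte shifts through word-wide vpmullw/vpsllw plus vpblendvb merges, while
; AVX512BW/AVX512VLBW zero-extend to words, use vpsllvw/vpsrlvw, and truncate with vpmovwb;
; only XOP gets a single vprotb per 128-bit lane. A rotate-left by these amounts can also
; be expressed with the funnel-shift intrinsic, e.g. (illustrative only, not part of the
; checked input):
;   %r = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 0, i8 1, ...>)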
;
; Uniform Constant Rotates
;
define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
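; splatconstant_rotate_v4i64 is a splat rotate-left by 14, spelled as
; (a << 14) | (a lshr 50) since 14 + 50 == 64. AVX512F/AVX512BW rotate in a 512-bit
; register with vprolq and the VL variants use the 256-bit form directly, AVX1/AVX2 keep
; the shift/shift/or sequence, and XOP applies vprotq $14 to each 128-bit half. The
; equivalent intrinsic form would be (illustrative only):
;   %r = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)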
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
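; splatconstant_rotate_v8i32 is the same pattern for 32-bit elements, a rotate-left by 4
; (4 + 28 == 32): AVX512 configurations select vprold, AVX1/AVX2 fall back to
; vpslld/vpsrld/vpor, and XOP uses vprotd $4 per 128-bit half.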
define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
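; splatconstant_rotate_v16i16 rotates 16-bit elements left by 7 (7 + 9 == 16). AVX-512
; only provides dword/qword rotates (vprold/vprolq), so every AVX512 configuration shares
; the vpsllw/vpsrlw/vpor expansion with AVX2 here; only XOP (vprotw $7) has a native word
; rotate per 128-bit half.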
define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
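; splatconstant_rotate_v32i8 rotates bytes left by 4. The byte shifts are emulated with
; word shifts plus 0xF0/0x0F masks; AVX512VL/AVX512VLBW fold the masking and the final or
; into a single vpternlogq, and XOP again needs only vprotb $4.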
;
; Masked Uniform Constant Rotates
;
define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}
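; splatconstant_rotate_mask_v4i64 masks the two halves of the rotate separately. Because
; the left-shifted half has its low 15 bits clear and every mask constant here fits in
; those 15 bits, the lmask side folds to zero: AVX1/AVX2 keep only the masked right shift,
; while the AVX512 and XOP targets emit the rotate followed by a single vpand.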
define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}
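; splatconstant_rotate_mask_v8i32: the shifted halves occupy disjoint bit ranges (shl by 4
; clears bits 0-3, lshr by 28 produces only bits 0-3), so the two masks can be merged and
; applied with one vpand after the rotate, or after the shift/or expansion on targets
; without vprold.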
define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}
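; splatconstant_rotate_mask_v16i16 has the same structure for words (rotate by 5, masks
; 33 and 55): every target forms the rotate first (vprotw on XOP, vpsllw/vpsrlw/vpor
; elsewhere) and then applies the combined mask with a single vpand/vandps.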
define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
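; splatconstant_rotate_mask_v32i8 needs the heaviest masking: the 4-bit byte shifts already
; require 0xF0/0x0F masks and the rmask/lmask constants are applied on top, so the
; AVX2/AVX512F checks above end in three vpands, AVX512VL folds the shift masking into
; vpternlogq before the final vpand, and XOP needs only vprotb plus one and.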
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask