; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
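
; Each test below builds a rotate as (shl %a, %amt) | (lshr %a, (bits - %amt))
; and checks the per-subtarget lowering: AVX1 operates on extracted 128-bit
; halves, AVX2 uses variable shifts (vpsllv*/vpsrlv*), AVX512 uses native
; vprolv* rotates where legal, and XOP matches the pattern to vprot*.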
15 define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
16 ; AVX1-LABEL: var_rotate_v4i64:
18 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
19 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
20 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
21 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
22 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
23 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
24 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
25 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
26 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
27 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
28 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
29 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
30 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
31 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
32 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
36 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
37 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
38 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
39 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
40 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
41 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
44 ; AVX2-LABEL: var_rotate_v4i64:
46 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
47 ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
48 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
49 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
50 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
53 ; AVX512F-LABEL: var_rotate_v4i64:
55 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
56 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
57 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
58 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
61 ; AVX512VL-LABEL: var_rotate_v4i64:
63 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
66 ; AVX512BW-LABEL: var_rotate_v4i64:
68 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
69 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
70 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
71 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
74 ; AVX512VLBW-LABEL: var_rotate_v4i64:
75 ; AVX512VLBW: # %bb.0:
76 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
77 ; AVX512VLBW-NEXT: retq
79 ; XOPAVX1-LABEL: var_rotate_v4i64:
81 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
82 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
83 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
84 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
85 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
88 ; XOPAVX2-LABEL: var_rotate_v4i64:
90 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
91 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
92 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
93 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
94 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
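
; Note: the shl/lshr/or pattern above is the same for every element type in
; this file; only the selected instructions differ (AVX1 works on xmm halves,
; AVX2 uses vpsllvq/vpsrlvq, AVX512 uses vprolvq in zmm or ymm registers, and
; XOP uses vprotq).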
103 define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
104 ; AVX1-LABEL: var_rotate_v8i32:
106 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
107 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
108 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
109 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
110 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
111 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
112 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
113 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
114 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
115 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
116 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
117 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
118 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
119 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
120 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
121 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
122 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
123 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
124 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
125 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
126 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
127 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
128 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
129 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
130 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
131 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
132 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
133 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
134 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
135 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
136 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
139 ; AVX2-LABEL: var_rotate_v8i32:
141 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
142 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
143 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
144 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
145 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
146 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
147 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
150 ; AVX512F-LABEL: var_rotate_v8i32:
152 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
153 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
154 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
155 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
158 ; AVX512VL-LABEL: var_rotate_v8i32:
160 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
161 ; AVX512VL-NEXT: retq
163 ; AVX512BW-LABEL: var_rotate_v8i32:
165 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
166 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
167 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
168 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
169 ; AVX512BW-NEXT: retq
171 ; AVX512VLBW-LABEL: var_rotate_v8i32:
172 ; AVX512VLBW: # %bb.0:
173 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
174 ; AVX512VLBW-NEXT: retq
176 ; XOPAVX1-LABEL: var_rotate_v8i32:
178 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
179 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
180 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
181 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
182 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
185 ; XOPAVX2-LABEL: var_rotate_v8i32:
187 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
188 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
189 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
190 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
191 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
200 define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
201 ; AVX1-LABEL: var_rotate_v16i16:
203 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
204 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
205 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
206 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
207 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
208 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
209 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
210 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
211 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
212 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
213 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
214 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
215 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
216 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
217 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
218 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
219 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
220 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
221 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
222 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
223 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
224 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
225 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
226 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
227 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
228 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
229 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
230 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
231 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
232 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
233 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
236 ; AVX2-LABEL: var_rotate_v16i16:
238 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
239 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
240 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
241 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
242 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
243 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
244 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
245 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
246 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
247 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
248 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
249 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
250 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
251 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
252 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
253 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
254 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
255 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
256 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
257 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
258 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
261 ; AVX512F-LABEL: var_rotate_v16i16:
263 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
264 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
265 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
266 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
267 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
268 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
269 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
270 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
271 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
272 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
275 ; AVX512VL-LABEL: var_rotate_v16i16:
277 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
278 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
279 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
280 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
281 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
282 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
283 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
284 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
285 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
286 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
287 ; AVX512VL-NEXT: retq
289 ; AVX512BW-LABEL: var_rotate_v16i16:
291 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
292 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
293 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
294 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
295 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
296 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
297 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
298 ; AVX512BW-NEXT: retq
300 ; AVX512VLBW-LABEL: var_rotate_v16i16:
301 ; AVX512VLBW: # %bb.0:
302 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
303 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
304 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
305 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
306 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
307 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
308 ; AVX512VLBW-NEXT: retq
310 ; XOPAVX1-LABEL: var_rotate_v16i16:
312 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
313 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
314 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
315 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
316 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
319 ; XOPAVX2-LABEL: var_rotate_v16i16:
321 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
322 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
323 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
324 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
325 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
334 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
335 ; AVX1-LABEL: var_rotate_v32i8:
337 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
338 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
339 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
340 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
341 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
342 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
343 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
344 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
345 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
346 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
347 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
348 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
349 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
350 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
351 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
352 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
353 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
354 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
355 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
356 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
357 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
358 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
359 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
360 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
361 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
362 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
363 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
364 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
365 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
366 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
367 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
368 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
369 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
370 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
371 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
372 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
373 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
374 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
375 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
376 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
377 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
378 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
379 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
380 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
381 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
382 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
385 ; AVX2-LABEL: var_rotate_v32i8:
387 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
388 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
389 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
390 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
391 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
392 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
393 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
394 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
395 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
396 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
397 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
398 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
399 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
400 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
401 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
402 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
403 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
404 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
405 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
406 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
409 ; AVX512F-LABEL: var_rotate_v32i8:
411 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
412 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
413 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
414 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
415 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
416 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
417 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
418 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
419 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
420 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
421 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
422 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
423 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
424 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
425 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
426 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
427 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
428 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
429 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
430 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
433 ; AVX512VL-LABEL: var_rotate_v32i8:
435 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
436 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
437 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
438 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
439 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
440 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
441 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
442 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm2
443 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
444 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
445 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
446 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
447 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
448 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
449 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
450 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
451 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
452 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
453 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
454 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
455 ; AVX512VL-NEXT: retq
457 ; AVX512BW-LABEL: var_rotate_v32i8:
459 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
460 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
461 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
462 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
463 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
464 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
465 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
466 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
467 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
468 ; AVX512BW-NEXT: retq
470 ; AVX512VLBW-LABEL: var_rotate_v32i8:
471 ; AVX512VLBW: # %bb.0:
472 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
473 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
474 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
475 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
476 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
477 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
478 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
479 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
480 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
481 ; AVX512VLBW-NEXT: retq
483 ; XOPAVX1-LABEL: var_rotate_v32i8:
485 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
486 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
487 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
488 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
489 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
492 ; XOPAVX2-LABEL: var_rotate_v32i8:
494 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
495 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
496 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
497 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
498 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
508 ; Uniform Variable Rotates
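
; These splatvar tests broadcast one rotate amount to every lane with a
; zero-mask shufflevector, so targets can use a single scalar-count shift
; (e.g. vpsllq/vpsrlq with an xmm count) or broadcast the amount for vprolv*.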
511 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
512 ; AVX1-LABEL: splatvar_rotate_v4i64:
514 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
515 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
516 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
517 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
518 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
519 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
520 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
521 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
522 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
523 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
524 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
527 ; AVX2-LABEL: splatvar_rotate_v4i64:
529 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
530 ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
531 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm1
532 ; AVX2-NEXT: vpsrlq %xmm2, %ymm0, %ymm0
533 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
536 ; AVX512F-LABEL: splatvar_rotate_v4i64:
538 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
539 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
540 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
541 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
544 ; AVX512VL-LABEL: splatvar_rotate_v4i64:
546 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
547 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
548 ; AVX512VL-NEXT: retq
550 ; AVX512BW-LABEL: splatvar_rotate_v4i64:
552 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
553 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
554 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
555 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
556 ; AVX512BW-NEXT: retq
558 ; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
559 ; AVX512VLBW: # %bb.0:
560 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
561 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
562 ; AVX512VLBW-NEXT: retq
564 ; XOPAVX1-LABEL: splatvar_rotate_v4i64:
566 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
567 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
568 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
569 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
570 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
573 ; XOPAVX2-LABEL: splatvar_rotate_v4i64:
575 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1
576 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
577 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
578 ; XOPAVX2-NEXT: vprotq %xmm3, %xmm2, %xmm2
579 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
580 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <4 x i64> %a, %splat
  %lshr = lshr <4 x i64> %a, %splat64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
590 define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
591 ; AVX1-LABEL: splatvar_rotate_v8i32:
593 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
594 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
595 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
596 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
597 ; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
598 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
599 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
600 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
601 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
602 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
603 ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
604 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
605 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
606 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
609 ; AVX2-LABEL: splatvar_rotate_v8i32:
611 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
612 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
613 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
614 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
615 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
616 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
617 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
618 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
619 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
620 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
623 ; AVX512F-LABEL: splatvar_rotate_v8i32:
625 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
626 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
627 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
628 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
631 ; AVX512VL-LABEL: splatvar_rotate_v8i32:
633 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
634 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
635 ; AVX512VL-NEXT: retq
637 ; AVX512BW-LABEL: splatvar_rotate_v8i32:
639 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
640 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
641 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
642 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
643 ; AVX512BW-NEXT: retq
645 ; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
646 ; AVX512VLBW: # %bb.0:
647 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
648 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
649 ; AVX512VLBW-NEXT: retq
651 ; XOPAVX1-LABEL: splatvar_rotate_v8i32:
653 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
654 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
655 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
656 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
657 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
660 ; XOPAVX2-LABEL: splatvar_rotate_v8i32:
662 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
663 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
664 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
665 ; XOPAVX2-NEXT: vprotd %xmm3, %xmm2, %xmm2
666 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
667 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <8 x i32> %a, %splat
  %lshr = lshr <8 x i32> %a, %splat32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
677 define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
678 ; AVX1-LABEL: splatvar_rotate_v16i16:
680 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
681 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
682 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
683 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
684 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
685 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
686 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
687 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
688 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
689 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
690 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
691 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
692 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
693 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
694 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
697 ; AVX2-LABEL: splatvar_rotate_v16i16:
699 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
700 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
701 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
702 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
703 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
704 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
705 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
706 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
707 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
710 ; AVX512-LABEL: splatvar_rotate_v16i16:
712 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
713 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
714 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
715 ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
716 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
717 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
718 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
719 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
720 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
723 ; XOPAVX1-LABEL: splatvar_rotate_v16i16:
725 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
726 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
727 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
728 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
729 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
730 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
733 ; XOPAVX2-LABEL: splatvar_rotate_v16i16:
735 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1
736 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
737 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
738 ; XOPAVX2-NEXT: vprotw %xmm3, %xmm2, %xmm2
739 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
740 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <16 x i16> %a, %splat
  %lshr = lshr <16 x i16> %a, %splat16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
750 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
751 ; AVX1-LABEL: splatvar_rotate_v32i8:
753 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
754 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
755 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
756 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
757 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
758 ; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5
759 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
760 ; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7
761 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
762 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
763 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
764 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
765 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
766 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
767 ; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6
768 ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
769 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
770 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
771 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
772 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
773 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
774 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
775 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
776 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
779 ; AVX2-LABEL: splatvar_rotate_v32i8:
781 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
782 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
783 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
784 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
785 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
786 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
787 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
788 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
789 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
790 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
791 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
792 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
793 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
794 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
795 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
796 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
797 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
800 ; AVX512F-LABEL: splatvar_rotate_v32i8:
802 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
803 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
804 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
805 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
806 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
807 ; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
808 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
809 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
810 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
811 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
812 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
813 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
814 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
815 ; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
816 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
817 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
818 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
821 ; AVX512VL-LABEL: splatvar_rotate_v32i8:
823 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
824 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
825 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
826 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
827 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
828 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
829 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
830 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
831 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
832 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
833 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
834 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
835 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
836 ; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
837 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
838 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
839 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
840 ; AVX512VL-NEXT: retq
842 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
844 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
845 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
846 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
847 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
848 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
849 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
850 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
851 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
852 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
853 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
854 ; AVX512BW-NEXT: retq
856 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
857 ; AVX512VLBW: # %bb.0:
858 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
859 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
860 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
861 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
862 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
863 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
864 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
865 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
866 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
867 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
868 ; AVX512VLBW-NEXT: retq
870 ; XOPAVX1-LABEL: splatvar_rotate_v32i8:
872 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
873 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
874 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
875 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
876 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
877 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
880 ; XOPAVX2-LABEL: splatvar_rotate_v32i8:
882 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
883 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
884 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
885 ; XOPAVX2-NEXT: vprotb %xmm3, %xmm2, %xmm2
886 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
887 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
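
; The constant_rotate tests below use constant per-element rotate amounts,
; which the targets fold into constant-pool operands (vpsllv*/vpsrlv*,
; vprolv*, vprot*, or pmullw/pmulhuw multiplies by powers of two where
; variable shifts are unavailable).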
901 define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
902 ; AVX1-LABEL: constant_rotate_v4i64:
904 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
905 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
906 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
907 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
908 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
909 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4
910 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
911 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
912 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
913 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1
914 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
915 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
916 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
917 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
918 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
919 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
922 ; AVX2-LABEL: constant_rotate_v4i64:
924 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
925 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
926 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
929 ; AVX512F-LABEL: constant_rotate_v4i64:
931 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
932 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
933 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
934 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
937 ; AVX512VL-LABEL: constant_rotate_v4i64:
939 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
940 ; AVX512VL-NEXT: retq
942 ; AVX512BW-LABEL: constant_rotate_v4i64:
944 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
945 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
946 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
947 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
948 ; AVX512BW-NEXT: retq
950 ; AVX512VLBW-LABEL: constant_rotate_v4i64:
951 ; AVX512VLBW: # %bb.0:
952 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
953 ; AVX512VLBW-NEXT: retq
955 ; XOPAVX1-LABEL: constant_rotate_v4i64:
957 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
958 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
959 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
960 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
963 ; XOPAVX2-LABEL: constant_rotate_v4i64:
965 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
966 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
967 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
968 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
976 define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
977 ; AVX1-LABEL: constant_rotate_v8i32:
979 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
980 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
981 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
982 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
983 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
984 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
985 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
986 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
987 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
988 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
989 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
990 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
991 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
992 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
993 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
994 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
995 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
996 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
997 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
998 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
999 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1000 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1003 ; AVX2-LABEL: constant_rotate_v8i32:
1005 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1006 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1007 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1010 ; AVX512F-LABEL: constant_rotate_v8i32:
1012 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1013 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1014 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1015 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1016 ; AVX512F-NEXT: retq
1018 ; AVX512VL-LABEL: constant_rotate_v8i32:
1019 ; AVX512VL: # %bb.0:
1020 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1021 ; AVX512VL-NEXT: retq
1023 ; AVX512BW-LABEL: constant_rotate_v8i32:
1024 ; AVX512BW: # %bb.0:
1025 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1026 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1027 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1028 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1029 ; AVX512BW-NEXT: retq
1031 ; AVX512VLBW-LABEL: constant_rotate_v8i32:
1032 ; AVX512VLBW: # %bb.0:
1033 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1034 ; AVX512VLBW-NEXT: retq
1036 ; XOPAVX1-LABEL: constant_rotate_v8i32:
1038 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1039 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1040 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1041 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1042 ; XOPAVX1-NEXT: retq
1044 ; XOPAVX2-LABEL: constant_rotate_v8i32:
1046 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1047 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1048 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1049 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1050 ; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
1057 define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
1058 ; AVX1-LABEL: constant_rotate_v16i16:
1060 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1061 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1062 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1063 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1064 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1065 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1066 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1067 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1068 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1069 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1072 ; AVX2-LABEL: constant_rotate_v16i16:
1074 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1075 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1076 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1077 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1080 ; AVX512F-LABEL: constant_rotate_v16i16:
1082 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1083 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1084 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1085 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1086 ; AVX512F-NEXT: retq
1088 ; AVX512VL-LABEL: constant_rotate_v16i16:
1089 ; AVX512VL: # %bb.0:
1090 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1091 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1092 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1093 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1094 ; AVX512VL-NEXT: retq
1096 ; AVX512BW-LABEL: constant_rotate_v16i16:
1097 ; AVX512BW: # %bb.0:
1098 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1099 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1100 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1101 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
1102 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1103 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
1104 ; AVX512BW-NEXT: retq
1106 ; AVX512VLBW-LABEL: constant_rotate_v16i16:
1107 ; AVX512VLBW: # %bb.0:
1108 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1109 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1110 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1111 ; AVX512VLBW-NEXT: retq
1113 ; XOPAVX1-LABEL: constant_rotate_v16i16:
1115 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1116 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1117 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1118 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1119 ; XOPAVX1-NEXT: retq
1121 ; XOPAVX2-LABEL: constant_rotate_v16i16:
1123 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1124 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1125 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1126 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1127 ; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
1134 define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
1135 ; AVX1-LABEL: constant_rotate_v32i8:
1137 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1138 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1139 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1140 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
1141 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1142 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1143 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1144 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
1145 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
1146 ; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1147 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1148 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1149 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
1150 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
1151 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1152 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1153 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1154 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
1155 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
1156 ; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
1157 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1158 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1159 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1160 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1161 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1162 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
1163 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1164 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1165 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1166 ; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
1167 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1168 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
1169 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1170 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1171 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1172 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1175 ; AVX2-LABEL: constant_rotate_v32i8:
1177 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1178 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1179 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1180 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1181 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1182 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
1183 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1184 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1185 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1186 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1187 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1188 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1189 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1190 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1191 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1192 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1193 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1194 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1195 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1196 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1197 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1200 ; AVX512F-LABEL: constant_rotate_v32i8:
1202 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1203 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1204 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1205 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1206 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1207 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
1208 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1209 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1210 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1211 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1212 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1213 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1214 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1215 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1216 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1217 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1218 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1219 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1220 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1221 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1222 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
1223 ; AVX512F-NEXT: retq
1225 ; AVX512VL-LABEL: constant_rotate_v32i8:
1226 ; AVX512VL: # %bb.0:
1227 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1228 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1229 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1230 ; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
1231 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1232 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
1233 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1234 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1235 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1236 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1237 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1238 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1239 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1240 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1241 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1242 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1243 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1244 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1245 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1246 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1247 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1248 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
1249 ; AVX512VL-NEXT: retq
1251 ; AVX512BW-LABEL: constant_rotate_v32i8:
1252 ; AVX512BW: # %bb.0:
1253 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1254 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
1255 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1256 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
1257 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1258 ; AVX512BW-NEXT: retq
1260 ; AVX512VLBW-LABEL: constant_rotate_v32i8:
1261 ; AVX512VLBW: # %bb.0:
1262 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1263 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
1264 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1265 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
1266 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1267 ; AVX512VLBW-NEXT: retq
1269 ; XOPAVX1-LABEL: constant_rotate_v32i8:
1271 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1272 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1273 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1274 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1275 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1276 ; XOPAVX1-NEXT: retq
1278 ; XOPAVX2-LABEL: constant_rotate_v32i8:
1280 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1281 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1282 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1283 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1284 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1285 ; XOPAVX2-NEXT: retq
1286 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1287 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1288 %or = or <32 x i8> %shl, %lshr
1289 ret <32 x i8> %or
1290 }
1293 ; Uniform Constant Rotates
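; The splat-amount tests below rotate every lane by the same constant, so targets
; with a native rotate can select a single instruction: AVX512VL uses vprolq/vprold,
; AVX512F/AVX512BW widen to zmm around the same vprol*, and XOP uses vprot* with an
; immediate, while plain AVX1/AVX2 keep the shl+lshr+vpor expansion checked below.
; Side note (an illustrative sketch, not part of this test and not exercised by the
; RUN lines): the same rotate can also be written directly with the funnel-shift
; intrinsic. The hypothetical @rotl14_fshl below assumes the rotate-left-by-14 used
; in splatconstant_rotate_v4i64.
;
;   declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
;
;   define <4 x i64> @rotl14_fshl(<4 x i64> %a) nounwind {
;     ; fshl(%a, %a, %amt) is a rotate-left of %a by %amt in each lane
;     %r = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a,
;                                          <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
;     ret <4 x i64> %r
;   }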
1296 define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
1297 ; AVX1-LABEL: splatconstant_rotate_v4i64:
1299 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
1300 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1301 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
1302 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1303 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
1304 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
1305 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1306 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
1309 ; AVX2-LABEL: splatconstant_rotate_v4i64:
1311 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
1312 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
1313 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1316 ; AVX512F-LABEL: splatconstant_rotate_v4i64:
1318 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1319 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1320 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1321 ; AVX512F-NEXT: retq
1323 ; AVX512VL-LABEL: splatconstant_rotate_v4i64:
1324 ; AVX512VL: # %bb.0:
1325 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1326 ; AVX512VL-NEXT: retq
1328 ; AVX512BW-LABEL: splatconstant_rotate_v4i64:
1329 ; AVX512BW: # %bb.0:
1330 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1331 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1332 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1333 ; AVX512BW-NEXT: retq
1335 ; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
1336 ; AVX512VLBW: # %bb.0:
1337 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1338 ; AVX512VLBW-NEXT: retq
1340 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
1342 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1343 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1344 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1345 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1346 ; XOPAVX1-NEXT: retq
1348 ; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
1350 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1351 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1352 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1353 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1354 ; XOPAVX2-NEXT: retq
1355 %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
1356 %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
1357 %or = or <4 x i64> %shl, %lshr
1358 ret <4 x i64> %or
1359 }
1361 define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
1362 ; AVX1-LABEL: splatconstant_rotate_v8i32:
1364 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1365 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1366 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1367 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1368 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1369 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1370 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1371 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1374 ; AVX2-LABEL: splatconstant_rotate_v8i32:
1376 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1377 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1378 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1381 ; AVX512F-LABEL: splatconstant_rotate_v8i32:
1383 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1384 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1385 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1386 ; AVX512F-NEXT: retq
1388 ; AVX512VL-LABEL: splatconstant_rotate_v8i32:
1389 ; AVX512VL: # %bb.0:
1390 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1391 ; AVX512VL-NEXT: retq
1393 ; AVX512BW-LABEL: splatconstant_rotate_v8i32:
1394 ; AVX512BW: # %bb.0:
1395 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1396 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1397 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1398 ; AVX512BW-NEXT: retq
1400 ; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
1401 ; AVX512VLBW: # %bb.0:
1402 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1403 ; AVX512VLBW-NEXT: retq
1405 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
1407 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1408 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1409 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1410 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1411 ; XOPAVX1-NEXT: retq
1413 ; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
1415 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1416 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1417 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1418 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1419 ; XOPAVX2-NEXT: retq
1420 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1421 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
1422 %or = or <8 x i32> %shl, %lshr
1423 ret <8 x i32> %or
1424 }
1426 define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
1427 ; AVX1-LABEL: splatconstant_rotate_v16i16:
1429 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1430 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1431 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1432 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1433 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1434 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1435 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1436 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1439 ; AVX2-LABEL: splatconstant_rotate_v16i16:
1441 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1442 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1443 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1446 ; AVX512-LABEL: splatconstant_rotate_v16i16:
1448 ; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm1
1449 ; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
1450 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1453 ; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
1455 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1456 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1457 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1458 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1459 ; XOPAVX1-NEXT: retq
1461 ; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
1463 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1464 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1465 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1466 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1467 ; XOPAVX2-NEXT: retq
1468 %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1469 %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1470 %or = or <16 x i16> %shl, %lshr
1471 ret <16 x i16> %or
1472 }
1474 define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
1475 ; AVX1-LABEL: splatconstant_rotate_v32i8:
1477 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1478 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1479 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1480 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1481 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1482 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1483 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1484 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1485 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1486 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1487 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1488 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1489 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1492 ; AVX2-LABEL: splatconstant_rotate_v32i8:
1494 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1495 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1496 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1497 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1498 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1501 ; AVX512F-LABEL: splatconstant_rotate_v32i8:
1503 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1504 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1505 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1506 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1507 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1508 ; AVX512F-NEXT: retq
1510 ; AVX512VL-LABEL: splatconstant_rotate_v32i8:
1511 ; AVX512VL: # %bb.0:
1512 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
1513 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1514 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1515 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1516 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1517 ; AVX512VL-NEXT: retq
1519 ; AVX512BW-LABEL: splatconstant_rotate_v32i8:
1520 ; AVX512BW: # %bb.0:
1521 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1522 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1523 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1524 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1525 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1526 ; AVX512BW-NEXT: retq
1528 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
1529 ; AVX512VLBW: # %bb.0:
1530 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1531 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1532 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1533 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1534 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1535 ; AVX512VLBW-NEXT: retq
1537 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
1539 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1540 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1541 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1542 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1543 ; XOPAVX1-NEXT: retq
1545 ; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
1547 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1548 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1549 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1550 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1551 ; XOPAVX2-NEXT: retq
1552 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1553 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1554 %or = or <32 x i8> %shl, %lshr
1555 ret <32 x i8> %or
1556 }
1559 ; Masked Uniform Constant Rotates
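; These variants AND the shifted halves with constant masks before the final OR,
; checking that the rotate is still matched and the masks are folded into it: the
; AVX512 targets below still select vprolq/vprold (plus a single vpand of the
; combined mask), XOP keeps vprot* followed by one vandps/vpand, and in the v4i64
; AVX1/AVX2 case the masked shl half folds away entirely, leaving only the vpsrlq
; side, as the checks show.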
1562 define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
1563 ; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
1565 ; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
1566 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1567 ; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
1568 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1569 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1572 ; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
1574 ; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
1575 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1578 ; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
1580 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1581 ; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
1582 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1583 ; AVX512F-NEXT: retq
1585 ; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
1586 ; AVX512VL: # %bb.0:
1587 ; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
1588 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1589 ; AVX512VL-NEXT: retq
1591 ; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
1592 ; AVX512BW: # %bb.0:
1593 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1594 ; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
1595 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1596 ; AVX512BW-NEXT: retq
1598 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
1599 ; AVX512VLBW: # %bb.0:
1600 ; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0
1601 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1602 ; AVX512VLBW-NEXT: retq
1604 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
1606 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
1607 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1608 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
1609 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1610 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1611 ; XOPAVX1-NEXT: retq
1613 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
1615 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
1616 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1617 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
1618 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1619 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1620 ; XOPAVX2-NEXT: retq
1621 %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
1622 %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
1623 %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
1624 %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
1625 %or = or <4 x i64> %lmask, %rmask
1626 ret <4 x i64> %or
1627 }
1629 define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
1630 ; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
1632 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1633 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1634 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1635 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1636 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1637 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1638 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1639 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1640 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1643 ; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
1645 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1646 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1647 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1648 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1651 ; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
1653 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1654 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1655 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1656 ; AVX512F-NEXT: retq
1658 ; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
1659 ; AVX512VL: # %bb.0:
1660 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1661 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1662 ; AVX512VL-NEXT: retq
1664 ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
1665 ; AVX512BW: # %bb.0:
1666 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1667 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1668 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1669 ; AVX512BW-NEXT: retq
1671 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
1672 ; AVX512VLBW: # %bb.0:
1673 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1674 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1675 ; AVX512VLBW-NEXT: retq
1677 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
1679 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1680 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1681 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1682 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1683 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1684 ; XOPAVX1-NEXT: retq
1686 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
1688 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1689 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1690 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1691 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1692 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1693 ; XOPAVX2-NEXT: retq
1694 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1695 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
1696 %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
1697 %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
1698 %or = or <8 x i32> %lmask, %rmask
1699 ret <8 x i32> %or
1700 }
1702 define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
1703 ; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
1705 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1706 ; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
1707 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
1708 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1709 ; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
1710 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
1711 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1712 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1713 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1716 ; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
1718 ; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
1719 ; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
1720 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1721 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1724 ; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
1726 ; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1
1727 ; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0
1728 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1729 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1732 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
1734 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
1735 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1736 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
1737 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1738 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1739 ; XOPAVX1-NEXT: retq
1741 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
1743 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
1744 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1745 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
1746 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1747 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1748 ; XOPAVX2-NEXT: retq
1749 %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
1750 %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
1751 %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
1752 %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
1753 %or = or <16 x i16> %lmask, %rmask
1754 ret <16 x i16> %or
1755 }
1757 define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
1758 ; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
1760 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1761 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1762 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1763 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1764 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1765 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1766 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1767 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1768 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1769 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1770 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1771 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1772 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1773 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1776 ; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
1778 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1779 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1780 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1781 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1782 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1783 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1786 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
1788 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1789 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1790 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1791 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1792 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1793 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1794 ; AVX512F-NEXT: retq
1796 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
1797 ; AVX512VL: # %bb.0:
1798 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
1799 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1800 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1801 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1802 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1803 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1804 ; AVX512VL-NEXT: retq
1806 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
1807 ; AVX512BW: # %bb.0:
1808 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1809 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1810 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1811 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1812 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1813 ; AVX512BW-NEXT: retq
1815 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
1816 ; AVX512VLBW: # %bb.0:
1817 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1818 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1819 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1820 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1821 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1822 ; AVX512VLBW-NEXT: retq
1824 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
1826 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1827 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1828 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1829 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1830 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1831 ; XOPAVX1-NEXT: retq
1833 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
1835 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1836 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1837 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1838 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1839 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1840 ; XOPAVX2-NEXT: retq
1841 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1842 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1843 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1844 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1845 %or = or <32 x i8> %lmask, %rmask