1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; Per-lane variable rotate-left of a <4 x i64> vector, written as the
; shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
15 define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
16 ; AVX1-LABEL: var_rotate_v4i64:
18 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
19 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
20 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
21 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
22 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
23 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
24 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
25 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
26 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
27 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
28 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
29 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
30 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
31 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
32 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
36 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
37 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
38 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
39 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
40 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
41 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
44 ; AVX2-LABEL: var_rotate_v4i64:
46 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
47 ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
48 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
49 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
50 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; For the avx512f-only run the expected code widens both ymm operands to zmm
; (the "kill" lines) and issues one vprolvq on the zmm registers.
53 ; AVX512F-LABEL: var_rotate_v4i64:
55 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
56 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
57 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
58 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; With avx512vl enabled the expected vprolvq operates on ymm directly.
61 ; AVX512VL-LABEL: var_rotate_v4i64:
63 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
66 ; AVX512BW-LABEL: var_rotate_v4i64:
68 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
69 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
70 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
71 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
74 ; AVX512VLBW-LABEL: var_rotate_v4i64:
75 ; AVX512VLBW: # %bb.0:
76 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
77 ; AVX512VLBW-NEXT: retq
; The XOP runs expect the vector to be split into two 128-bit halves, each
; rotated with vprotq, then recombined with a 128-bit insert.
79 ; XOPAVX1-LABEL: var_rotate_v4i64:
81 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
82 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
83 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
84 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
85 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
88 ; XOPAVX2-LABEL: var_rotate_v4i64:
90 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
91 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
92 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
93 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
94 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test, lane by lane -- %or = (%a << %b) | (%a >> (64 - %b)).
; %or is presumably the returned value; the ret is not visible in this view.
96 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
97 %shl = shl <4 x i64> %a, %b
98 %lshr = lshr <4 x i64> %a, %b64
99 %or = or <4 x i64> %shl, %lshr
; Per-lane variable rotate-left of an <8 x i32> vector, written as the
; shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
103 define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; Reviewer note on the avx-only expectation below -- the amount is masked
; with 31, shifted left by 23 and added to 1065353216 (0x3F800000, the bit
; pattern of 1.0f) before vcvttps2dq. This looks like building 2^amount per
; lane so the rotate can be done with vpmuludq; confirm before relying on it.
104 ; AVX1-LABEL: var_rotate_v8i32:
106 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
107 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
108 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
109 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
110 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
111 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
112 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
113 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
114 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
115 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
116 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
117 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
118 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
119 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
120 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
121 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
122 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
123 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
124 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
125 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
126 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
127 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
128 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
129 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
130 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
131 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
132 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
133 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
134 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
135 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
136 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; The avx2 expectation masks the amount with 31 and uses per-lane variable
; shifts (vpsllvd / vpsrlvd) combined with vpor.
139 ; AVX2-LABEL: var_rotate_v8i32:
141 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
142 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
143 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
144 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
145 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
146 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
147 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; For the runs without avx512vl the expected code widens both ymm operands
; to zmm (the "kill" lines) and issues one vprolvd on the zmm registers.
150 ; AVX512F-LABEL: var_rotate_v8i32:
152 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
153 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
154 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
155 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
158 ; AVX512VL-LABEL: var_rotate_v8i32:
160 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
161 ; AVX512VL-NEXT: retq
163 ; AVX512BW-LABEL: var_rotate_v8i32:
165 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
166 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
167 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
168 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
169 ; AVX512BW-NEXT: retq
171 ; AVX512VLBW-LABEL: var_rotate_v8i32:
172 ; AVX512VLBW: # %bb.0:
173 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
174 ; AVX512VLBW-NEXT: retq
; The XOP runs expect the vector to be split into two 128-bit halves, each
; rotated with vprotd, then recombined with a 128-bit insert.
176 ; XOPAVX1-LABEL: var_rotate_v8i32:
178 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
179 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
180 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
181 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
182 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
185 ; XOPAVX2-LABEL: var_rotate_v8i32:
187 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
188 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
189 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
190 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
191 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test, lane by lane -- %or = (%a << %b) | (%a >> (32 - %b)).
; %or is presumably the returned value; the ret is not visible in this view.
193 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
194 %shl = shl <8 x i32> %a, %b
195 %lshr = lshr <8 x i32> %a, %b32
196 %or = or <8 x i32> %shl, %lshr
; Per-lane variable rotate-left of a <16 x i16> vector, written as the
; shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
200 define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
201 ; AVX1-LABEL: var_rotate_v16i16:
203 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
204 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
205 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
206 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
207 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
208 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
209 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
210 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
211 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
212 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
213 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
214 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
215 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
216 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
217 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
218 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
219 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
220 ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
221 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
222 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
223 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
224 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
225 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
226 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
227 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
228 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
229 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
230 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
231 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
232 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
233 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
234 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
237 ; AVX2-LABEL: var_rotate_v16i16:
239 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
240 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
241 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
242 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
243 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
244 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
245 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
246 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
247 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
248 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
249 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
250 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
251 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
252 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
253 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
254 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
255 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
256 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
257 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
258 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
259 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; Without avx512bw the expected code zero-extends the 16-bit lanes to 32-bit
; lanes in zmm (vpmovzxwd), shifts with vpsllvd / vpsrlvd, ORs the halves and
; truncates back with vpmovdw.
262 ; AVX512F-LABEL: var_rotate_v16i16:
264 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
265 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
266 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
267 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
268 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
269 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
270 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
271 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
272 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
273 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
276 ; AVX512VL-LABEL: var_rotate_v16i16:
278 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
279 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
280 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
281 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
282 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
283 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
284 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
285 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
286 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
287 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
288 ; AVX512VL-NEXT: retq
; With avx512bw the expected code shifts the 16-bit lanes directly using
; vpsllvw / vpsrlvw (on zmm here, on ymm when avx512vl is also enabled).
290 ; AVX512BW-LABEL: var_rotate_v16i16:
292 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
293 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
294 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
295 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
296 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
297 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
298 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
299 ; AVX512BW-NEXT: retq
301 ; AVX512VLBW-LABEL: var_rotate_v16i16:
302 ; AVX512VLBW: # %bb.0:
303 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
304 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
305 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
306 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
307 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
308 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
309 ; AVX512VLBW-NEXT: retq
; The XOP runs expect the vector to be split into two 128-bit halves, each
; rotated with vprotw, then recombined with a 128-bit insert.
311 ; XOPAVX1-LABEL: var_rotate_v16i16:
313 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
314 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
315 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
316 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
317 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
320 ; XOPAVX2-LABEL: var_rotate_v16i16:
322 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
323 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
324 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
325 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
326 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test, lane by lane -- %or = (%a << %b) | (%a >> (16 - %b)).
; %or is presumably the returned value; the ret is not visible in this view.
328 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
329 %shl = shl <16 x i16> %a, %b
330 %lshr = lshr <16 x i16> %a, %b16
331 %or = or <16 x i16> %shl, %lshr
; Per-lane variable rotate-left of a <32 x i8> vector, written as the
; shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
335 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; Reviewer note on the avx-only expectation below -- it performs three
; conditional steps (rotate by 4, by 2, by 1), each selected with vpblendvb
; on the amount after vpsllw $5 and successive vpaddb doublings. This looks
; like a bit-serial rotate keyed on amount bits 2, 1, 0; confirm before
; relying on this description.
336 ; AVX1-LABEL: var_rotate_v32i8:
338 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
339 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
340 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
341 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
342 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
343 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
344 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
345 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
346 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
347 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
348 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
349 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
350 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
351 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
352 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
353 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
354 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
355 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
356 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
357 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
358 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
359 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
360 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
361 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
362 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
363 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
364 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
365 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
366 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
367 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
368 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
369 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
370 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
371 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
372 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
373 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
374 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
375 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
376 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
377 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
378 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
379 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
380 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
381 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
382 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
383 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
386 ; AVX2-LABEL: var_rotate_v32i8:
388 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
389 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
390 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
391 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
392 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
393 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
394 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
395 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
396 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
397 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
398 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
399 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
400 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
401 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
402 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
403 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
404 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
405 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
406 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
407 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; Without avx512bw the expected instruction sequence is the same ymm
; blend-based sequence as in the avx2 run above.
410 ; AVX512F-LABEL: var_rotate_v32i8:
412 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
413 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
414 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
415 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
416 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
417 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
418 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
419 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
420 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
421 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
422 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
423 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
424 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
425 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
426 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
427 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
428 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
429 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
430 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
431 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
434 ; AVX512VL-LABEL: var_rotate_v32i8:
436 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
437 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
438 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
439 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
440 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
441 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
442 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
443 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm2
444 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
445 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
446 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
447 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
448 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
449 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
450 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
451 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
452 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
453 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
454 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
455 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
456 ; AVX512VL-NEXT: retq
; With avx512bw the expected code zero-extends the 8-bit lanes to 16-bit
; lanes in zmm (vpmovzxbw), shifts with vpsllvw / vpsrlvw, ORs the halves
; and truncates back with vpmovwb.
458 ; AVX512BW-LABEL: var_rotate_v32i8:
460 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
461 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
462 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
463 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
464 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
465 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
466 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
467 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
468 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
469 ; AVX512BW-NEXT: retq
471 ; AVX512VLBW-LABEL: var_rotate_v32i8:
472 ; AVX512VLBW: # %bb.0:
473 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
474 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
475 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
476 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
477 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
478 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
479 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
480 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
481 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
482 ; AVX512VLBW-NEXT: retq
; The XOP runs expect the vector to be split into two 128-bit halves, each
; rotated with vprotb, then recombined with a 128-bit insert.
484 ; XOPAVX1-LABEL: var_rotate_v32i8:
486 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
487 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
488 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
489 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
490 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
493 ; XOPAVX2-LABEL: var_rotate_v32i8:
495 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
496 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
497 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
498 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
499 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test, lane by lane -- %or = (%a << %b) | (%a >> (8 - %b)).
; %or is presumably the returned value; the ret is not visible in this view.
501 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
502 %shl = shl <32 x i8> %a, %b
503 %lshr = lshr <32 x i8> %a, %b8
504 %or = or <32 x i8> %shl, %lshr
509 ; Uniform Variable Rotates
; Rotate-left of a <4 x i64> vector by a uniform amount -- lane 0 of %b is
; broadcast to every lane (shufflevector with an all-zero mask) and then used
; in the shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
512 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
513 ; AVX1-LABEL: splatvar_rotate_v4i64:
515 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0]
516 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
517 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
518 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
519 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
520 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
521 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
522 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
523 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
524 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
525 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; The avx2 expectation uses the shift-by-xmm-count forms (vpsllq / vpsrlq
; with an xmm amount) on the whole ymm value, with no explicit broadcast.
528 ; AVX2-LABEL: splatvar_rotate_v4i64:
530 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2
531 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
532 ; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1
533 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
534 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; The avx512 expectations broadcast the amount with vpbroadcastq and rotate
; with vprolvq (on zmm when avx512vl is absent, on ymm when it is present).
537 ; AVX512F-LABEL: splatvar_rotate_v4i64:
539 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
540 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
541 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
542 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
545 ; AVX512VL-LABEL: splatvar_rotate_v4i64:
547 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
548 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
549 ; AVX512VL-NEXT: retq
551 ; AVX512BW-LABEL: splatvar_rotate_v4i64:
553 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
554 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
555 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
556 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
557 ; AVX512BW-NEXT: retq
559 ; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
560 ; AVX512VLBW: # %bb.0:
561 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
562 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
563 ; AVX512VLBW-NEXT: retq
565 ; XOPAVX1-LABEL: splatvar_rotate_v4i64:
567 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
568 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
569 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
570 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
571 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
574 ; XOPAVX2-LABEL: splatvar_rotate_v4i64:
576 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1
577 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
578 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
579 ; XOPAVX2-NEXT: vprotq %xmm3, %xmm2, %xmm2
580 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
581 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test -- %splat replicates lane 0 of %b into all four lanes, then
; %or = (%a << %splat) | (%a >> (64 - %splat)), lane by lane.
; %or is presumably the returned value; the ret is not visible in this view.
583 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
584 %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
585 %shl = shl <4 x i64> %a, %splat
586 %lshr = lshr <4 x i64> %a, %splat64
587 %or = or <4 x i64> %shl, %lshr
; Rotate-left of an <8 x i32> vector by a uniform amount -- lane 0 of %b is
; broadcast to every lane (shufflevector with an all-zero mask) and then used
; in the shl / lshr / or idiom (see the IR at the end of this function).
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script instead of editing them by hand.
591 define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
592 ; AVX1-LABEL: splatvar_rotate_v8i32:
594 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
595 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
596 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
597 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
598 ; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
599 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
600 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
601 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
602 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
603 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
604 ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
605 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
606 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
607 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; The avx2 expectation masks the broadcast amount with 31, zero-extends it
; with vpmovzxdq, and uses the shift-by-xmm-count forms vpslld / vpsrld on
; the whole ymm value.
610 ; AVX2-LABEL: splatvar_rotate_v8i32:
612 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
613 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
614 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
615 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
616 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
617 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
618 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
619 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
620 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
621 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; The avx512 expectations broadcast the amount with vpbroadcastd and rotate
; with vprolvd (on zmm when avx512vl is absent, on ymm when it is present).
624 ; AVX512F-LABEL: splatvar_rotate_v8i32:
626 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
627 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
628 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
629 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
632 ; AVX512VL-LABEL: splatvar_rotate_v8i32:
634 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
635 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
636 ; AVX512VL-NEXT: retq
638 ; AVX512BW-LABEL: splatvar_rotate_v8i32:
640 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
641 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
642 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
643 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
644 ; AVX512BW-NEXT: retq
646 ; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
647 ; AVX512VLBW: # %bb.0:
648 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
649 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
650 ; AVX512VLBW-NEXT: retq
652 ; XOPAVX1-LABEL: splatvar_rotate_v8i32:
654 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
655 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
656 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
657 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
658 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
661 ; XOPAVX2-LABEL: splatvar_rotate_v8i32:
663 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
664 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
665 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
666 ; XOPAVX2-NEXT: vprotd %xmm3, %xmm2, %xmm2
667 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
668 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; IR under test -- %splat replicates lane 0 of %b into all eight lanes, then
; %or = (%a << %splat) | (%a >> (32 - %splat)), lane by lane.
; %or is presumably the returned value; the ret is not visible in this view.
670 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
671 %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
672 %shl = shl <8 x i32> %a, %splat
673 %lshr = lshr <8 x i32> %a, %splat32
674 %or = or <8 x i32> %shl, %lshr
; Rotate-left of a <16 x i16> vector by a splatted variable amount, written as
; the shl / lshr / or idiom. The assertions below are autogenerated (see the NOTE
; on the first line of this file).
678 define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; In the non-xop configurations the expected code is a vpsllw / vpsrlw pair
; combined with vpor, using a scalar shift count taken from the splat.
679 ; AVX1-LABEL: splatvar_rotate_v16i16:
681 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
682 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
683 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
684 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
685 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
686 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
687 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
688 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
689 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
690 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
691 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
692 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
693 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
694 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
695 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
698 ; AVX2-LABEL: splatvar_rotate_v16i16:
700 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
701 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
702 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
703 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
704 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
705 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
706 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
707 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
708 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
711 ; AVX512-LABEL: splatvar_rotate_v16i16:
713 ; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1
714 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
715 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
716 ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
717 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
718 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
719 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
720 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
721 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; With +xop the expected code uses vprotw, once per 128-bit half of the input.
724 ; XOPAVX1-LABEL: splatvar_rotate_v16i16:
726 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
727 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
728 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
729 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
730 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
731 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
734 ; XOPAVX2-LABEL: splatvar_rotate_v16i16:
736 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1
737 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
738 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
739 ; XOPAVX2-NEXT: vprotw %xmm3, %xmm2, %xmm2
740 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
741 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %b to all 16 lanes (all-zero shuffle mask).
743 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
; Complementary right-shift amount: 16 - splat, 16 being the i16 element width.
744 %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
; Rotate idiom: (%a << splat) | (%a >> (16 - splat)).
745 %shl = shl <16 x i16> %a, %splat
746 %lshr = lshr <16 x i16> %a, %splat16
747 %or = or <16 x i16> %shl, %lshr
; Rotate-left of a <32 x i8> vector by a splatted variable amount, written as
; the shl / lshr / or idiom. The assertions below are autogenerated (see the NOTE
; on the first line of this file).
751 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; Here the expected code shifts 16-bit words (vpsllw / vpsrlw) and masks the
; results with vpand before combining them with vpor.
752 ; AVX1-LABEL: splatvar_rotate_v32i8:
754 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
755 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
756 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
757 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
758 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
759 ; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5
760 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
761 ; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7
762 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
763 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
764 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
765 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
766 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
767 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
768 ; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6
769 ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
770 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
771 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
772 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
773 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
774 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
775 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
776 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
777 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
780 ; AVX2-LABEL: splatvar_rotate_v32i8:
782 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
783 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
784 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
785 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
786 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
787 ; AVX2-NEXT: vpsllw %xmm2, %ymm4, %ymm2
788 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
789 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
790 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
791 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
792 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
793 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
794 ; AVX2-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
795 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
796 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
797 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
798 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
801 ; AVX512F-LABEL: splatvar_rotate_v32i8:
803 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
804 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
805 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
806 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
807 ; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
808 ; AVX512F-NEXT: vpsllw %xmm2, %ymm4, %ymm2
809 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
810 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
811 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
812 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
813 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
814 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
815 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
816 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
817 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
818 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
819 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
822 ; AVX512VL-LABEL: splatvar_rotate_v32i8:
824 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
825 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
826 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
827 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
828 ; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
829 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm2
830 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
831 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
832 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
833 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
834 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
835 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
836 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
837 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
838 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
839 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
840 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
841 ; AVX512VL-NEXT: retq
; With +avx512bw the expected code zero-extends the bytes to words (vpmovzxbw),
; shifts with vpsllvw / vpsrlvw, ORs the halves and truncates back with vpmovwb.
843 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
845 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
846 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
847 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
848 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
849 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
850 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
851 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
852 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
853 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
854 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
855 ; AVX512BW-NEXT: retq
857 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
858 ; AVX512VLBW: # %bb.0:
859 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
860 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
861 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
862 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
863 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
864 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
865 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
866 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
867 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
868 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
869 ; AVX512VLBW-NEXT: retq
; With +xop the expected code uses vprotb, once per 128-bit half of the input.
871 ; XOPAVX1-LABEL: splatvar_rotate_v32i8:
873 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
874 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
875 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
876 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
877 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
878 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
881 ; XOPAVX2-LABEL: splatvar_rotate_v32i8:
883 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
884 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
885 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
886 ; XOPAVX2-NEXT: vprotb %xmm3, %xmm2, %xmm2
887 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
888 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %b to all 32 lanes (all-zero shuffle mask).
890 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
; Complementary right-shift amount: 8 - splat, 8 being the i8 element width.
891 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
; Rotate idiom: (%a << splat) | (%a >> (8 - splat)).
892 %shl = shl <32 x i8> %a, %splat
893 %lshr = lshr <32 x i8> %a, %splat8
894 %or = or <32 x i8> %shl, %lshr
; Rotate-left of a <4 x i64> vector by a different constant amount in each lane,
; written as the shl / lshr / or idiom. The assertions below are autogenerated
; (see the NOTE on the first line of this file).
902 define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
903 ; AVX1-LABEL: constant_rotate_v4i64:
905 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
906 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
907 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
908 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
909 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
910 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4
911 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
912 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
913 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
914 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1
915 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
916 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
917 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
918 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
919 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
920 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
923 ; AVX2-LABEL: constant_rotate_v4i64:
925 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
926 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
927 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; In the +avx512f configurations the whole rotate is expected to become a single
; vprolvq; without +avx512vl it operates on the zmm register, hence the
; "kill" register annotations around it.
930 ; AVX512F-LABEL: constant_rotate_v4i64:
932 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
933 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
934 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
935 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
938 ; AVX512VL-LABEL: constant_rotate_v4i64:
940 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
941 ; AVX512VL-NEXT: retq
943 ; AVX512BW-LABEL: constant_rotate_v4i64:
945 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
946 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
947 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
948 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
949 ; AVX512BW-NEXT: retq
951 ; AVX512VLBW-LABEL: constant_rotate_v4i64:
952 ; AVX512VLBW: # %bb.0:
953 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
954 ; AVX512VLBW-NEXT: retq
; With +xop the expected code uses vprotq, once per 128-bit half of the input.
956 ; XOPAVX1-LABEL: constant_rotate_v4i64:
958 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
959 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
960 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
961 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
964 ; XOPAVX2-LABEL: constant_rotate_v4i64:
966 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
967 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
968 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
969 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; Per-lane shift amounts pair up to the i64 width: 4+60, 14+50, 50+14, 60+4 = 64.
971 %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
972 %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
973 %or = or <4 x i64> %shl, %lshr
; Rotate-left of a <8 x i32> vector by a different constant amount in each lane,
; written as the shl / lshr / or idiom. The assertions below are autogenerated
; (see the NOTE on the first line of this file).
977 define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; Here the expected code multiplies by powers of two (vpmuludq) and ORs the
; re-blended pieces of the products, one 128-bit half at a time.
978 ; AVX1-LABEL: constant_rotate_v8i32:
980 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
981 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
982 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
983 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
984 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
985 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
986 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
987 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
988 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
989 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
990 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
991 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
992 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
993 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
994 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
995 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
996 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
997 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
998 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
999 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1000 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1001 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1004 ; AVX2-LABEL: constant_rotate_v8i32:
1006 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1007 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1008 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; In the +avx512f configurations the whole rotate is expected to become a single
; vprolvd; without +avx512vl it operates on the zmm register.
1011 ; AVX512F-LABEL: constant_rotate_v8i32:
1013 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1014 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1015 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1016 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1017 ; AVX512F-NEXT: retq
1019 ; AVX512VL-LABEL: constant_rotate_v8i32:
1020 ; AVX512VL: # %bb.0:
1021 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1022 ; AVX512VL-NEXT: retq
1024 ; AVX512BW-LABEL: constant_rotate_v8i32:
1025 ; AVX512BW: # %bb.0:
1026 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1027 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1028 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1029 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1030 ; AVX512BW-NEXT: retq
1032 ; AVX512VLBW-LABEL: constant_rotate_v8i32:
1033 ; AVX512VLBW: # %bb.0:
1034 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1035 ; AVX512VLBW-NEXT: retq
; With +xop the expected code uses vprotd, once per 128-bit half of the input.
1037 ; XOPAVX1-LABEL: constant_rotate_v8i32:
1039 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1040 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1041 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1042 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1043 ; XOPAVX1-NEXT: retq
1045 ; XOPAVX2-LABEL: constant_rotate_v8i32:
1047 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1048 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1049 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1050 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1051 ; XOPAVX2-NEXT: retq
; Per-lane shift amounts pair up to the i32 width: 4+28, 5+27, ..., 11+21 = 32.
1052 %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1053 %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
1054 %or = or <8 x i32> %shl, %lshr
; Rotate-left of a <16 x i16> vector by a different constant amount in each
; lane, written as the shl / lshr / or idiom. The assertions below are
; autogenerated (see the NOTE on the first line of this file).
1058 define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; Here the expected code multiplies by powers of two: vpmullw yields the low
; half of each product, vpmulhuw the high half, and vpor merges the two.
1059 ; AVX1-LABEL: constant_rotate_v16i16:
1061 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1062 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1063 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1064 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1065 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1066 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1067 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1068 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1069 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1070 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1073 ; AVX2-LABEL: constant_rotate_v16i16:
1075 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1076 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1077 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1078 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1081 ; AVX512F-LABEL: constant_rotate_v16i16:
1083 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1084 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1085 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1086 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1087 ; AVX512F-NEXT: retq
1089 ; AVX512VL-LABEL: constant_rotate_v16i16:
1090 ; AVX512VL: # %bb.0:
1091 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1092 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1093 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1094 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1095 ; AVX512VL-NEXT: retq
; With +avx512bw the expected code uses the per-lane word shifts vpsllvw and
; vpsrlvw directly, with the same amount vectors as the IR below.
1097 ; AVX512BW-LABEL: constant_rotate_v16i16:
1098 ; AVX512BW: # %bb.0:
1099 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1100 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1101 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1102 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
1103 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1104 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
1105 ; AVX512BW-NEXT: retq
1107 ; AVX512VLBW-LABEL: constant_rotate_v16i16:
1108 ; AVX512VLBW: # %bb.0:
1109 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1110 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1111 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1112 ; AVX512VLBW-NEXT: retq
; With +xop the expected code uses vprotw, once per 128-bit half of the input.
1114 ; XOPAVX1-LABEL: constant_rotate_v16i16:
1116 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1117 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1118 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1119 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1120 ; XOPAVX1-NEXT: retq
1122 ; XOPAVX2-LABEL: constant_rotate_v16i16:
1124 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1125 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1126 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1127 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1128 ; XOPAVX2-NEXT: retq
; Per-lane shift amounts pair up to the i16 width: 0+16, 1+15, ..., 15+1 = 16.
; NOTE(review): lane 0 shifts right by 16, the full element width; the LLVM
; LangRef describes an lshr by an amount >= the bit width as poison. Changing the
; constants would change the generated code, so confirm this is intentional
; before touching it.
1129 %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1130 %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
1131 %or = or <16 x i16> %shl, %lshr
; Rotate-left of a <32 x i8> vector by a different constant amount in each lane,
; written as the shl / lshr / or idiom. The assertions below are autogenerated
; (see the NOTE on the first line of this file).
1135 define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; Here the expected code widens bytes to words (vpunpckhbw / vpmovzxbw),
; multiplies by powers of two (vpmullw) and packs the results back to bytes
; with vpackuswb.
1136 ; AVX1-LABEL: constant_rotate_v32i8:
1138 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1139 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1140 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1141 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
1142 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1143 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1144 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1145 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
1146 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
1147 ; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1148 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1149 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1150 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
1151 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
1152 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1153 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1154 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1155 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
1156 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
1157 ; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
1158 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1159 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1160 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1161 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1162 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1163 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
1164 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1165 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1166 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1167 ; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
1168 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1169 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
1170 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1171 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1172 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1173 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1176 ; AVX2-LABEL: constant_rotate_v32i8:
1178 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1179 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1180 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1181 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1182 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
1183 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1184 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1185 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1186 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1187 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1188 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1189 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1190 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1191 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1192 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1193 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1194 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1195 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1196 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1197 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1200 ; AVX512F-LABEL: constant_rotate_v32i8:
1202 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1203 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1204 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1205 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1206 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
1207 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1208 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1209 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1210 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1211 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1212 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1213 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1214 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1215 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1216 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1217 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1218 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1219 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1220 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1221 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
1222 ; AVX512F-NEXT: retq
1224 ; AVX512VL-LABEL: constant_rotate_v32i8:
1225 ; AVX512VL: # %bb.0:
1226 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1227 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1228 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1229 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1230 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
1231 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1232 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1233 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1234 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1235 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1236 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1237 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1238 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1239 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1240 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1241 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1242 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1243 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1244 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1245 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1246 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
1247 ; AVX512VL-NEXT: retq
; With +avx512bw the expected code zero-extends the bytes to words (vpmovzxbw),
; shifts with vpsllvw / vpsrlvw, ORs the halves and truncates back with vpmovwb.
1249 ; AVX512BW-LABEL: constant_rotate_v32i8:
1250 ; AVX512BW: # %bb.0:
1251 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1252 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
1253 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1254 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
1255 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1256 ; AVX512BW-NEXT: retq
1258 ; AVX512VLBW-LABEL: constant_rotate_v32i8:
1259 ; AVX512VLBW: # %bb.0:
1260 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1261 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
1262 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1263 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
1264 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1265 ; AVX512VLBW-NEXT: retq
; With +xop the expected code uses vprotb on each 128-bit half; its amount
; vector matches one 16-lane period of the shl amounts in the IR below.
1267 ; XOPAVX1-LABEL: constant_rotate_v32i8:
1269 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1270 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1271 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1272 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1273 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1274 ; XOPAVX1-NEXT: retq
1276 ; XOPAVX2-LABEL: constant_rotate_v32i8:
1278 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1279 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1280 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1281 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1282 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1283 ; XOPAVX2-NEXT: retq
; Per-lane shift amounts pair up to the i8 width (shl + lshr = 8 in every lane);
; the 16-lane amount pattern is repeated for the upper half of the vector.
; NOTE(review): lanes 8 and 24 shift left by 8 and lanes 0 and 16 shift right by
; 8, i.e. by the full element width; the LLVM LangRef describes such shifts as
; poison. Changing the constants would change the generated code, so confirm
; this is intentional before touching it.
1284 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1285 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1286 %or = or <32 x i8> %shl, %lshr
1291 ; Uniform Constant Rotates
; Splat-constant rotate of <4 x i64> written as the shl/lshr/or idiom: every
; lane of %a is shifted left by 14 and logically right by 50 (14 + 50 = 64).
; Per the expectations below, the AVX-512 runs select a single vprolq $14
; (on zmm0 with kill annotations when VL is not enabled), the XOP runs use
; vprotq $14 on each 128-bit half, and the AVX1/AVX2 runs keep the
; shift, shift, or sequence.
1294 define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
1295 ; AVX1-LABEL: splatconstant_rotate_v4i64:
1297 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
1298 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1299 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
1300 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1301 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
1302 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
1303 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1304 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
1307 ; AVX2-LABEL: splatconstant_rotate_v4i64:
1309 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
1310 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
1311 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1314 ; AVX512F-LABEL: splatconstant_rotate_v4i64:
1316 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1317 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1318 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1319 ; AVX512F-NEXT: retq
1321 ; AVX512VL-LABEL: splatconstant_rotate_v4i64:
1322 ; AVX512VL: # %bb.0:
1323 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1324 ; AVX512VL-NEXT: retq
1326 ; AVX512BW-LABEL: splatconstant_rotate_v4i64:
1327 ; AVX512BW: # %bb.0:
1328 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1329 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1330 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1331 ; AVX512BW-NEXT: retq
1333 ; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
1334 ; AVX512VLBW: # %bb.0:
1335 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1336 ; AVX512VLBW-NEXT: retq
1338 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
1340 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1341 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1342 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1343 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1344 ; XOPAVX1-NEXT: retq
1346 ; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
1348 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1349 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1350 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1351 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1352 ; XOPAVX2-NEXT: retq
1353 %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
1354 %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
1355 %or = or <4 x i64> %shl, %lshr
; Splat-constant rotate of <8 x i32> written as the shl/lshr/or idiom: every
; lane of %a is shifted left by 4 and logically right by 28 (4 + 28 = 32).
; Per the expectations below, the AVX-512 runs select vprold $4, the XOP runs
; use vprotd $4 on each 128-bit half, and the AVX1/AVX2 runs keep the
; shift, shift, or sequence.
1359 define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
1360 ; AVX1-LABEL: splatconstant_rotate_v8i32:
1362 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1363 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1364 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1365 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1366 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1367 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1368 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1369 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1372 ; AVX2-LABEL: splatconstant_rotate_v8i32:
1374 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1375 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1376 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1379 ; AVX512F-LABEL: splatconstant_rotate_v8i32:
1381 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1382 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1383 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1384 ; AVX512F-NEXT: retq
1386 ; AVX512VL-LABEL: splatconstant_rotate_v8i32:
1387 ; AVX512VL: # %bb.0:
1388 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1389 ; AVX512VL-NEXT: retq
1391 ; AVX512BW-LABEL: splatconstant_rotate_v8i32:
1392 ; AVX512BW: # %bb.0:
1393 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1394 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1395 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1396 ; AVX512BW-NEXT: retq
1398 ; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
1399 ; AVX512VLBW: # %bb.0:
1400 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1401 ; AVX512VLBW-NEXT: retq
1403 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
1405 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1406 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1407 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1408 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1409 ; XOPAVX1-NEXT: retq
1411 ; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
1413 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1414 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1415 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1416 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1417 ; XOPAVX2-NEXT: retq
1418 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1419 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
1420 %or = or <8 x i32> %shl, %lshr
; Splat-constant rotate of <16 x i16> written as the shl/lshr/or idiom: every
; lane of %a is shifted left by 7 and logically right by 9 (7 + 9 = 16).
; Per the expectations below, only the XOP runs use a rotate instruction
; (vprotw $7 on each 128-bit half); the AVX1, AVX2 and all AVX-512 runs keep
; the vpsrlw/vpsllw/vpor sequence, and the AVX-512 runs share one prefix.
1424 define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
1425 ; AVX1-LABEL: splatconstant_rotate_v16i16:
1427 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1428 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1429 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1430 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1431 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1432 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1433 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1434 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1437 ; AVX2-LABEL: splatconstant_rotate_v16i16:
1439 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1440 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1441 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1444 ; AVX512-LABEL: splatconstant_rotate_v16i16:
1446 ; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm1
1447 ; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
1448 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1451 ; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
1453 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1454 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1455 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1456 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1457 ; XOPAVX1-NEXT: retq
1459 ; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
1461 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1462 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1463 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1464 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1465 ; XOPAVX2-NEXT: retq
1466 %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1467 %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1468 %or = or <16 x i16> %shl, %lshr
; Splat-constant rotate of <32 x i8> written as the shl/lshr/or idiom: every
; lane of %a is shifted left by 4 and logically right by 4 (4 + 4 = 8).
; Per the expectations below, the non-XOP runs shift with the 16-bit
; vpsllw/vpsrlw $4 forms and then mask each shifted value before the OR
; (AVX1 materializes a splat of 240 and uses vpand/vpandn; the others AND
; with constant-pool operands). The XOP runs use vprotb $4 on each 128-bit
; half.
1472 define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
1473 ; AVX1-LABEL: splatconstant_rotate_v32i8:
1475 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1476 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1477 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1478 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1479 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1480 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1481 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1482 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1483 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1484 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1485 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1486 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1487 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1490 ; AVX2-LABEL: splatconstant_rotate_v32i8:
1492 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1493 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1494 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1495 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1496 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1499 ; AVX512F-LABEL: splatconstant_rotate_v32i8:
1501 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1502 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1503 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1504 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1505 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1506 ; AVX512F-NEXT: retq
1508 ; AVX512VL-LABEL: splatconstant_rotate_v32i8:
1509 ; AVX512VL: # %bb.0:
1510 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
1511 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1512 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1513 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1514 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1515 ; AVX512VL-NEXT: retq
1517 ; AVX512BW-LABEL: splatconstant_rotate_v32i8:
1518 ; AVX512BW: # %bb.0:
1519 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1520 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1521 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1522 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1523 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1524 ; AVX512BW-NEXT: retq
1526 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
1527 ; AVX512VLBW: # %bb.0:
1528 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1529 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1530 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1531 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1532 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1533 ; AVX512VLBW-NEXT: retq
1535 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
1537 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1538 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1539 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1540 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1541 ; XOPAVX1-NEXT: retq
1543 ; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
1545 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1546 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1547 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1548 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1549 ; XOPAVX2-NEXT: retq
1550 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1551 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1552 %or = or <32 x i8> %shl, %lshr
1557 ; Masked Uniform Constant Rotates
; Masked splat-constant rotate of <4 x i64>: %a is shifted left by 15 and
; logically right by 49 (15 + 49 = 64), each half is ANDed with its own
; per-lane constant vector, and the masked halves are OR'd.
; Every %lmask constant (33, 65, 129, 257) is below 2^15, while the left
; shift by 15 clears the low 15 bits, so %lmask is always zero. The AVX1/AVX2
; expectations reflect this: only vpsrlq $49 plus one AND remains. The
; AVX-512 and XOP expectations instead show a rotate by 15 (vprolq / vprotq)
; followed by one AND with a constant-pool operand.
1560 define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
1561 ; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
1563 ; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
1564 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1565 ; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
1566 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1567 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1570 ; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
1572 ; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
1573 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1576 ; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
1578 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1579 ; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
1580 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1581 ; AVX512F-NEXT: retq
1583 ; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
1584 ; AVX512VL: # %bb.0:
1585 ; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
1586 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1587 ; AVX512VL-NEXT: retq
1589 ; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
1590 ; AVX512BW: # %bb.0:
1591 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1592 ; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
1593 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1594 ; AVX512BW-NEXT: retq
1596 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
1597 ; AVX512VLBW: # %bb.0:
1598 ; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0
1599 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1600 ; AVX512VLBW-NEXT: retq
1602 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
1604 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
1605 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1606 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
1607 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1608 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1609 ; XOPAVX1-NEXT: retq
1611 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
1613 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
1614 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1615 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
1616 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1617 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1618 ; XOPAVX2-NEXT: retq
1619 %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
1620 %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
1621 %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
1622 %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
1623 %or = or <4 x i64> %lmask, %rmask
; Masked splat-constant rotate of <8 x i32>: %a is shifted left by 4 and
; logically right by 28 (4 + 28 = 32), each half is ANDed with its own
; non-uniform per-lane constant vector, and the masked halves are OR'd.
; Per the expectations below, every run forms the full rotate first
; (vprold $4, vprotd $4, or shift/shift/or on AVX1/AVX2) and then applies a
; single AND with a constant-pool operand.
1627 define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
1628 ; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
1630 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1631 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1632 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1633 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1634 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1635 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1636 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1637 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1638 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1641 ; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
1643 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1644 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1645 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1646 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1649 ; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
1651 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1652 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1653 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1654 ; AVX512F-NEXT: retq
1656 ; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
1657 ; AVX512VL: # %bb.0:
1658 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1659 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1660 ; AVX512VL-NEXT: retq
1662 ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
1663 ; AVX512BW: # %bb.0:
1664 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1665 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1666 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1667 ; AVX512BW-NEXT: retq
1669 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
1670 ; AVX512VLBW: # %bb.0:
1671 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1672 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1673 ; AVX512VLBW-NEXT: retq
1675 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
1677 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1678 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1679 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1680 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1681 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1682 ; XOPAVX1-NEXT: retq
1684 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
1686 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1687 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1688 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1689 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1690 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1691 ; XOPAVX2-NEXT: retq
1692 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1693 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
1694 %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
1695 %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
1696 %or = or <8 x i32> %lmask, %rmask
; Masked splat-constant rotate of <16 x i16>: %a is shifted left by 5 and
; logically right by 11 (5 + 11 = 16); the right-shifted half is ANDed with a
; splat of 55, the left-shifted half with a splat of 33, and the results are
; OR'd.
; Per the expectations below, the XOP runs use vprotw $5 on each 128-bit half
; followed by one AND; the AVX1, AVX2 and AVX-512 runs use shift, shift, or
; followed by one AND with a constant-pool operand.
1700 define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
1701 ; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
1703 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1704 ; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
1705 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
1706 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1707 ; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
1708 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
1709 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1710 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1711 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1714 ; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
1716 ; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
1717 ; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
1718 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1719 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1722 ; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
1724 ; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1
1725 ; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0
1726 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1727 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1730 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
1732 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
1733 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1734 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
1735 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1736 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1737 ; XOPAVX1-NEXT: retq
1739 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
1741 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
1742 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1743 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
1744 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1745 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1746 ; XOPAVX2-NEXT: retq
1747 %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
1748 %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
1749 %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
1750 %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
1751 %or = or <16 x i16> %lmask, %rmask
; Masked splat-constant rotate of <32 x i8>: %a is shifted left by 4 and
; logically right by 4 (4 + 4 = 8); the right-shifted half is ANDed with a
; splat of 55, the left-shifted half with a splat of 33, and the results are
; OR'd.
; Per the expectations below, the AVX1, AVX2, AVX512F and AVX512VL runs build
; the byte rotate from 16-bit shifts plus masks and then apply one further
; AND; the AVX512BW and AVX512VLBW runs need only two ANDs in total (one per
; shifted half) before the OR; the XOP runs use vprotb $4 on each 128-bit
; half followed by one AND.
1755 define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
1756 ; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
1758 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1759 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1760 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1761 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1762 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1763 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1764 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1765 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1766 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1767 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1768 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1769 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1770 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1771 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1774 ; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
1776 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1777 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1778 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1779 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1780 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1781 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1784 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
1786 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1787 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1788 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1789 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1790 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1791 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1792 ; AVX512F-NEXT: retq
1794 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
1795 ; AVX512VL: # %bb.0:
1796 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
1797 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1798 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1799 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1800 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1801 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1802 ; AVX512VL-NEXT: retq
1804 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
1805 ; AVX512BW: # %bb.0:
1806 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1807 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1808 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1809 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1810 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
1811 ; AVX512BW-NEXT: retq
1813 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
1814 ; AVX512VLBW: # %bb.0:
1815 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1816 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1817 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1818 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1819 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
1820 ; AVX512VLBW-NEXT: retq
1822 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
1824 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1825 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1826 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1827 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1828 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1829 ; XOPAVX1-NEXT: retq
1831 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
1833 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1834 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1835 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1836 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1837 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1838 ; XOPAVX2-NEXT: retq
1839 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1840 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1841 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1842 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1843 %or = or <32 x i8> %lmask, %rmask