; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
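; Overview (illustrative comment, not a FileCheck assertion): every test below
; builds a rotate-left out of the shift pair (shl %a, %amt) | (lshr %a, bits - %amt).
; Targets with a native rotate are expected to match the idiom directly (XOP:
; vprotq/vprotd/vprotw/vprotb; AVX512: vprolvq/vprolvd, or vpshldvw with VBMI2),
; while plain AVX/AVX2 lower it to the expanded shift-and-or sequences checked here.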

;
; Variable Rotates
;

define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v4i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v4i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
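; Scalar model of the idiom above, as a hedged C sketch (hypothetical helper, not
; part of the test):
;   uint64_t rotl64(uint64_t a, uint64_t b) { return (a << b) | (a >> (64 - b)); }
; For b == 0 the right-shift amount is 64, which is poison for an i64 lshr (and UB
; in C), so folding the pattern to a true rotate only refines that poison case into
; the well-defined rotate-by-zero result.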
define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v8i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v8i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
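; Note on the AVX1 v8i32 sequence above (illustrative): with no vpsllvd available,
; 2^amt is materialized through float exponent bits; (amt << 23) + 1065353216
; (0x3F800000, the encoding of 1.0f) reinterpreted as float equals 2.0^amt,
; vcvttps2dq converts it back to an integer power of two, and the
; vpmuludq/vpshufd/vpblendw dance multiplies even and odd lanes by it to realize
; the variable shift.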
define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
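; Note on the v16i16 AVX1 tail above (illustrative): multiplying by 2^amt yields
; both halves of the rotate at once; vpmullw keeps the low 16 bits of the product
; (the shl part) and vpmulhuw the high 16 bits (exactly the bits shifted out, i.e.
; the lshr part), so a single vpor of the two results is the rotate.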
define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

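; Note on the v32i8 lowering above (illustrative): x86 has no per-byte shifts, so
; AVX/AVX2 apply the rotate bit-serially; vpsllw $5 moves bit 2 of each amount byte
; into the byte's sign bit for vpblendvb to select a rotate-by-4 (built from the
; masked 4-bit shift pair), and two further vpaddb doublings of the mask select
; rotate-by-2 and rotate-by-1 in turn.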
;
; Uniform Variable Rotates
;

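; Illustrative summary: the splatvar tests below rotate every lane by the same
; runtime-variable amount (lane 0 of %b splatted via shufflevector). A uniform
; count lets AVX use the scalar-count shift forms (vpsllq/vpsrlq and friends with
; an xmm count) instead of per-lane variable shifts.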
define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v4i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <4 x i64> %a, %splat
  %lshr = lshr <4 x i64> %a, %splat64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v8i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <8 x i32> %a, %splat
  %lshr = lshr <8 x i32> %a, %splat32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <16 x i16> %a, %splat
  %lshr = lshr <16 x i16> %a, %splat16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Constant Rotates
;

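; Illustrative summary: with compile-time-constant per-lane amounts the shift
; vectors become constant-pool data, so AVX2 feeds vpsllvq/vpsrlvq (and
; vpsllvd/vpsrlvd) from %rip-relative LCPI constants, and AVX512VL folds the
; rotate amounts straight into vprolvq/vprolvd memory operands.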
;
; Constant Rotates
;

define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v4i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v4i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

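; Note on the lowering above: the shl and lshr amounts are complements that
; sum to 64 per lane (4+60, 14+50), which is what lets the AVX512 prefixes
; select vprolvq with the left-rotate amounts [4,14,50,60] directly, while
; plain AVX2 keeps the two variable shifts (vpsllvq/vpsrlvq) plus an OR.
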
define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v8i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v8i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

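; Note on the AVX1 lowering above: without per-lane variable 32-bit shifts,
; x << k is computed as a multiply by 2^k, and the 64-bit vpmuludq product
; x * 2^k holds x << k in its low dword and x >> (32-k) in its high dword,
; so the vpshufd/vpblendw shuffles and the final vpor merely gather the two
; halves back into one 32-bit lane. The multiplier vectors [16,32,64,128]
; and [256,512,1024,2048] are exactly 2^4..2^7 and 2^8..2^11, matching the
; rotate amounts <4,...,11>.
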
define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

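; Note on the lowering above: for i16 lanes a multiply by 2^n yields both
; halves of the rotate at once: vpmullw keeps the low 16 bits of x * 2^n
; (x << n) and vpmulhuw keeps the high 16 bits (x >> (16-n)), so ORing the
; two products is exactly rotl(x, n). AVX512BW uses the variable-shift pair
; vpsllvw/vpsrlvw instead, and VBMI2 folds the whole pattern into vpshldvw.
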
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
; AVX1-NEXT: vpmullw %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

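; Note on the byte case above: x86 has no 8-bit shift instructions, so the
; left half of the rotate is built either from masked word multiplies (AVX1)
; or from the vpsllw $4/$2/$1 ladder selected by vpblendvb (AVX2/AVX512F/VL),
; while the right half zero-extends each byte to a word, multiplies by
; 2^(8-r), and keeps the high byte via vpsrlw $8. XOP's vprotb consumes the
; rotate amounts directly, reduced mod 8: [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1].
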
;
; Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v4i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

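; The uniform-constant case above collapses to one instruction wherever the
; ISA has an immediate rotate: vprolq $14 on AVX512, vprotq $14 on XOP. The
; same rotate can also be written with the generic funnel-shift intrinsic;
; a minimal sketch (hypothetical helper, not covered by any CHECK prefix in
; this file):

declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)

define <4 x i64> @rotl_via_fshl_sketch(<4 x i64> %a) nounwind {
  ; fshl(x, x, n) shifts the concatenation x:x left by n and keeps the top
  ; half, which is exactly rotl(x, n).
  %r = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
  ret <4 x i64> %r
}
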
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v8i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

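; Note on the VBMI2 lowering above: vpshldw $7 with both sources equal is a
; word rotate, since shifting the 32-bit concatenation of a lane with itself
; left by 7 and keeping the top 16 bits is rotl16(x, 7). The other prefixes
; keep the explicit vpsllw $7 / vpsrlw $9 / vpor triple (7 + 9 = 16).
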
define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

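; Note on the vpternlogq $216 used by the AVX512VL paths above: immediate
; 0xD8 encodes the bitwise select C ? B : A, so with B = the vpsllw result,
; A = the vpsrlw result and C = a 0xF0 byte mask it computes
; (shl & 0xF0) | (lshr & 0x0F) in one instruction, replacing the two vpands
; and the vpor of the other prefixes. (Operand roles inferred from the
; ternary-logic truth table; the memory constant itself is elided by the
; regex above.)
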
;
; Masked Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

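; Note on the masked rotate above: every element of %lmask (33, 65, 129,
; 257) only has bits below bit 15 set, and a shift left by 15 clears all of
; those bits, so %lmask & %shl is provably zero and the function reduces to
; (%a >> 49) & %rmask. That is why the AVX1/AVX2 code is just a right shift
; plus one AND, while the AVX512/XOP paths keep the full rotate and apply
; the surviving combined mask afterwards.
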
define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $5, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}
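
; Note on the vpternlogq $248 in the AVX512VLBW/VLVBMI2 paths above:
; immediate 0xF8 encodes A | (B & C), so with A = (shl & lmask) already in
; %ymm0, B = the vpsrlw result and C = the right-hand mask it forms
; (shl & lmask) | (lshr & rmask) in a single instruction. (Roles inferred
; from the truth table; the mask constants are elided by the regexes above.)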