; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
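
; Each function below expresses a rotate as (shl %a, %amt) | (lshr %a, bits - %amt)
; and the assertions record how that pattern is lowered per target: XOP uses
; vprotq/vprotd/vprotw/vprotb, AVX512 (with the appropriate VL/BW/VBMI2 subsets)
; uses vprolvq/vprolvd or vpshldvw, and plain AVX/AVX2 fall back to shift+or
; sequences.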
;
; Variable Rotates
;

define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX2-LABEL: var_rotate_v4i64:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512NOVLX-LABEL: var_rotate_v4i64:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: var_rotate_v4i64:
; AVX512VLX-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: var_rotate_v8i32:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512NOVLX-LABEL: var_rotate_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: var_rotate_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: var_rotate_v16i16:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512F-LABEL: var_rotate_v16i16:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-LABEL: var_rotate_v16i16:
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: var_rotate_v16i16:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: var_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: var_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: var_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: var_rotate_v32i8:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-LABEL: var_rotate_v32i8:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-LABEL: var_rotate_v32i8:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: var_rotate_v32i8:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: var_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: var_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: var_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
;
; Uniform Variable Rotates
;
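; These tests rotate every element by the same run-time (splatted) amount, so
; the checks expect either a broadcast feeding vprolv*/vprot*, or the
; scalar-count shift forms (e.g. vpsllq/vpsrlq with an xmm count) on AVX/AVX2.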
define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512NOVLX-LABEL: splatvar_rotate_v4i64:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatvar_rotate_v4i64:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512VLX-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <4 x i64> %a, %splat
  %lshr = lshr <4 x i64> %a, %splat64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v8i32:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX2-LABEL: splatvar_rotate_v8i32:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX512NOVLX-LABEL: splatvar_rotate_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatvar_rotate_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VLX-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <8 x i32> %a, %splat
  %lshr = lshr <8 x i32> %a, %splat32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2
; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: splatvar_rotate_v16i16:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-LABEL: splatvar_rotate_v16i16:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-LABEL: splatvar_rotate_v16i16:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatvar_rotate_v16i16:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: splatvar_rotate_v16i16:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <16 x i16> %a, %splat
  %lshr = lshr <16 x i16> %a, %splat16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v32i8:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: splatvar_rotate_v32i8:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512-LABEL: splatvar_rotate_v32i8:
; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; XOPAVX1-LABEL: splatvar_rotate_v32i8:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
;
; Constant Rotates
;

define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3
; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX2-LABEL: constant_rotate_v4i64:
; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512NOVLX-LABEL: constant_rotate_v4i64:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: constant_rotate_v4i64:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-LABEL: constant_rotate_v4i64:
; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512NOVLX-LABEL: constant_rotate_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: constant_rotate_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-LABEL: constant_rotate_v16i16:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-LABEL: constant_rotate_v16i16:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: constant_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: constant_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: constant_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: constant_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: constant_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: constant_rotate_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
;
; Uniform Constant Rotates
;
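; With a uniform immediate amount, the expected lowering is a single
; vprolq/vprold on AVX512, vprot* with an immediate on XOP, and an
; immediate shift pair plus vpor elsewhere.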
define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512NOVLX-LABEL: splatconstant_rotate_v4i64:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatconstant_rotate_v4i64:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512NOVLX-LABEL: splatconstant_rotate_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatconstant_rotate_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512F-LABEL: splatconstant_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
; AVX512VL-LABEL: splatconstant_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512NOVLX-LABEL: splatconstant_rotate_v32i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatconstant_rotate_v32i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

; Masked Uniform Constant Rotates
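; Summary of the checks below: these repeat the splat-constant rotates with a
; constant mask applied to the shifted halves, so the expected code is the
; rotate lowering from above followed by an and with a constant-pool operand
; (the AVX512VL v16i16 case folds the or+and into a single vpternlogd). For
; v4i64 the left-half mask has no bits in common with the shifted value, so
; only the masked right shift survives.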
define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $5, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT: retq
; AVX512VLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}