; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
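;
; These tests exercise the rotate idiom "shl %a, %amt | lshr %a, (bits - %amt)".
; The i32/i64 element cases should select the native AVX-512 rotate instructions
; (vprolvd/vprolvq, or vprold/vprolq for constant amounts); i16/i8 elements have
; no vector rotate instruction and are expanded into shift/blend sequences.
;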
define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <8 x i64> %a, %b
  %lshr = lshr <8 x i64> %a, %b64
  %or = or <8 x i64> %shl, %lshr
define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <16 x i32> %a, %b
  %lshr = lshr <16 x i32> %a, %b32
  %or = or <16 x i32> %shl, %lshr
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-LABEL: var_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <32 x i16> %a, %b
  %lshr = lshr <32 x i16> %a, %b16
  %or = or <32 x i16> %shl, %lshr
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm7
; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm7
; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7
; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm8
; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: var_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
  %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <64 x i8> %a, %b
  %lshr = lshr <64 x i8> %a, %b8
  %or = or <64 x i8> %shl, %lshr
; Uniform Variable Rotates
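; The rotate amount is splatted from element 0 of %b, so every lane rotates by
; the same runtime-variable amount.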
define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v8i64:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <8 x i64> %a, %splat
  %lshr = lshr <8 x i64> %a, %splat64
  %or = or <8 x i64> %shl, %lshr
define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v16i32:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <16 x i32> %a, %splat
  %lshr = lshr <16 x i32> %a, %splat32
  %or = or <16 x i32> %shl, %lshr
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatvar_rotate_v32i16:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <32 x i16> %a, %splat
  %lshr = lshr <32 x i16> %a, %splat16
  %or = or <32 x i16> %shl, %lshr
define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatvar_rotate_v64i8:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2
; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <64 x i8> %a, %splat
  %lshr = lshr <64 x i8> %a, %splat8
  %or = or <64 x i8> %shl, %lshr
define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: constant_rotate_v8i64:
; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
  %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
  %or = or <8 x i64> %shl, %lshr
define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: constant_rotate_v16i32:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
  %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <16 x i32> %shl, %lshr
define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: constant_rotate_v32i16:
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: constant_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <32 x i16> %shl, %lshr
define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: constant_rotate_v64i8:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: constant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <64 x i8> %shl, %lshr
; Uniform Constant Rotates
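; Every lane rotates by the same compile-time constant, so the i32/i64 cases
; should fold to a single vprold/vprolq with an immediate operand.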
define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v8i64:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
  %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
  %or = or <8 x i64> %shl, %lshr
define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v16i32:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <16 x i32> %shl, %lshr
define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <32 x i16> %shl, %lshr
define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <64 x i8> %shl, %lshr
; Masked Uniform Constant Rotates
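; As above, but each half of the rotate is masked with a constant before the
; final or; the i32/i64 cases should still select a rotate followed by a mask.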
define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
  %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
  %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
  %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
  %or = or <8 x i64> %lmask, %rmask
define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <16 x i32> %lmask, %rmask
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <32 x i16> %lmask, %rmask
define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <64 x i8> %lmask, %rmask