1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
13 define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
14 ; AVX512-LABEL: var_rotate_v8i64:
16 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
18 %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
19 %shl = shl <8 x i64> %a, %b
20 %lshr = lshr <8 x i64> %a, %b64
21 %or = or <8 x i64> %shl, %lshr
25 define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
26 ; AVX512-LABEL: var_rotate_v16i32:
28 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
30 %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
31 %shl = shl <16 x i32> %a, %b
32 %lshr = lshr <16 x i32> %a, %b32
33 %or = or <16 x i32> %shl, %lshr
37 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
38 ; AVX512F-LABEL: var_rotate_v32i16:
40 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
41 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
42 ; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm4
43 ; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm2
44 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
45 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
46 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
47 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
48 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
49 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
50 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
51 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
52 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
53 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
54 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
55 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
56 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
57 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
58 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
59 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
60 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
61 ; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
64 ; AVX512VL-LABEL: var_rotate_v32i16:
66 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
67 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
68 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm2, %ymm4
69 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
70 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
71 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
72 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
73 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
74 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
75 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
76 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
77 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
78 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
79 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
80 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
81 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
82 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
83 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
84 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
85 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
86 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
87 ; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
90 ; AVX512BW-LABEL: var_rotate_v32i16:
92 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
93 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
94 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
95 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
96 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
99 ; AVX512VLBW-LABEL: var_rotate_v32i16:
100 ; AVX512VLBW: # %bb.0:
101 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
102 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
103 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
104 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
105 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
106 ; AVX512VLBW-NEXT: retq
108 ; AVX512VBMI2-LABEL: var_rotate_v32i16:
109 ; AVX512VBMI2: # %bb.0:
110 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
111 ; AVX512VBMI2-NEXT: retq
113 ; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
114 ; AVX512VLVBMI2: # %bb.0:
115 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
116 ; AVX512VLVBMI2-NEXT: retq
117 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
118 %shl = shl <32 x i16> %a, %b
119 %lshr = lshr <32 x i16> %a, %b16
120 %or = or <32 x i16> %shl, %lshr
124 define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
125 ; AVX512F-LABEL: var_rotate_v64i8:
127 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
128 ; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm2
129 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
130 ; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
131 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
132 ; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6
133 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
134 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
135 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
136 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
137 ; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8
138 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
139 ; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
140 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
141 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
142 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
143 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
144 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
145 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
146 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
147 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
148 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
149 ; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7
150 ; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
151 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
152 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
153 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7
154 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
155 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
156 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
157 ; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm4
158 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
159 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
160 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
161 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
162 ; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
163 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
164 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
165 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
166 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
167 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
168 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
169 ; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
170 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
171 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
172 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
173 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
174 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
175 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
176 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
177 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
178 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
179 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
180 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
181 ; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
182 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
183 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
184 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
185 ; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
188 ; AVX512VL-LABEL: var_rotate_v64i8:
190 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
191 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm3, %ymm2
192 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
193 ; AVX512VL-NEXT: vpsubb %ymm4, %ymm3, %ymm3
194 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
195 ; AVX512VL-NEXT: vpsllw $4, %ymm5, %ymm6
196 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
197 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
198 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
199 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
200 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8
201 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
202 ; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
203 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
204 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
205 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
206 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
207 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
208 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
209 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
210 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
211 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
212 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7
213 ; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7
214 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
215 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
216 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7
217 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
218 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
219 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
220 ; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm4
221 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
222 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
223 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
224 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
225 ; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
226 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
227 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
228 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
229 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
230 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
231 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
232 ; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
233 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
234 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
235 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
236 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
237 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
238 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
239 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
240 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
241 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
242 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
243 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
244 ; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
245 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
246 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
247 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
248 ; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
249 ; AVX512VL-NEXT: retq
251 ; AVX512BW-LABEL: var_rotate_v64i8:
253 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
254 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
255 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
256 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
257 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
258 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
259 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
260 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
261 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4
262 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
263 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
264 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
265 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
266 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
267 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
268 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
269 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
270 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
271 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
272 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
273 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
274 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
275 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
276 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
277 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
278 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
279 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
280 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
281 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
282 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
283 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
284 ; AVX512BW-NEXT: retq
286 ; AVX512VLBW-LABEL: var_rotate_v64i8:
287 ; AVX512VLBW: # %bb.0:
288 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
289 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
290 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
291 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
292 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
293 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
294 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
295 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
296 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4
297 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
298 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
299 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
300 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
301 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
302 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
303 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
304 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
305 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
306 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
307 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
308 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
309 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
310 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
311 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
312 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
313 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
314 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
315 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
316 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
317 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
318 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
319 ; AVX512VLBW-NEXT: retq
321 ; AVX512VBMI2-LABEL: var_rotate_v64i8:
322 ; AVX512VBMI2: # %bb.0:
323 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
324 ; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
325 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
326 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
327 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
328 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
329 ; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
330 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
331 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4
332 ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
333 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
334 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
335 ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
336 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
337 ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
338 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
339 ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
340 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
341 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
342 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
343 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
344 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
345 ; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
346 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
347 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
348 ; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
349 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
350 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
351 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
352 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
353 ; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
354 ; AVX512VBMI2-NEXT: retq
356 ; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
357 ; AVX512VLVBMI2: # %bb.0:
358 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
359 ; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
360 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
361 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
362 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
363 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
364 ; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
365 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
366 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4
367 ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
368 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
369 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
370 ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
371 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
372 ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
373 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
374 ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
375 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
376 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
377 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
378 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
379 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
380 ; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
381 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
382 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
383 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
384 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
385 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
386 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
387 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
388 ; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
389 ; AVX512VLVBMI2-NEXT: retq
390 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
391 %shl = shl <64 x i8> %a, %b
392 %lshr = lshr <64 x i8> %a, %b8
393 %or = or <64 x i8> %shl, %lshr
398 ; Uniform Variable Rotates
401 define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
402 ; AVX512-LABEL: splatvar_rotate_v8i64:
404 ; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
405 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
407 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
408 %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
409 %shl = shl <8 x i64> %a, %splat
410 %lshr = lshr <8 x i64> %a, %splat64
411 %or = or <8 x i64> %shl, %lshr
415 define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
416 ; AVX512-LABEL: splatvar_rotate_v16i32:
418 ; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
419 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
421 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
422 %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
423 %shl = shl <16 x i32> %a, %splat
424 %lshr = lshr <16 x i32> %a, %splat32
425 %or = or <16 x i32> %shl, %lshr
429 define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
430 ; AVX512F-LABEL: splatvar_rotate_v32i16:
432 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
433 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
434 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
435 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
436 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
437 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4
438 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
439 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
440 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
441 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
442 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
443 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
446 ; AVX512VL-LABEL: splatvar_rotate_v32i16:
448 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
449 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
450 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
451 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
452 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
453 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4
454 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
455 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
456 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
457 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
458 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
459 ; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
460 ; AVX512VL-NEXT: retq
462 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
464 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
465 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
466 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
467 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
468 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
469 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
470 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
471 ; AVX512BW-NEXT: retq
473 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
474 ; AVX512VLBW: # %bb.0:
475 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
476 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
477 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
478 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
479 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
480 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
481 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
482 ; AVX512VLBW-NEXT: retq
484 ; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
485 ; AVX512VBMI2: # %bb.0:
486 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
487 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
488 ; AVX512VBMI2-NEXT: retq
490 ; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
491 ; AVX512VLVBMI2: # %bb.0:
492 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
493 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
494 ; AVX512VLVBMI2-NEXT: retq
495 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
496 %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
497 %shl = shl <32 x i16> %a, %splat
498 %lshr = lshr <32 x i16> %a, %splat16
499 %or = or <32 x i16> %shl, %lshr
503 define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
504 ; AVX512F-LABEL: splatvar_rotate_v64i8:
506 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
507 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
508 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
509 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
510 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
511 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4
512 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm5
513 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
514 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
515 ; AVX512F-NEXT: vpsllw %xmm2, %xmm5, %xmm2
516 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
517 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
518 ; AVX512F-NEXT: vpandq %zmm2, %zmm4, %zmm2
519 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
520 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
521 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
522 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
523 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
524 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
525 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
526 ; AVX512F-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
529 ; AVX512VL-LABEL: splatvar_rotate_v64i8:
531 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
532 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
533 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
534 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
535 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
536 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4
537 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm5
538 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
539 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
540 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm5, %xmm2
541 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
542 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
543 ; AVX512VL-NEXT: vpandq %zmm2, %zmm4, %zmm2
544 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
545 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
546 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
547 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
548 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
549 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
550 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
551 ; AVX512VL-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
552 ; AVX512VL-NEXT: retq
554 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
556 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
557 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
558 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
559 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
560 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
561 ; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
562 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
563 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
564 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
565 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
566 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
567 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
568 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
569 ; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
570 ; AVX512BW-NEXT: retq
572 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
573 ; AVX512VLBW: # %bb.0:
574 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
575 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
576 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
577 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
578 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
579 ; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
580 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
581 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
582 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
583 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
584 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
585 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
586 ; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
587 ; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
588 ; AVX512VLBW-NEXT: retq
590 ; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
591 ; AVX512VBMI2: # %bb.0:
592 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
593 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
594 ; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
595 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
596 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
597 ; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
598 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
599 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
600 ; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
601 ; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
602 ; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
603 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
604 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
605 ; AVX512VBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
606 ; AVX512VBMI2-NEXT: retq
608 ; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
609 ; AVX512VLVBMI2: # %bb.0:
610 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
611 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
612 ; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
613 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
614 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
615 ; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
616 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
617 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
618 ; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
619 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
620 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
621 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
622 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
623 ; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
624 ; AVX512VLVBMI2-NEXT: retq
625 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
626 %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
627 %shl = shl <64 x i8> %a, %splat
628 %lshr = lshr <64 x i8> %a, %splat8
629 %or = or <64 x i8> %shl, %lshr
637 define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
638 ; AVX512-LABEL: constant_rotate_v8i64:
640 ; AVX512-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
642 %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
643 %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
644 %or = or <8 x i64> %shl, %lshr
648 define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
649 ; AVX512-LABEL: constant_rotate_v16i32:
651 ; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
653 %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
654 %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
655 %or = or <16 x i32> %shl, %lshr
659 define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
660 ; AVX512F-LABEL: constant_rotate_v32i16:
662 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
663 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
664 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
665 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm4
666 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
667 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
668 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
669 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
670 ; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
673 ; AVX512VL-LABEL: constant_rotate_v32i16:
675 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
676 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
677 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
678 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm4
679 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
680 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
681 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
682 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
683 ; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
684 ; AVX512VL-NEXT: retq
686 ; AVX512BW-LABEL: constant_rotate_v32i16:
688 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
689 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
690 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
691 ; AVX512BW-NEXT: retq
693 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
694 ; AVX512VLBW: # %bb.0:
695 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
696 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
697 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
698 ; AVX512VLBW-NEXT: retq
700 ; AVX512VBMI2-LABEL: constant_rotate_v32i16:
701 ; AVX512VBMI2: # %bb.0:
702 ; AVX512VBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
703 ; AVX512VBMI2-NEXT: retq
705 ; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
706 ; AVX512VLVBMI2: # %bb.0:
707 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
708 ; AVX512VLVBMI2-NEXT: retq
709 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
710 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
711 %or = or <32 x i16> %shl, %lshr
715 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
716 ; AVX512F-LABEL: constant_rotate_v64i8:
718 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
719 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
720 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
721 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
722 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
723 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
724 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
725 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
726 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
727 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
728 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
729 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
730 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
731 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
732 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
733 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
734 ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
735 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
736 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
737 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
738 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
739 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
740 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
741 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
742 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
743 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
744 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
745 ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
746 ; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
747 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
748 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
749 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
750 ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
751 ; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
752 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
753 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
754 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
755 ; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
756 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
757 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
758 ; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
759 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
760 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
761 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
762 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
765 ; AVX512VL-LABEL: constant_rotate_v64i8:
767 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
768 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
769 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
770 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
771 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
772 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
773 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
774 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
775 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
776 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
777 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
778 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
779 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
780 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
781 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
782 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
783 ; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
784 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
785 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
786 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
787 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
788 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
789 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
790 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
791 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
792 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
793 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
794 ; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
795 ; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
796 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
797 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
798 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
799 ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
800 ; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
801 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
802 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
803 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
804 ; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
805 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
806 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
807 ; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
808 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
809 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
810 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
811 ; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
812 ; AVX512VL-NEXT: retq
814 ; AVX512BW-LABEL: constant_rotate_v64i8:
816 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
817 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
818 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
819 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
820 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
821 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
822 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
823 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
824 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
825 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
826 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
827 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
828 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
829 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
830 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
831 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
832 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
833 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
834 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
835 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
836 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
837 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
838 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
839 ; AVX512BW-NEXT: retq
841 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
842 ; AVX512VLBW: # %bb.0:
843 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
844 ; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
845 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
846 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
847 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
848 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
849 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
850 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
851 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
852 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
853 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
854 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
855 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
856 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
857 ; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
858 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
859 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
860 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
861 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
862 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
863 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
864 ; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
865 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
866 ; AVX512VLBW-NEXT: retq
868 ; AVX512VBMI2-LABEL: constant_rotate_v64i8:
869 ; AVX512VBMI2: # %bb.0:
870 ; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
871 ; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
872 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
873 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
874 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
875 ; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
876 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
877 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
878 ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
879 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
880 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
881 ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
882 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
883 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
884 ; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
885 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
886 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
887 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
888 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
889 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
890 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
891 ; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
892 ; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
893 ; AVX512VBMI2-NEXT: retq
895 ; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
896 ; AVX512VLVBMI2: # %bb.0:
897 ; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
898 ; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
899 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
900 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
901 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
902 ; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
903 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
904 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
905 ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
906 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
907 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
908 ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
909 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
910 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
911 ; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
912 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
913 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
914 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
915 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
916 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
917 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
918 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
919 ; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
920 ; AVX512VLVBMI2-NEXT: retq
921 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
922 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
923 %or = or <64 x i8> %shl, %lshr
924 ret <64 x i8> %or
925 }
928 ; Uniform Constant Rotates
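; With AVX512, a rotate by a uniform (splat) constant amount, written as
; (shl %a, C) | (lshr %a, bits-C), is expected to fold to a single vprolq/vprold
; for i64/i32 elements; i16 uses vpshldw once VBMI2 is available, and i8 falls
; back to a shl/srl pair recombined with vpternlogq.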
931 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
932 ; AVX512-LABEL: splatconstant_rotate_v8i64:
933 ; AVX512: # %bb.0:
934 ; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
935 ; AVX512-NEXT: retq
936 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
937 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
938 %or = or <8 x i64> %shl, %lshr
939 ret <8 x i64> %or
940 }
942 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
943 ; AVX512-LABEL: splatconstant_rotate_v16i32:
944 ; AVX512: # %bb.0:
945 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
946 ; AVX512-NEXT: retq
947 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
948 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
949 %or = or <16 x i32> %shl, %lshr
950 ret <16 x i32> %or
951 }
953 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
954 ; AVX512F-LABEL: splatconstant_rotate_v32i16:
955 ; AVX512F: # %bb.0:
956 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm1
957 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
958 ; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm3
959 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
960 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0
961 ; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2
962 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
963 ; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
964 ; AVX512F-NEXT: retq
966 ; AVX512VL-LABEL: splatconstant_rotate_v32i16:
967 ; AVX512VL: # %bb.0:
968 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm1
969 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
970 ; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm3
971 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
972 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0
973 ; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2
974 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
975 ; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
976 ; AVX512VL-NEXT: retq
978 ; AVX512BW-LABEL: splatconstant_rotate_v32i16:
979 ; AVX512BW: # %bb.0:
980 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
981 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
982 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
983 ; AVX512BW-NEXT: retq
985 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
986 ; AVX512VLBW: # %bb.0:
987 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
988 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
989 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
990 ; AVX512VLBW-NEXT: retq
992 ; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
993 ; AVX512VBMI2: # %bb.0:
994 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
995 ; AVX512VBMI2-NEXT: retq
997 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
998 ; AVX512VLVBMI2: # %bb.0:
999 ; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
1000 ; AVX512VLVBMI2-NEXT: retq
1001 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1002 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1003 %or = or <32 x i16> %shl, %lshr
1004 ret <32 x i16> %or
1005 }
1007 define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
1008 ; AVX512F-LABEL: splatconstant_rotate_v64i8:
1009 ; AVX512F: # %bb.0:
1010 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1011 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1012 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
1013 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1014 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1015 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
1016 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1017 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1018 ; AVX512F-NEXT: retq
1020 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
1021 ; AVX512VL: # %bb.0:
1022 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1023 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1024 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
1025 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1026 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1027 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
1028 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1029 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1030 ; AVX512VL-NEXT: retq
1032 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
1033 ; AVX512BW: # %bb.0:
1034 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
1035 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
1036 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1037 ; AVX512BW-NEXT: retq
1039 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
1040 ; AVX512VLBW: # %bb.0:
1041 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
1042 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
1043 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1044 ; AVX512VLBW-NEXT: retq
1046 ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
1047 ; AVX512VBMI2: # %bb.0:
1048 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
1049 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
1050 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1051 ; AVX512VBMI2-NEXT: retq
1053 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
1054 ; AVX512VLVBMI2: # %bb.0:
1055 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
1056 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
1057 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1058 ; AVX512VLVBMI2-NEXT: retq
1059 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1060 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1061 %or = or <64 x i8> %shl, %lshr
1062 ret <64 x i8> %or
1063 }
1066 ; Masked Uniform Constant Rotates
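; The same splat-constant rotates, but each half is and-ed with a constant mask
; before the final or. The checks below expect the masking to fold into a single
; vpandq/vpandd after the rotate (i64/i32), or into the vpandq/vpternlogq
; sequence that merges the two shifted halves (i16/i8).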
1069 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
1070 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
1071 ; AVX512: # %bb.0:
1072 ; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
1073 ; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1074 ; AVX512-NEXT: retq
1075 %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
1076 %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
1077 %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
1078 %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
1079 %or = or <8 x i64> %lmask, %rmask
1080 ret <8 x i64> %or
1081 }
1083 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
1084 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
1085 ; AVX512: # %bb.0:
1086 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
1087 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1088 ; AVX512-NEXT: retq
1089 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1090 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
1091 %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
1092 %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
1093 %or = or <16 x i32> %lmask, %rmask
1094 ret <16 x i32> %or
1095 }
1097 define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
1098 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
1099 ; AVX512F: # %bb.0:
1100 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
1101 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1102 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
1103 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1104 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
1105 ; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
1106 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
1107 ; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1108 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1109 ; AVX512F-NEXT: retq
1111 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
1112 ; AVX512VL: # %bb.0:
1113 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
1114 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1115 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
1116 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1117 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
1118 ; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
1119 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
1120 ; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1121 ; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1122 ; AVX512VL-NEXT: retq
1124 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
1125 ; AVX512BW: # %bb.0:
1126 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
1127 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm2
1128 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1129 ; AVX512BW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1130 ; AVX512BW-NEXT: retq
1132 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
1133 ; AVX512VLBW: # %bb.0:
1134 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
1135 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm2
1136 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1137 ; AVX512VLBW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1138 ; AVX512VLBW-NEXT: retq
1140 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
1141 ; AVX512VBMI2: # %bb.0:
1142 ; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
1143 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1144 ; AVX512VBMI2-NEXT: retq
1146 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
1147 ; AVX512VLVBMI2: # %bb.0:
1148 ; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
1149 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1150 ; AVX512VLVBMI2-NEXT: retq
1151 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
1152 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
1153 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
1154 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
1155 %or = or <32 x i16> %lmask, %rmask
1156 ret <32 x i16> %or
1157 }
1159 define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
1160 ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
1161 ; AVX512F: # %bb.0:
1162 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1163 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1164 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
1165 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1166 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1167 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
1168 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
1169 ; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1170 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1171 ; AVX512F-NEXT: retq
1173 ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
1174 ; AVX512VL: # %bb.0:
1175 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1176 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1177 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
1178 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1179 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1180 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
1181 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
1182 ; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1183 ; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1184 ; AVX512VL-NEXT: retq
1186 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
1187 ; AVX512BW: # %bb.0:
1188 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
1189 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
1190 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1191 ; AVX512BW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1192 ; AVX512BW-NEXT: retq
1194 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
1195 ; AVX512VLBW: # %bb.0:
1196 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
1197 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
1198 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1199 ; AVX512VLBW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1200 ; AVX512VLBW-NEXT: retq
1202 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
1203 ; AVX512VBMI2: # %bb.0:
1204 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
1205 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
1206 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1207 ; AVX512VBMI2-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1208 ; AVX512VBMI2-NEXT: retq
1210 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
1211 ; AVX512VLVBMI2: # %bb.0:
1212 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
1213 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
1214 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
1215 ; AVX512VLVBMI2-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1216 ; AVX512VLVBMI2-NEXT: retq
1217 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1218 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1219 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1220 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1221 %or = or <64 x i8> %lmask, %rmask
1222 ret <64 x i8> %or
1223 }