1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
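;
; The tests below express a rotate as (shl %a, %b) | (lshr %a, bits-%b).
; As a rough illustrative sketch only (not part of the autogenerated checks,
; and with a made-up function name), the same rotate-left can also be written
; directly with the generic funnel-shift intrinsic:

declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @example_rotl_v8i64_fshl(<8 x i64> %a, <8 x i64> %b) nounwind {
  ; fshl(a, a, b) is a rotate-left of %a by %b (amount taken modulo 64).
  %rot = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> %b)
  ret <8 x i64> %rot
}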
11 define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
12 ; AVX512-LABEL: var_rotate_v8i64:
14 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
16 %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
17 %shl = shl <8 x i64> %a, %b
18 %lshr = lshr <8 x i64> %a, %b64
19 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
23 define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
24 ; AVX512-LABEL: var_rotate_v16i32:
26 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
28 %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
29 %shl = shl <16 x i32> %a, %b
30 %lshr = lshr <16 x i32> %a, %b32
31 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
35 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
36 ; AVX512F-LABEL: var_rotate_v32i16:
38 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
39 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
40 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
41 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
42 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
43 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
44 ; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
45 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
46 ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
47 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
48 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
49 ; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
50 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
51 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
52 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
53 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
54 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
55 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
56 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
57 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
58 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
59 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
60 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
63 ; AVX512VL-LABEL: var_rotate_v32i16:
65 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
66 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
67 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
68 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
69 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
70 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
71 ; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
72 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
73 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
74 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
75 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
76 ; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
77 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
78 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
79 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
80 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
81 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
82 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
83 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
84 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
85 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
86 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
87 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
90 ; AVX512BW-LABEL: var_rotate_v32i16:
92 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
93 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
94 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
95 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
96 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
99 ; AVX512VLBW-LABEL: var_rotate_v32i16:
100 ; AVX512VLBW: # %bb.0:
101 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
102 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
103 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
104 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
105 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
106 ; AVX512VLBW-NEXT: retq
107 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
108 %shl = shl <32 x i16> %a, %b
109 %lshr = lshr <32 x i16> %a, %b16
110 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
114 define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
115 ; AVX512F-LABEL: var_rotate_v64i8:
117 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
118 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
119 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
121 ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
122 ; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
123 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
124 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
125 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
126 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
127 ; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
128 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
129 ; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
130 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
131 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
132 ; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
133 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
134 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
135 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
136 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
137 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
138 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
139 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
140 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
141 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
142 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
143 ; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
144 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
145 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
146 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
147 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
148 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
149 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
150 ; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
151 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
152 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
153 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
154 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
155 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
156 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
157 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
158 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
159 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
160 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
161 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
162 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
165 ; AVX512VL-LABEL: var_rotate_v64i8:
167 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
168 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
169 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
170 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
171 ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
172 ; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
173 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
174 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
175 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
176 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
177 ; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
178 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
179 ; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4
180 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
181 ; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7
182 ; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4
183 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
184 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
185 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
186 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
187 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
188 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
189 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
190 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
191 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
192 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
193 ; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3
194 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
195 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
196 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
197 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
198 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
199 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
200 ; AVX512VL-NEXT: vpandn %ymm3, %ymm6, %ymm3
201 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
202 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
203 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
204 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
205 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
206 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
207 ; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
208 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
209 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
210 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
211 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
212 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
213 ; AVX512VL-NEXT: retq
215 ; AVX512BW-LABEL: var_rotate_v64i8:
217 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
218 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
219 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
220 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
221 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
222 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
223 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
224 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
225 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
226 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
227 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
228 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
229 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
230 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
231 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
232 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
233 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
234 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
235 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
236 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
237 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
238 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
239 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
240 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
241 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
242 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
243 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
244 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
245 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
246 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
247 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
248 ; AVX512BW-NEXT: retq
250 ; AVX512VLBW-LABEL: var_rotate_v64i8:
251 ; AVX512VLBW: # %bb.0:
252 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
253 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
254 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
255 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
256 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
257 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
258 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
259 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
260 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
261 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
262 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
263 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
264 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
265 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
266 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
267 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
268 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
269 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
270 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
271 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
272 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
273 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
274 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
275 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
276 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
277 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
278 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
279 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
280 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
281 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
282 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
283 ; AVX512VLBW-NEXT: retq
284 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
285 %shl = shl <64 x i8> %a, %b
286 %lshr = lshr <64 x i8> %a, %b8
287 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
292 ; Uniform Variable Rotates
295 define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
296 ; AVX512-LABEL: splatvar_rotate_v8i64:
298 ; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
299 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
301 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
302 %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
303 %shl = shl <8 x i64> %a, %splat
304 %lshr = lshr <8 x i64> %a, %splat64
305 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
309 define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
310 ; AVX512-LABEL: splatvar_rotate_v16i32:
312 ; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
313 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
315 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
316 %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
317 %shl = shl <16 x i32> %a, %splat
318 %lshr = lshr <16 x i32> %a, %splat32
319 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
323 define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
324 ; AVX512F-LABEL: splatvar_rotate_v32i16:
326 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
327 ; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
328 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
329 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
330 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
331 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
332 ; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
333 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
334 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
335 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
336 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
337 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
338 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
339 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
342 ; AVX512VL-LABEL: splatvar_rotate_v32i16:
344 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
345 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
346 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
347 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
348 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
349 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
350 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
351 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
352 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
353 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
354 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
355 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
356 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
357 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
358 ; AVX512VL-NEXT: retq
360 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
362 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
363 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
364 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
365 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
366 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
367 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
368 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
369 ; AVX512BW-NEXT: retq
371 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
372 ; AVX512VLBW: # %bb.0:
373 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
374 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
375 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
376 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
377 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
378 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
379 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
380 ; AVX512VLBW-NEXT: retq
381 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
382 %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
383 %shl = shl <32 x i16> %a, %splat
384 %lshr = lshr <32 x i16> %a, %splat16
385 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
389 define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
390 ; AVX512F-LABEL: splatvar_rotate_v64i8:
392 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
393 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
394 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
395 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
396 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
397 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
398 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
399 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
400 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
401 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
402 ; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
403 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
404 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
405 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
406 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
407 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
408 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
409 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
410 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
411 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
412 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
413 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
414 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
415 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
418 ; AVX512VL-LABEL: splatvar_rotate_v64i8:
420 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
421 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
422 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
423 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
424 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
425 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
426 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
427 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
428 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
429 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
430 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
431 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
432 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
433 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
434 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
435 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
436 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
437 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
438 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
439 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
440 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
441 ; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
442 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
443 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
444 ; AVX512VL-NEXT: retq
446 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
448 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
449 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
450 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
451 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
452 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
453 ; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
454 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
455 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
456 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
457 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
458 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
459 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
460 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
461 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
462 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
463 ; AVX512BW-NEXT: retq
465 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
466 ; AVX512VLBW: # %bb.0:
467 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
468 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
469 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
470 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
471 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
472 ; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
473 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
474 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
475 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
476 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
477 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
478 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
479 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
480 ; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
481 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
482 ; AVX512VLBW-NEXT: retq
483 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
484 %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
485 %shl = shl <64 x i8> %a, %splat
486 %lshr = lshr <64 x i8> %a, %splat8
487 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
495 define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
496 ; AVX512-LABEL: constant_rotate_v8i64:
498 ; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
500 %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
501 %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
502 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
506 define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
507 ; AVX512-LABEL: constant_rotate_v16i32:
509 ; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
511 %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
512 %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
513 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
517 define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
518 ; AVX512F-LABEL: constant_rotate_v32i16:
520 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
521 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
522 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
523 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
524 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
525 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
526 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
527 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
528 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
531 ; AVX512VL-LABEL: constant_rotate_v32i16:
533 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
534 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
535 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
536 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
537 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
538 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
539 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
540 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
541 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
542 ; AVX512VL-NEXT: retq
544 ; AVX512BW-LABEL: constant_rotate_v32i16:
546 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
547 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
548 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
549 ; AVX512BW-NEXT: retq
551 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
552 ; AVX512VLBW: # %bb.0:
553 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
554 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
555 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
556 ; AVX512VLBW-NEXT: retq
557 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
558 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
559 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
563 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
564 ; AVX512F-LABEL: constant_rotate_v64i8:
566 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
567 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
568 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
569 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
570 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
571 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
572 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
573 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
574 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
575 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
576 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
577 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
578 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
579 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
580 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
581 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
582 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
583 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
584 ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
585 ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
586 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
587 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
588 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
589 ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
590 ; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
591 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
592 ; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
593 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
594 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
595 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
596 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
597 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
598 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
599 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
600 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
601 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
602 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
603 ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
604 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
605 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
606 ; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
607 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
608 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
609 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
610 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
613 ; AVX512VL-LABEL: constant_rotate_v64i8:
615 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
616 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
617 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
618 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
619 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
620 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
621 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
622 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
623 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
624 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
625 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
626 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
627 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
628 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
629 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
630 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
631 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
632 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
633 ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
634 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
635 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
636 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
637 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
638 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
639 ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
640 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
641 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
642 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
643 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
644 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
645 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
646 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
647 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
648 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
649 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
650 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
651 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
652 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
653 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
654 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
655 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
656 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
657 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
658 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
659 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
660 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
661 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
662 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
663 ; AVX512VL-NEXT: retq
665 ; AVX512BW-LABEL: constant_rotate_v64i8:
667 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
668 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
669 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
670 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
671 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
672 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
673 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
674 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
675 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
676 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
677 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
678 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
679 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
680 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
681 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
682 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
683 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
684 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
685 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
686 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
687 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
688 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
689 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
690 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
691 ; AVX512BW-NEXT: retq
693 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
694 ; AVX512VLBW: # %bb.0:
695 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
696 ; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
697 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
698 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
699 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
700 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
701 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
702 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
703 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
704 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
705 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
706 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
707 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
708 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
709 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
710 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
711 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
712 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
713 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
714 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
715 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
716 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
717 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
718 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
719 ; AVX512VLBW-NEXT: retq
720 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
721 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
722 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
727 ; Uniform Constant Rotates
730 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
731 ; AVX512-LABEL: splatconstant_rotate_v8i64:
733 ; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
735 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
736 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
737 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
741 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
742 ; AVX512-LABEL: splatconstant_rotate_v16i32:
744 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
746 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
747 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
748 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
752 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
753 ; AVX512F-LABEL: splatconstant_rotate_v32i16:
755 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
756 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
757 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
758 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
759 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
760 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
761 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
762 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
765 ; AVX512VL-LABEL: splatconstant_rotate_v32i16:
767 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
768 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
769 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
770 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
771 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
772 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
773 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
774 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
775 ; AVX512VL-NEXT: retq
777 ; AVX512BW-LABEL: splatconstant_rotate_v32i16:
779 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
780 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
781 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
782 ; AVX512BW-NEXT: retq
784 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
785 ; AVX512VLBW: # %bb.0:
786 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
787 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
788 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
789 ; AVX512VLBW-NEXT: retq
790 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
791 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
792 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
796 define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
797 ; AVX512F-LABEL: splatconstant_rotate_v64i8:
799 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
800 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
801 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
802 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
803 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
804 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
805 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
806 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
807 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
808 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
809 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
810 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
811 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
814 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
816 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
817 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
818 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
819 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
820 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
821 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
822 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
823 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
824 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
825 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
826 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
827 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
828 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
829 ; AVX512VL-NEXT: retq
831 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
833 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
834 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
835 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
836 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
837 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
838 ; AVX512BW-NEXT: retq
840 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
841 ; AVX512VLBW: # %bb.0:
842 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
843 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
844 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
845 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
846 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
847 ; AVX512VLBW-NEXT: retq
848 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
849 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
850 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
855 ; Masked Uniform Constant Rotates
858 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
859 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
861 ; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
862 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
864 %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
865 %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
866 %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
867 %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
868 %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}
872 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
873 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
875 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
876 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
878 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
879 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
880 %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
881 %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
882 %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}
886 define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
887 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
889 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
890 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
891 ; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
892 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
893 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
894 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
895 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
896 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
897 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
898 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
899 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
902 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
904 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
905 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
906 ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
907 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
908 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
909 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
910 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
911 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
912 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
913 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
914 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
915 ; AVX512VL-NEXT: retq
917 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
919 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
920 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
921 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
922 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
923 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
924 ; AVX512BW-NEXT: retq
926 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
927 ; AVX512VLBW: # %bb.0:
928 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
929 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
930 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
931 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
932 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
933 ; AVX512VLBW-NEXT: retq
934 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
935 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
936 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
937 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
938 %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}
942 define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
943 ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
945 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
946 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
947 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
948 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
949 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
950 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
951 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
952 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
953 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
954 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
955 ; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
956 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
957 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
958 ; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
959 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
960 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
963 ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
965 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
966 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
967 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
968 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
969 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
970 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
971 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
972 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
973 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
974 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
975 ; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4
976 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
977 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
978 ; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
979 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
980 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
981 ; AVX512VL-NEXT: retq
983 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
985 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
986 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
987 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
988 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
989 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
990 ; AVX512BW-NEXT: retq
992 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
993 ; AVX512VLBW: # %bb.0:
994 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
995 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
996 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
997 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
998 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
999 ; AVX512VLBW-NEXT: retq
1000 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1001 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1002 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1003 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1004 %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}