; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
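;
; These tests cover lowering of 512-bit vector rotates written as the generic
; IR idiom rot(x, n) = (x shl n) | (x lshr (bitwidth - n)). The RUN lines
; above check plain AVX512F as well as AVX512F combined with VL, BW and
; VBMI2, since the available instructions differ per element width.
;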
define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <8 x i64> %a, %b
  %lshr = lshr <8 x i64> %a, %b64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <16 x i32> %a, %b
  %lshr = lshr <16 x i32> %a, %b32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
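;
; i32 and i64 elements have native AVX512F variable rotates, so the two
; functions above collapse to a single vprolvd/vprolvq for every RUN
; configuration. There is no byte or word rotate instruction, so the i16 and
; i8 tests that follow exercise the different expansion strategies instead.
;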
37 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
38 ; AVX512F-LABEL: var_rotate_v32i16:
40 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
41 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
42 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
43 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
44 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
45 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
46 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
47 ; AVX512F-NEXT: vpsllvd %ymm5, %ymm7, %ymm5
48 ; AVX512F-NEXT: vpsrld $16, %ymm5, %ymm5
49 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
50 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
51 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm6, %ymm2
52 ; AVX512F-NEXT: vpsrld $16, %ymm2, %ymm2
53 ; AVX512F-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
54 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
55 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
56 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
57 ; AVX512F-NEXT: vpsllvd %ymm3, %ymm5, %ymm3
58 ; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3
59 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
60 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
61 ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
62 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
63 ; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
64 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
67 ; AVX512VL-LABEL: var_rotate_v32i16:
69 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
71 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
72 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
73 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
74 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm6
75 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
76 ; AVX512VL-NEXT: vpsllvd %ymm5, %ymm7, %ymm5
77 ; AVX512VL-NEXT: vpsrld $16, %ymm5, %ymm5
78 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
79 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
80 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm6, %ymm2
81 ; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2
82 ; AVX512VL-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
83 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
84 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
85 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
86 ; AVX512VL-NEXT: vpsllvd %ymm3, %ymm5, %ymm3
87 ; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3
88 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
89 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
90 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
91 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
92 ; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
93 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
96 ; AVX512BW-LABEL: var_rotate_v32i16:
98 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
99 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
100 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
101 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
102 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
103 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
104 ; AVX512BW-NEXT: retq
106 ; AVX512VLBW-LABEL: var_rotate_v32i16:
107 ; AVX512VLBW: # %bb.0:
108 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
109 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
110 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
111 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
112 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
113 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
114 ; AVX512VLBW-NEXT: retq
116 ; AVX512VBMI2-LABEL: var_rotate_v32i16:
117 ; AVX512VBMI2: # %bb.0:
118 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
119 ; AVX512VBMI2-NEXT: retq
121 ; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
122 ; AVX512VLVBMI2: # %bb.0:
123 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
124 ; AVX512VLVBMI2-NEXT: retq
125 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
126 %shl = shl <32 x i16> %a, %b
127 %lshr = lshr <32 x i16> %a, %b16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
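;
; Summary of the v32i16 variable-rotate strategies checked above: without
; AVX512BW each word is duplicated into a dword (vpunpcklwd/vpunpckhwd),
; shifted left by the zero-extended amount with vpsllvd, and the rotated word
; is read back from the upper half with vpsrld $16 and vpackusdw. With
; AVX512BW the native per-word shifts vpsllvw/vpsrlvw are used, and with
; AVX512VBMI2 the whole rotate becomes a single vpshldvw funnel shift.
;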
132 define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
133 ; AVX512F-LABEL: var_rotate_v64i8:
135 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
136 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
137 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
138 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
139 ; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
140 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
141 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
142 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
143 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4
144 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6
145 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
146 ; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6
147 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
148 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
149 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
150 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
151 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
152 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
153 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
154 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
155 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
156 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
157 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
158 ; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
159 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
160 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
161 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
162 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
163 ; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4
164 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
165 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
166 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
167 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
168 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
169 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
170 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
171 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
172 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
175 ; AVX512VL-LABEL: var_rotate_v64i8:
177 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
178 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
179 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
180 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
181 ; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
182 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
183 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
184 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
185 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4
186 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6
187 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
188 ; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6
189 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
190 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
191 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
192 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
194 ; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
195 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
196 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
197 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
198 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
199 ; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
200 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
201 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
202 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
203 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
204 ; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4
205 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
206 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
207 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
208 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
209 ; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm3, %ymm4
210 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
211 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
212 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
213 ; AVX512VL-NEXT: retq
215 ; AVX512BW-LABEL: var_rotate_v64i8:
217 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
218 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
219 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
220 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
221 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
222 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
223 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
224 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
225 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
226 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
227 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
228 ; AVX512BW-NEXT: retq
230 ; AVX512VLBW-LABEL: var_rotate_v64i8:
231 ; AVX512VLBW: # %bb.0:
232 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
233 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
234 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
235 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
236 ; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
237 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
238 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
239 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
240 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
241 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
242 ; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
243 ; AVX512VLBW-NEXT: retq
245 ; AVX512VBMI2-LABEL: var_rotate_v64i8:
246 ; AVX512VBMI2: # %bb.0:
247 ; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
248 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
249 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
250 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
251 ; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
252 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
253 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
254 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
255 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
256 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
257 ; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
258 ; AVX512VBMI2-NEXT: retq
260 ; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
261 ; AVX512VLVBMI2: # %bb.0:
262 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
263 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
264 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
265 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
266 ; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
267 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
268 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
269 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
270 ; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
271 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
272 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
273 ; AVX512VLVBMI2-NEXT: retq
274 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
275 %shl = shl <64 x i8> %a, %b
276 %lshr = lshr <64 x i8> %a, %b8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
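;
; For v64i8 there are no per-byte variable shifts. Without AVX512BW the rotate
; above is built from immediate shift steps of 4, 2 and 1 bits selected per
; byte with vpblendvb, after vpsllw $5 moves the amount bits into the sign-bit
; position. With AVX512BW each byte is duplicated into a word, shifted with
; vpsllvw, and the rotated byte is taken from the high half with vpsrlw $8 and
; vpackuswb.
;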
; Uniform Variable Rotates
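;
; In the splatvar tests the rotate amount is splatted from element 0 of %b, so
; the i16 and i8 expansions can use the scalar-count shift forms (vpsllw and
; vpsrlw with an %xmm count) instead of fully per-element variable shifts; the
; i32 and i64 cases still broadcast the amount and use vprolvd/vprolvq.
;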
285 define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
286 ; AVX512-LABEL: splatvar_rotate_v8i64:
288 ; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
289 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
291 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
292 %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
293 %shl = shl <8 x i64> %a, %splat
294 %lshr = lshr <8 x i64> %a, %splat64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
299 define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
300 ; AVX512-LABEL: splatvar_rotate_v16i32:
302 ; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
303 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
305 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
306 %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
307 %shl = shl <16 x i32> %a, %splat
308 %lshr = lshr <16 x i32> %a, %splat32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
313 define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
314 ; AVX512F-LABEL: splatvar_rotate_v32i16:
316 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
317 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
318 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
319 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
320 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm5, %ymm5
321 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6
322 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
323 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
324 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
325 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2
326 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
327 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
328 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
331 ; AVX512VL-LABEL: splatvar_rotate_v32i16:
333 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
334 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
335 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
336 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
337 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5
338 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6
339 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
340 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
341 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
342 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2
343 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
344 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
345 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
346 ; AVX512VL-NEXT: retq
348 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
350 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
351 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
352 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4
353 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
354 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
355 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
356 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
357 ; AVX512BW-NEXT: retq
359 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
360 ; AVX512VLBW: # %bb.0:
361 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
362 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
363 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4
364 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
365 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
366 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
367 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
368 ; AVX512VLBW-NEXT: retq
370 ; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
371 ; AVX512VBMI2: # %bb.0:
372 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
373 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
374 ; AVX512VBMI2-NEXT: retq
376 ; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
377 ; AVX512VLVBMI2: # %bb.0:
378 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
379 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
380 ; AVX512VLVBMI2-NEXT: retq
381 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
382 %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
383 %shl = shl <32 x i16> %a, %splat
384 %lshr = lshr <32 x i16> %a, %splat16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
389 define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
390 ; AVX512F-LABEL: splatvar_rotate_v64i8:
392 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
393 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
394 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
395 ; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm3
396 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
397 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
398 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
399 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
400 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
401 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
402 ; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm3
403 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
404 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
405 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
406 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
407 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
408 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
411 ; AVX512VL-LABEL: splatvar_rotate_v64i8:
413 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
414 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
415 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
416 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm3
417 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
418 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
419 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
420 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
421 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
422 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
423 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm3
424 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
425 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
426 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
427 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
428 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
429 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
430 ; AVX512VL-NEXT: retq
432 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
434 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
435 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
436 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm2
437 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
438 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
439 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
440 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
441 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
442 ; AVX512BW-NEXT: retq
444 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
445 ; AVX512VLBW: # %bb.0:
446 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
447 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
448 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm2, %zmm2
449 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
450 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
451 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
452 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
453 ; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
454 ; AVX512VLBW-NEXT: retq
456 ; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
457 ; AVX512VBMI2: # %bb.0:
458 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
459 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
460 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2
461 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
462 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
463 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
464 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
465 ; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
466 ; AVX512VBMI2-NEXT: retq
468 ; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
469 ; AVX512VLVBMI2: # %bb.0:
470 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
471 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
472 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2
473 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
474 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
475 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
476 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
477 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
478 ; AVX512VLVBMI2-NEXT: retq
479 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
480 %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
481 %shl = shl <64 x i8> %a, %splat
482 %lshr = lshr <64 x i8> %a, %splat8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
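;
; Constant Rotates
;
; The following tests rotate by a per-element constant vector. i64 and i32
; still map to vprolvq/vprolvd with a constant-pool operand. For i16 without
; AVX512BW the shifts become vpmullw/vpmulhuw multiplies by powers of two; for
; i8 the bytes are widened to words and either multiplied the same way (no BW)
; or shifted with vpsllvw (BW), and AVX512VBMI2 uses vpshldvw with a constant
; operand for i16.
;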
491 define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
492 ; AVX512-LABEL: constant_rotate_v8i64:
494 ; AVX512-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
496 %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
497 %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
502 define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
503 ; AVX512-LABEL: constant_rotate_v16i32:
505 ; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
507 %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
508 %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
513 define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
514 ; AVX512F-LABEL: constant_rotate_v32i16:
516 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
517 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
518 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
519 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
520 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
521 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
522 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
523 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
524 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
527 ; AVX512VL-LABEL: constant_rotate_v32i16:
529 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
530 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
531 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
532 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
533 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
534 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
535 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
536 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
537 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
538 ; AVX512VL-NEXT: retq
540 ; AVX512BW-LABEL: constant_rotate_v32i16:
542 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
543 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
544 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
545 ; AVX512BW-NEXT: retq
547 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
548 ; AVX512VLBW: # %bb.0:
549 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
550 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
551 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
552 ; AVX512VLBW-NEXT: retq
554 ; AVX512VBMI2-LABEL: constant_rotate_v32i16:
555 ; AVX512VBMI2: # %bb.0:
556 ; AVX512VBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
557 ; AVX512VBMI2-NEXT: retq
559 ; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
560 ; AVX512VLVBMI2: # %bb.0:
561 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
562 ; AVX512VLVBMI2-NEXT: retq
563 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
564 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
569 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
570 ; AVX512F-LABEL: constant_rotate_v64i8:
572 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
573 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
574 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
575 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
576 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
577 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
578 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
579 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
580 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
581 ; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
582 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
583 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
584 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
585 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
586 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
587 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
588 ; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
589 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
590 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
591 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
594 ; AVX512VL-LABEL: constant_rotate_v64i8:
596 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
597 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
598 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
599 ; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
600 ; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
601 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
602 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
603 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
604 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
605 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
606 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
607 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
608 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
609 ; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
610 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
611 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
612 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
613 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
614 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
615 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
616 ; AVX512VL-NEXT: retq
618 ; AVX512BW-LABEL: constant_rotate_v64i8:
620 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
621 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
622 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
623 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
624 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
625 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
626 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
627 ; AVX512BW-NEXT: retq
629 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
630 ; AVX512VLBW: # %bb.0:
631 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
632 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
633 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
634 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
635 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
636 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
637 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
638 ; AVX512VLBW-NEXT: retq
640 ; AVX512VBMI2-LABEL: constant_rotate_v64i8:
641 ; AVX512VBMI2: # %bb.0:
642 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
643 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
644 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
645 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
646 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
647 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
648 ; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
649 ; AVX512VBMI2-NEXT: retq
651 ; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
652 ; AVX512VLVBMI2: # %bb.0:
653 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
654 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
655 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
656 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
657 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
658 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
659 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
660 ; AVX512VLVBMI2-NEXT: retq
661 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
662 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
; Uniform Constant Rotates
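;
; The splatconstant tests rotate every element by the same immediate, so i64
; and i32 use the immediate forms vprolq/vprold, AVX512VBMI2 handles i16 with
; an immediate vpshldw, and the remaining i16/i8 cases pair a shift-left and a
; shift-right by immediate, merging the halves with vpor or a folded
; vpternlog.
;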
671 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
672 ; AVX512-LABEL: splatconstant_rotate_v8i64:
674 ; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
676 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
677 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
682 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
683 ; AVX512-LABEL: splatconstant_rotate_v16i32:
685 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
687 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
688 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
693 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
694 ; AVX512F-LABEL: splatconstant_rotate_v32i16:
696 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
697 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
698 ; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm3
699 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
700 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
701 ; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
702 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
703 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
706 ; AVX512VL-LABEL: splatconstant_rotate_v32i16:
708 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
709 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
710 ; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm3
711 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
712 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
713 ; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
714 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
715 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
716 ; AVX512VL-NEXT: retq
718 ; AVX512BW-LABEL: splatconstant_rotate_v32i16:
720 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm1
721 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
722 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
723 ; AVX512BW-NEXT: retq
725 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
726 ; AVX512VLBW: # %bb.0:
727 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm1
728 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
729 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
730 ; AVX512VLBW-NEXT: retq
732 ; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
733 ; AVX512VBMI2: # %bb.0:
734 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
735 ; AVX512VBMI2-NEXT: retq
737 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
738 ; AVX512VLVBMI2: # %bb.0:
739 ; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
740 ; AVX512VLVBMI2-NEXT: retq
741 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
742 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
747 define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
748 ; AVX512F-LABEL: splatconstant_rotate_v64i8:
750 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
751 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
752 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
753 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
754 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
755 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
756 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
757 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
760 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
762 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
763 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
764 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
765 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
766 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
767 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
768 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
769 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
770 ; AVX512VL-NEXT: retq
772 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
774 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
775 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
776 ; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
777 ; AVX512BW-NEXT: retq
779 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
780 ; AVX512VLBW: # %bb.0:
781 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
782 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
783 ; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
784 ; AVX512VLBW-NEXT: retq
786 ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
787 ; AVX512VBMI2: # %bb.0:
788 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
789 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
790 ; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
791 ; AVX512VBMI2-NEXT: retq
793 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
794 ; AVX512VLVBMI2: # %bb.0:
795 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
796 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
797 ; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
798 ; AVX512VLVBMI2-NEXT: retq
799 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
800 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
; Masked Uniform Constant Rotates
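;
; These tests AND each side of the rotate with a constant mask before the
; final or, and the masks are expected to fold into the combine (a single
; vpand after the rotate, or the vpternlog blend). In the v8i64 case the mask
; on the shifted-left half only keeps bits that the shift-by-15 already
; cleared, so that half folds to zero and only vpsrlq $49 plus vpandq remain.
;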
809 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
810 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
812 ; AVX512-NEXT: vpsrlq $49, %zmm0, %zmm0
813 ; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
815 %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
816 %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
817 %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
818 %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
  %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}
823 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
824 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
826 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
827 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
829 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
830 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
831 %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
832 %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}
837 define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
838 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
840 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
841 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
842 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
843 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
844 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
845 ; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
846 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
847 ; AVX512F-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
850 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
852 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
853 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
854 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
855 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
856 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
857 ; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
858 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
859 ; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
860 ; AVX512VL-NEXT: retq
862 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
864 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
865 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
866 ; AVX512BW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
867 ; AVX512BW-NEXT: retq
869 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
870 ; AVX512VLBW: # %bb.0:
871 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
872 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
873 ; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
874 ; AVX512VLBW-NEXT: retq
876 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
877 ; AVX512VBMI2: # %bb.0:
878 ; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
879 ; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
880 ; AVX512VBMI2-NEXT: retq
882 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
883 ; AVX512VLVBMI2: # %bb.0:
884 ; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
885 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
886 ; AVX512VLVBMI2-NEXT: retq
887 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
888 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
889 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
890 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}
895 define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
896 ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
898 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
899 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
900 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
901 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
902 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
903 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
904 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
905 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
906 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
909 ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
911 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
912 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
913 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
914 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
915 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
916 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
917 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
918 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
919 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
920 ; AVX512VL-NEXT: retq
922 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
924 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
925 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
926 ; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
927 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
928 ; AVX512BW-NEXT: retq
930 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
931 ; AVX512VLBW: # %bb.0:
932 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
933 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
934 ; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
935 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
936 ; AVX512VLBW-NEXT: retq
938 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
939 ; AVX512VBMI2: # %bb.0:
940 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
941 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
942 ; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
943 ; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
944 ; AVX512VBMI2-NEXT: retq
946 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
947 ; AVX512VLVBMI2: # %bb.0:
948 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
949 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
950 ; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
951 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
952 ; AVX512VLVBMI2-NEXT: retq
953 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
954 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
955 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
956 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}