; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW

;
; Variable Rotates
;

define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <8 x i64> %a, %b
  %lshr = lshr <8 x i64> %a, %b64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <16 x i32> %a, %b
  %lshr = lshr <16 x i32> %a, %b32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

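; Without AVX512BW there are no 16-bit variable vector shifts, so the v32i16 rotate below
; is split into 256-bit halves that are zero-extended to i32 and rotated with
; vpsllvd/vpsrlvd; the AVX512BW/AVX512VLBW runs use vpsllvw/vpsrlvw on the full 512-bit
; vector instead, as the checks show.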
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
38 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
39 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
40 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
41 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
42 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
43 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
44 ; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
45 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
46 ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
47 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
48 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
49 ; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
50 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
51 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
52 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
53 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
54 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
55 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
56 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
57 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
58 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
59 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
65 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
66 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
67 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
68 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
69 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
70 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
71 ; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
72 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
73 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
74 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
75 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
76 ; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
77 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
78 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
79 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
80 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
81 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
82 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
83 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
84 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
85 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
86 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW: # %bb.0:
92 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
93 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
94 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
95 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
100 ; AVX512VLBW: # %bb.0:
101 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
102 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
103 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
104 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
105 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
106 ; AVX512VLBW-NEXT: retq
107 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
108 %shl = shl <32 x i16> %a, %b
109 %lshr = lshr <32 x i16> %a, %b16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

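; There are no per-element byte shifts, so the v64i8 rotate is lowered one amount bit at
; a time: shifted copies are merged with vpblendvb on AVX512F/AVX512VL, and with
; k-register blends (vpmovb2m plus vpblendmb/vmovdqu8) on AVX512BW/AVX512VLBW, as the
; checks show.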
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F: # %bb.0:
117 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
118 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
119 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
121 ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
122 ; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
123 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
124 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
125 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
126 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
127 ; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
128 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
129 ; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
130 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
131 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
132 ; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
133 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
134 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
135 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
136 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
137 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
138 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
139 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
140 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
141 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
142 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
143 ; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
144 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
145 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
146 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
147 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
148 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
149 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
150 ; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
151 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
152 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
153 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
154 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
155 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
156 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
157 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
158 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
159 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
160 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
161 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL: # %bb.0:
167 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
168 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
169 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
170 ; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
171 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
172 ; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
173 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
174 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
175 ; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
176 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
177 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
178 ; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5
179 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
180 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
181 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
182 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
183 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
184 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
185 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
186 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
187 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
188 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
189 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
190 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4
191 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
192 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
193 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
194 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
195 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4
196 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
197 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
198 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
199 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
200 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
201 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
202 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
203 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
204 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
205 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW: # %bb.0:
209 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
210 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
211 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
212 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
213 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
214 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
215 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
216 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
217 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
218 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
219 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
220 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
221 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
222 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
223 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
224 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
225 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
226 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
227 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
228 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
229 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
230 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
231 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
232 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
233 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
234 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
235 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
236 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
237 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
238 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
239 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
240 ; AVX512BW-NEXT: retq
242 ; AVX512VLBW-LABEL: var_rotate_v64i8:
243 ; AVX512VLBW: # %bb.0:
244 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
245 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
246 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
247 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
248 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
249 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
250 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
251 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
252 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
253 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
254 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
255 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
256 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
257 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
258 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
259 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
260 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
261 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
262 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
263 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
264 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
265 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
266 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
267 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
268 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
269 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
270 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
271 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
272 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
273 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
274 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
275 ; AVX512VLBW-NEXT: retq
276 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
277 %shl = shl <64 x i8> %a, %b
278 %lshr = lshr <64 x i8> %a, %b8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Uniform Variable Rotates
;

define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <8 x i64> %a, %splat
  %lshr = lshr <8 x i64> %a, %splat64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <16 x i32> %a, %splat
  %lshr = lshr <16 x i32> %a, %splat32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

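; A splatted rotate amount lets the i16/i8 cases use the shift-by-scalar forms
; (vpsllw/vpsrlw with an XMM count) instead of per-element variable shifts, as the checks
; below show.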
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
318 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
319 ; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
320 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
321 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
322 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
323 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
324 ; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
325 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
326 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
327 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
328 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
329 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
330 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
336 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
337 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
338 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
339 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
340 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
341 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
342 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
343 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
344 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
345 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
346 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
347 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
348 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
349 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
350 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
; AVX512BW: # %bb.0:
354 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
355 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
356 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
357 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
358 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
359 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
360 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
361 ; AVX512BW-NEXT: retq
363 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
364 ; AVX512VLBW: # %bb.0:
365 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
366 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
367 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
368 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
369 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
370 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
371 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
372 ; AVX512VLBW-NEXT: retq
373 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
374 %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
375 %shl = shl <32 x i16> %a, %splat
376 %lshr = lshr <32 x i16> %a, %splat16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
384 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
385 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
386 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
387 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
388 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
389 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
390 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
391 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
392 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
393 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
394 ; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
395 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
396 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
397 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
398 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
399 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
400 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
401 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
402 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
403 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
404 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
405 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
406 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL: # %bb.0:
412 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
413 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
414 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
415 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
416 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
417 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
418 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
419 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
420 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
421 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
422 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
423 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
424 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
425 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
426 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
427 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
428 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
429 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
430 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
431 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
432 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
433 ; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
434 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
435 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
436 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
; AVX512BW: # %bb.0:
440 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
441 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
442 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
443 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
444 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
445 ; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
446 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
447 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
448 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
449 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
450 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
451 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
452 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
453 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
454 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
455 ; AVX512BW-NEXT: retq
457 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
458 ; AVX512VLBW: # %bb.0:
459 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
460 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
461 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
462 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
463 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
464 ; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
465 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
466 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
467 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
468 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
469 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
470 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
471 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
472 ; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
473 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
474 ; AVX512VLBW-NEXT: retq
475 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
476 %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
477 %shl = shl <64 x i8> %a, %splat
478 %lshr = lshr <64 x i8> %a, %splat8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Constant Rotates
;

define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: constant_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: constant_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

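; With constant per-element amounts the i16 rotate can be done by multiplication: vpmullw
; produces the left shift and vpmulhuw the matching right shift on AVX512F/AVX512VL,
; while AVX512BW loads constant shift vectors for vpsllvw/vpsrlvw, as the checks show.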
define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
; AVX512F: # %bb.0:
512 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
513 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
514 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
515 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
516 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
517 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
518 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
519 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL: # %bb.0:
525 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
526 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
527 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
528 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
529 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
530 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
531 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
532 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
533 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
534 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
; AVX512BW: # %bb.0:
538 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
539 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
540 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
541 ; AVX512BW-NEXT: retq
543 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
544 ; AVX512VLBW: # %bb.0:
545 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
546 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
547 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
548 ; AVX512VLBW-NEXT: retq
549 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
550 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

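; For constant byte rotates the right-shift half is formed by widening each byte to i16
; (vpunpcklbw/vpunpckhbw), multiplying by a power-of-two constant and repacking with
; vpackuswb; the left-shift half still uses the blend-based shift sequence.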
define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
; AVX512F: # %bb.0:
558 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
559 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
560 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
561 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
562 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
563 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
564 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
565 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
566 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
567 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
568 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
569 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
570 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
571 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
572 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
573 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
574 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
575 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
576 ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
577 ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
578 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
579 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
580 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
581 ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
582 ; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
583 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
584 ; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
585 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
586 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
587 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
588 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
589 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
590 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
591 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
592 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
593 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
594 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
595 ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
596 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
597 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
598 ; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
599 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
600 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
601 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL: # %bb.0:
607 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
608 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
609 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
610 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
611 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
612 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
613 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
614 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
615 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
616 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
617 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
618 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
619 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
620 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
621 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
622 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
623 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
624 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
625 ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
626 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
627 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
628 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
629 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
630 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
631 ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
632 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
633 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
634 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
635 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
636 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
637 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
638 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
639 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
640 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
641 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
642 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
643 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
644 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
645 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
646 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
647 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
648 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
649 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
650 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
651 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
652 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
653 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
654 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
655 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
; AVX512BW: # %bb.0:
659 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
660 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
661 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
662 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
663 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
664 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
665 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
666 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
667 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
668 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
669 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
670 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
671 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
672 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
673 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
674 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
675 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
676 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
677 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
678 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
679 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
680 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
681 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
682 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
683 ; AVX512BW-NEXT: retq
685 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
686 ; AVX512VLBW: # %bb.0:
687 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
688 ; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
689 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
690 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
691 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
692 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
693 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
694 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
695 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
696 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
697 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
698 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
699 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
700 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
701 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
702 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
703 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
704 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
705 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
706 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
707 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
708 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
709 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
710 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
711 ; AVX512VLBW-NEXT: retq
712 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
713 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Uniform Constant Rotates
;

define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F: # %bb.0:
747 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
748 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
749 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
750 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
751 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
752 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
753 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL: # %bb.0:
759 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
760 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
761 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
762 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
763 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
764 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
765 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
766 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
767 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
; AVX512BW: # %bb.0:
771 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
772 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
773 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
774 ; AVX512BW-NEXT: retq
776 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
777 ; AVX512VLBW: # %bb.0:
778 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
779 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
780 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
781 ; AVX512VLBW-NEXT: retq
782 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
783 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
791 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
792 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
793 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
794 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
795 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
796 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
797 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
798 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
799 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
800 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
801 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
802 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
808 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
809 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
810 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
811 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
812 ; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
813 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
814 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
815 ; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
816 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
817 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
; AVX512BW: # %bb.0:
821 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
822 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
823 ; AVX512BW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
824 ; AVX512BW-NEXT: retq
826 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
827 ; AVX512VLBW: # %bb.0:
828 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
829 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
830 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
831 ; AVX512VLBW-NEXT: retq
832 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
833 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
  %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
  %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
  %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}

define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}

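; The and-masks are applied to the already-rotated value, so the i64/i32 cases above
; still fold to a single vprolq/vprold followed by one vpand; the i16/i8 cases below
; apply the masks to the shift-based expansion.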
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
873 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
874 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
875 ; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
876 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
877 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
878 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
879 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
880 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
881 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
882 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
888 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
889 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
890 ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
891 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
892 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
893 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
894 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
895 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
896 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
897 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
898 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
899 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512BW: # %bb.0:
903 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
904 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
905 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
906 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
907 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
908 ; AVX512BW-NEXT: retq
910 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
911 ; AVX512VLBW: # %bb.0:
912 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
913 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
914 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
915 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
916 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
917 ; AVX512VLBW-NEXT: retq
918 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
919 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
920 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
921 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}

define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
929 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
930 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
931 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
932 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
933 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
934 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
935 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
936 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
937 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
938 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
939 ; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
940 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
941 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
942 ; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
943 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
949 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
950 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
951 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
952 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
953 ; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
954 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
955 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
956 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
957 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
958 ; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0
959 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
960 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
961 ; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512BW: # %bb.0:
965 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
966 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
967 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
968 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
969 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
970 ; AVX512BW-NEXT: retq
972 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
973 ; AVX512VLBW: # %bb.0:
974 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
975 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
976 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
977 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
978 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
979 ; AVX512VLBW-NEXT: retq
980 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
981 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
982 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
983 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}