1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
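; The tests below build rotates out of the shl/lshr/or idiom and check that they
; are matched to the AVX-512 rotate instructions (vprolv*/vprol*) where the
; target has them, and to shift-based expansions otherwise.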
11 define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
12 ; AVX512-LABEL: var_rotate_v8i64:
14 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
16 %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
17 %shl = shl <8 x i64> %a, %b
18 %lshr = lshr <8 x i64> %a, %b64
19 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
23 define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
24 ; AVX512-LABEL: var_rotate_v16i32:
26 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
28 %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
29 %shl = shl <16 x i32> %a, %b
30 %lshr = lshr <16 x i32> %a, %b32
31 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
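; v32i16 has no variable 512-bit word shifts without AVX512BW, so the checks
; below expect the F/VL lowering to split into 256-bit halves, zero-extend to
; i32, shift with vpsllvd/vpsrlvd and truncate back with vpmovdw, while
; BW/VLBW stay in one zmm register with vpsllvw/vpsrlvw.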
35 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
36 ; AVX512F-LABEL: var_rotate_v32i16:
38 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
39 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
40 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
41 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
42 ; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
43 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
44 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
45 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
46 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
47 ; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0
48 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
49 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2
50 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
51 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
52 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
53 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
54 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
55 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
56 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
57 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
60 ; AVX512VL-LABEL: var_rotate_v32i16:
62 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
63 ; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
64 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
65 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
66 ; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
67 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
68 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
69 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
70 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
71 ; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0
72 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
73 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2
74 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
75 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
76 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
77 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
78 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
79 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
80 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
81 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
84 ; AVX512BW-LABEL: var_rotate_v32i16:
86 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
87 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
88 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
89 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
90 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
93 ; AVX512VLBW-LABEL: var_rotate_v32i16:
94 ; AVX512VLBW: # %bb.0:
95 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
96 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
97 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
98 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
99 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
100 ; AVX512VLBW-NEXT: retq
101 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
102 %shl = shl <32 x i16> %a, %b
103 %lshr = lshr <32 x i16> %a, %b16
104 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
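; There is no byte rotate (or variable byte shift) in AVX-512, so v64i8 is
; expected to expand to the bit-by-bit shift-and-blend sequence: vpblendvb
; selects on AVX512F/VL, and k-register blends (vpmovb2m + vpblendmb/vmovdqu8)
; on AVX512BW/VLBW.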
108 define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
109 ; AVX512F-LABEL: var_rotate_v64i8:
111 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
112 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
113 ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
114 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
115 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
116 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
117 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
118 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
119 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
121 ; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
122 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm7
123 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
124 ; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
125 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
126 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
127 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
128 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
129 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
130 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm8
131 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
132 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
133 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
134 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
135 ; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
136 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
137 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
138 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
139 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
140 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
141 ; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
142 ; AVX512F-NEXT: vpandn %ymm2, %ymm6, %ymm2
143 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
144 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
145 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
146 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
147 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
148 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
149 ; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
150 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
151 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
152 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
153 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
156 ; AVX512VL-LABEL: var_rotate_v64i8:
158 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
159 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
160 ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
161 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
162 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
163 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
164 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
165 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
166 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
167 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
168 ; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4
169 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm7
170 ; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7
171 ; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4
172 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
173 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
174 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
175 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
176 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
177 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm8
178 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
179 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
180 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
181 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
182 ; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
183 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
184 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
185 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
186 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
187 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
188 ; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
189 ; AVX512VL-NEXT: vpandn %ymm2, %ymm6, %ymm2
190 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
191 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
192 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
193 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
194 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
195 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
196 ; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
197 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
198 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
199 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
200 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
201 ; AVX512VL-NEXT: retq
203 ; AVX512BW-LABEL: var_rotate_v64i8:
205 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
206 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
207 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
208 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
209 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
210 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
211 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
212 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
213 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
214 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
215 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
216 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
217 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
218 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
219 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
220 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
221 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
222 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
223 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
224 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
225 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
226 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
227 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
228 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
229 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
230 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
231 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
232 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
233 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
234 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
235 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
236 ; AVX512BW-NEXT: retq
238 ; AVX512VLBW-LABEL: var_rotate_v64i8:
239 ; AVX512VLBW: # %bb.0:
240 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
241 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
242 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
243 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
244 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
245 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
246 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
247 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
248 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
249 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
250 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
251 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
252 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
253 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
254 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
255 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
256 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
257 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
258 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
259 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
260 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
261 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
262 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
263 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
264 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
265 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
266 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
267 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
268 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
269 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
270 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
271 ; AVX512VLBW-NEXT: retq
272 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
273 %shl = shl <64 x i8> %a, %b
274 %lshr = lshr <64 x i8> %a, %b8
275 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
280 ; Uniform Variable Rotates
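; These splat one element of %b as the rotate amount. i64/i32 still match
; vprolvq/vprolvd after the broadcast; i16/i8 use a single scalar shift count
; with vpsllw/vpsrlw (plus byte masking for v64i8).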
283 define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
284 ; AVX512-LABEL: splatvar_rotate_v8i64:
286 ; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
287 ; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
289 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
290 %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
291 %shl = shl <8 x i64> %a, %splat
292 %lshr = lshr <8 x i64> %a, %splat64
293 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
297 define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
298 ; AVX512-LABEL: splatvar_rotate_v16i32:
300 ; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
301 ; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
303 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
304 %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
305 %shl = shl <16 x i32> %a, %splat
306 %lshr = lshr <16 x i32> %a, %splat32
307 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
311 define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
312 ; AVX512F-LABEL: splatvar_rotate_v32i16:
314 ; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
315 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
316 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
317 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
318 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
319 ; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
320 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
321 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
322 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
323 ; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
324 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
325 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
328 ; AVX512VL-LABEL: splatvar_rotate_v32i16:
330 ; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
331 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
332 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
333 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
334 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
335 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
336 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
337 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
338 ; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
339 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
340 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
341 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
342 ; AVX512VL-NEXT: retq
344 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
346 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
347 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
348 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
349 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
350 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
351 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
352 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
353 ; AVX512BW-NEXT: retq
355 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
356 ; AVX512VLBW: # %bb.0:
357 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
358 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
359 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
360 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
361 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
362 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
363 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
364 ; AVX512VLBW-NEXT: retq
365 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
366 %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
367 %shl = shl <32 x i16> %a, %splat
368 %lshr = lshr <32 x i16> %a, %splat16
369 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
373 define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
374 ; AVX512F-LABEL: splatvar_rotate_v64i8:
376 ; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
377 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
378 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
379 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
380 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
381 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
382 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
383 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
384 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
385 ; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
386 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
387 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
388 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
389 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
390 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
391 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
392 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
393 ; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
394 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
395 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
396 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
397 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
400 ; AVX512VL-LABEL: splatvar_rotate_v64i8:
402 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
403 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
404 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
405 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
406 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
407 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
408 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
409 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
410 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
411 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
412 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
413 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
414 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
415 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
416 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
417 ; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
418 ; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
419 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
420 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
421 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
422 ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
423 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
424 ; AVX512VL-NEXT: retq
426 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
428 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
429 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
430 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
431 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
432 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
433 ; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
434 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
435 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
436 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
437 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
438 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
439 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
440 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
441 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
442 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
443 ; AVX512BW-NEXT: retq
445 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
446 ; AVX512VLBW: # %bb.0:
447 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
448 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
449 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
450 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
451 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
452 ; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
453 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
454 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
455 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
456 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
457 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
458 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
459 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
460 ; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
461 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
462 ; AVX512VLBW-NEXT: retq
463 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
464 %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
465 %shl = shl <64 x i8> %a, %splat
466 %lshr = lshr <64 x i8> %a, %splat8
467 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
475 define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
476 ; AVX512-LABEL: constant_rotate_v8i64:
478 ; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
480 %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
481 %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
482 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
486 define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
487 ; AVX512-LABEL: constant_rotate_v16i32:
489 ; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
491 %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
492 %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
493 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
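; Constant per-element word rotates: without BW each 256-bit half is expected
; to use the multiply form (vpmullw for the left shift, vpmulhuw for the
; logical right shift); BW/VLBW use vpsllvw/vpsrlvw on the full vector.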
497 define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
498 ; AVX512F-LABEL: constant_rotate_v32i16:
500 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
501 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
502 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
503 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
504 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
505 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
506 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
509 ; AVX512VL-LABEL: constant_rotate_v32i16:
511 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
512 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
513 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
514 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
515 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
516 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
517 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
518 ; AVX512VL-NEXT: retq
520 ; AVX512BW-LABEL: constant_rotate_v32i16:
522 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
523 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
524 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
525 ; AVX512BW-NEXT: retq
527 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
528 ; AVX512VLBW: # %bb.0:
529 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
530 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
531 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
532 ; AVX512VLBW-NEXT: retq
533 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
534 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
535 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
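; Constant per-element byte rotates keep the blend sequence for the left
; shift; the right-shift half unpacks to words, scales by powers of two
; (vpmullw, or vpsllvw on BW) and repacks with vpackuswb.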
539 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
540 ; AVX512F-LABEL: constant_rotate_v64i8:
542 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
543 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
544 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
545 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
546 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
547 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
548 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
549 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
550 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
551 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
552 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
553 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
554 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
555 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
556 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
557 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
558 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
559 ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
560 ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
561 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
562 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
563 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
564 ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
565 ; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
566 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
567 ; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
568 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
569 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
570 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
571 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
572 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
573 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
574 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
575 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
576 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
577 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
578 ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
579 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
580 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
581 ; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
582 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
583 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
584 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
587 ; AVX512VL-LABEL: constant_rotate_v64i8:
589 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
590 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
591 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
592 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
593 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
594 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
595 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
596 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
597 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
598 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
599 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
600 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
601 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
602 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
603 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
604 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
605 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
606 ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
607 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
608 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
609 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
610 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
611 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
612 ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
613 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
614 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
615 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
616 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
617 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
618 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
619 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
620 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
621 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
622 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
623 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
624 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
625 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
626 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
627 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
628 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
629 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
630 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
631 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
632 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
633 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
634 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
635 ; AVX512VL-NEXT: retq
637 ; AVX512BW-LABEL: constant_rotate_v64i8:
639 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
640 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
641 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
642 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
643 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
644 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
645 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
646 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
647 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
648 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
649 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
650 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
651 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
652 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
653 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
654 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
655 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
656 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
657 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
658 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
659 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
660 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
661 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
662 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
663 ; AVX512BW-NEXT: retq
665 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
666 ; AVX512VLBW: # %bb.0:
667 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
668 ; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
669 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
670 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
671 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
672 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
673 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
674 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
675 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
676 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
677 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
678 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
679 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
680 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
681 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
682 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
683 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
684 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
685 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
686 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
687 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
688 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
689 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
690 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
691 ; AVX512VLBW-NEXT: retq
692 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
693 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
694 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
699 ; Uniform Constant Rotates
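; A single immediate rotate amount: i64/i32 match vprolq/vprold; i16/i8 are
; expected to lower to a vpsllw/vpsrlw immediate pair (with byte masks for
; v64i8).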
702 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
703 ; AVX512-LABEL: splatconstant_rotate_v8i64:
705 ; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
707 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
708 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
709 %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}
713 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
714 ; AVX512-LABEL: splatconstant_rotate_v16i32:
716 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
718 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
719 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
720 %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}
724 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
725 ; AVX512F-LABEL: splatconstant_rotate_v32i16:
727 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
728 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
729 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
730 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
731 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
732 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
735 ; AVX512VL-LABEL: splatconstant_rotate_v32i16:
737 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
738 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
739 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
740 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
741 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
742 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
743 ; AVX512VL-NEXT: retq
745 ; AVX512BW-LABEL: splatconstant_rotate_v32i16:
747 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
748 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
749 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
750 ; AVX512BW-NEXT: retq
752 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
753 ; AVX512VLBW: # %bb.0:
754 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
755 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
756 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
757 ; AVX512VLBW-NEXT: retq
758 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
759 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
760 %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}
764 define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
765 ; AVX512F-LABEL: splatconstant_rotate_v64i8:
767 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
768 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
769 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
770 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
771 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
772 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
773 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
774 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
775 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
776 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
777 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
780 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
782 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
783 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
784 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
785 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
786 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
787 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
788 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
789 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
790 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
791 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
792 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
793 ; AVX512VL-NEXT: retq
795 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
797 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
798 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
799 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
800 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
801 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
802 ; AVX512BW-NEXT: retq
804 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
805 ; AVX512VLBW: # %bb.0:
806 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
807 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
808 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
809 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
810 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
811 ; AVX512VLBW-NEXT: retq
812 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
813 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
814 %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}
819 ; Masked Uniform Constant Rotates
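; As above, but the rotate result is ANDed with a constant mask; the checks
; expect the mask to remain a separate vpand/vpandd/vpandq rather than being
; folded into the rotate.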
822 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
823 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
825 ; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
826 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
828 %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
829 %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
830 %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
831 %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
832 %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}
836 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
837 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
839 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
840 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
842 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
843 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
844 %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
845 %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
846 %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}
850 define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
851 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
853 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
854 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
855 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
856 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
857 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
858 ; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
859 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
860 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
861 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
864 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
866 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
867 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
868 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
869 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
870 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
871 ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
872 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
873 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
874 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
875 ; AVX512VL-NEXT: retq
877 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
879 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
880 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
881 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
882 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
883 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
884 ; AVX512BW-NEXT: retq
886 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
887 ; AVX512VLBW: # %bb.0:
888 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
889 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
890 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
891 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
892 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
893 ; AVX512VLBW-NEXT: retq
894 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
895 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
896 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
897 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
898 %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}
902 define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
903 ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
905 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
906 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
907 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
908 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
909 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
910 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
911 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
912 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
913 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
914 ; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
915 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
916 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
917 ; AVX512F-NEXT: vpor %ymm4, %ymm1, %ymm1
918 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
921 ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
923 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
924 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
925 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
926 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
927 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
928 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
929 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
930 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
931 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
932 ; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4
933 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
934 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
935 ; AVX512VL-NEXT: vpor %ymm4, %ymm1, %ymm1
936 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
937 ; AVX512VL-NEXT: retq
939 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
941 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
942 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
943 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
944 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
945 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
946 ; AVX512BW-NEXT: retq
948 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
949 ; AVX512VLBW: # %bb.0:
950 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
951 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
952 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
953 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
954 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
955 ; AVX512VLBW-NEXT: retq
956 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
957 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
958 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
959 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
960 %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}