1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
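; Test lowering of fully variable shuffles, where every index is taken from a
; second vector at run time. For 32- and 64-bit elements a single cross-lane
; permute (vpermps/vpermpd) exists at every AVX-512 feature level, so all three
; RUN lines share the common AVX512 check prefix below.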
6 define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
7 ; AVX512-LABEL: var_shuffle_v8i64:
8 ; AVX512: # %bb.0:
9 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
10 ; AVX512-NEXT: retq
11 %index0 = extractelement <8 x i64> %indices, i32 0
12 %index1 = extractelement <8 x i64> %indices, i32 1
13 %index2 = extractelement <8 x i64> %indices, i32 2
14 %index3 = extractelement <8 x i64> %indices, i32 3
15 %index4 = extractelement <8 x i64> %indices, i32 4
16 %index5 = extractelement <8 x i64> %indices, i32 5
17 %index6 = extractelement <8 x i64> %indices, i32 6
18 %index7 = extractelement <8 x i64> %indices, i32 7
19 %v0 = extractelement <8 x i64> %v, i64 %index0
20 %v1 = extractelement <8 x i64> %v, i64 %index1
21 %v2 = extractelement <8 x i64> %v, i64 %index2
22 %v3 = extractelement <8 x i64> %v, i64 %index3
23 %v4 = extractelement <8 x i64> %v, i64 %index4
24 %v5 = extractelement <8 x i64> %v, i64 %index5
25 %v6 = extractelement <8 x i64> %v, i64 %index6
26 %v7 = extractelement <8 x i64> %v, i64 %index7
27 %ret0 = insertelement <8 x i64> undef, i64 %v0, i32 0
28 %ret1 = insertelement <8 x i64> %ret0, i64 %v1, i32 1
29 %ret2 = insertelement <8 x i64> %ret1, i64 %v2, i32 2
30 %ret3 = insertelement <8 x i64> %ret2, i64 %v3, i32 3
31 %ret4 = insertelement <8 x i64> %ret3, i64 %v4, i32 4
32 %ret5 = insertelement <8 x i64> %ret4, i64 %v5, i32 5
33 %ret6 = insertelement <8 x i64> %ret5, i64 %v6, i32 6
34 %ret7 = insertelement <8 x i64> %ret6, i64 %v7, i32 7
35 ret <8 x i64> %ret7
36 }
38 define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
39 ; AVX512-LABEL: var_shuffle_v16i32:
40 ; AVX512: # %bb.0:
41 ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
42 ; AVX512-NEXT: retq
43 %index0 = extractelement <16 x i32> %indices, i32 0
44 %index1 = extractelement <16 x i32> %indices, i32 1
45 %index2 = extractelement <16 x i32> %indices, i32 2
46 %index3 = extractelement <16 x i32> %indices, i32 3
47 %index4 = extractelement <16 x i32> %indices, i32 4
48 %index5 = extractelement <16 x i32> %indices, i32 5
49 %index6 = extractelement <16 x i32> %indices, i32 6
50 %index7 = extractelement <16 x i32> %indices, i32 7
51 %index8 = extractelement <16 x i32> %indices, i32 8
52 %index9 = extractelement <16 x i32> %indices, i32 9
53 %index10 = extractelement <16 x i32> %indices, i32 10
54 %index11 = extractelement <16 x i32> %indices, i32 11
55 %index12 = extractelement <16 x i32> %indices, i32 12
56 %index13 = extractelement <16 x i32> %indices, i32 13
57 %index14 = extractelement <16 x i32> %indices, i32 14
58 %index15 = extractelement <16 x i32> %indices, i32 15
59 %v0 = extractelement <16 x i32> %v, i32 %index0
60 %v1 = extractelement <16 x i32> %v, i32 %index1
61 %v2 = extractelement <16 x i32> %v, i32 %index2
62 %v3 = extractelement <16 x i32> %v, i32 %index3
63 %v4 = extractelement <16 x i32> %v, i32 %index4
64 %v5 = extractelement <16 x i32> %v, i32 %index5
65 %v6 = extractelement <16 x i32> %v, i32 %index6
66 %v7 = extractelement <16 x i32> %v, i32 %index7
67 %v8 = extractelement <16 x i32> %v, i32 %index8
68 %v9 = extractelement <16 x i32> %v, i32 %index9
69 %v10 = extractelement <16 x i32> %v, i32 %index10
70 %v11 = extractelement <16 x i32> %v, i32 %index11
71 %v12 = extractelement <16 x i32> %v, i32 %index12
72 %v13 = extractelement <16 x i32> %v, i32 %index13
73 %v14 = extractelement <16 x i32> %v, i32 %index14
74 %v15 = extractelement <16 x i32> %v, i32 %index15
75 %ret0 = insertelement <16 x i32> undef, i32 %v0, i32 0
76 %ret1 = insertelement <16 x i32> %ret0, i32 %v1, i32 1
77 %ret2 = insertelement <16 x i32> %ret1, i32 %v2, i32 2
78 %ret3 = insertelement <16 x i32> %ret2, i32 %v3, i32 3
79 %ret4 = insertelement <16 x i32> %ret3, i32 %v4, i32 4
80 %ret5 = insertelement <16 x i32> %ret4, i32 %v5, i32 5
81 %ret6 = insertelement <16 x i32> %ret5, i32 %v6, i32 6
82 %ret7 = insertelement <16 x i32> %ret6, i32 %v7, i32 7
83 %ret8 = insertelement <16 x i32> %ret7, i32 %v8, i32 8
84 %ret9 = insertelement <16 x i32> %ret8, i32 %v9, i32 9
85 %ret10 = insertelement <16 x i32> %ret9, i32 %v10, i32 10
86 %ret11 = insertelement <16 x i32> %ret10, i32 %v11, i32 11
87 %ret12 = insertelement <16 x i32> %ret11, i32 %v12, i32 12
88 %ret13 = insertelement <16 x i32> %ret12, i32 %v13, i32 13
89 %ret14 = insertelement <16 x i32> %ret13, i32 %v14, i32 14
90 %ret15 = insertelement <16 x i32> %ret14, i32 %v15, i32 15
91 ret <16 x i32> %ret15
92 }
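; 16-bit elements: vpermw requires AVX512BW, so the plain AVX512F run has no
; single-instruction lowering. It spills %v to an aligned stack slot and rebuilds
; the result word by word, masking each index with 31 and loading from
; (%rsp,%rax,2), while AVX512BW and AVX512VBMI emit one vpermw.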
94 define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwind {
95 ; AVX512F-LABEL: var_shuffle_v32i16:
96 ; AVX512F: # %bb.0:
97 ; AVX512F-NEXT: pushq %rbp
98 ; AVX512F-NEXT: movq %rsp, %rbp
99 ; AVX512F-NEXT: andq $-64, %rsp
100 ; AVX512F-NEXT: subq $128, %rsp
101 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
102 ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
103 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
104 ; AVX512F-NEXT: vpextrw $0, %xmm4, %eax
105 ; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
106 ; AVX512F-NEXT: andl $31, %eax
107 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
108 ; AVX512F-NEXT: vmovd %eax, %xmm0
109 ; AVX512F-NEXT: vpextrw $1, %xmm4, %eax
110 ; AVX512F-NEXT: andl $31, %eax
111 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
112 ; AVX512F-NEXT: vpextrw $2, %xmm4, %eax
113 ; AVX512F-NEXT: andl $31, %eax
114 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
115 ; AVX512F-NEXT: vpextrw $3, %xmm4, %eax
116 ; AVX512F-NEXT: andl $31, %eax
117 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
118 ; AVX512F-NEXT: vpextrw $4, %xmm4, %eax
119 ; AVX512F-NEXT: andl $31, %eax
120 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
121 ; AVX512F-NEXT: vpextrw $5, %xmm4, %eax
122 ; AVX512F-NEXT: andl $31, %eax
123 ; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
124 ; AVX512F-NEXT: vpextrw $6, %xmm4, %eax
125 ; AVX512F-NEXT: andl $31, %eax
126 ; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
127 ; AVX512F-NEXT: vpextrw $7, %xmm4, %eax
128 ; AVX512F-NEXT: andl $31, %eax
129 ; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
130 ; AVX512F-NEXT: vpextrw $0, %xmm3, %eax
131 ; AVX512F-NEXT: andl $31, %eax
132 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
133 ; AVX512F-NEXT: vmovd %eax, %xmm4
134 ; AVX512F-NEXT: vpextrw $1, %xmm3, %eax
135 ; AVX512F-NEXT: andl $31, %eax
136 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
137 ; AVX512F-NEXT: vpextrw $2, %xmm3, %eax
138 ; AVX512F-NEXT: andl $31, %eax
139 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
140 ; AVX512F-NEXT: vpextrw $3, %xmm3, %eax
141 ; AVX512F-NEXT: andl $31, %eax
142 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
143 ; AVX512F-NEXT: vpextrw $4, %xmm3, %eax
144 ; AVX512F-NEXT: andl $31, %eax
145 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
146 ; AVX512F-NEXT: vpextrw $5, %xmm3, %eax
147 ; AVX512F-NEXT: andl $31, %eax
148 ; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
149 ; AVX512F-NEXT: vpextrw $6, %xmm3, %eax
150 ; AVX512F-NEXT: andl $31, %eax
151 ; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
152 ; AVX512F-NEXT: vpextrw $7, %xmm3, %eax
153 ; AVX512F-NEXT: andl $31, %eax
154 ; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm3
155 ; AVX512F-NEXT: vpextrw $0, %xmm2, %eax
156 ; AVX512F-NEXT: andl $31, %eax
157 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
158 ; AVX512F-NEXT: vmovd %eax, %xmm4
159 ; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
160 ; AVX512F-NEXT: andl $31, %eax
161 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
162 ; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
163 ; AVX512F-NEXT: andl $31, %eax
164 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
165 ; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
166 ; AVX512F-NEXT: andl $31, %eax
167 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
168 ; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
169 ; AVX512F-NEXT: andl $31, %eax
170 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
171 ; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
172 ; AVX512F-NEXT: andl $31, %eax
173 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
174 ; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
175 ; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
176 ; AVX512F-NEXT: andl $31, %eax
177 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
178 ; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
179 ; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
180 ; AVX512F-NEXT: andl $31, %eax
181 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
182 ; AVX512F-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2
183 ; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
184 ; AVX512F-NEXT: andl $31, %eax
185 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
186 ; AVX512F-NEXT: vmovd %eax, %xmm4
187 ; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
188 ; AVX512F-NEXT: andl $31, %eax
189 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
190 ; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
191 ; AVX512F-NEXT: andl $31, %eax
192 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
193 ; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
194 ; AVX512F-NEXT: andl $31, %eax
195 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
196 ; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
197 ; AVX512F-NEXT: andl $31, %eax
198 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
199 ; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
200 ; AVX512F-NEXT: andl $31, %eax
201 ; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
202 ; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
203 ; AVX512F-NEXT: andl $31, %eax
204 ; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
205 ; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
206 ; AVX512F-NEXT: andl $31, %eax
207 ; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1
208 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
209 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
210 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
211 ; AVX512F-NEXT: movq %rbp, %rsp
212 ; AVX512F-NEXT: popq %rbp
213 ; AVX512F-NEXT: retq
214 ;
215 ; AVX512BW-LABEL: var_shuffle_v32i16:
216 ; AVX512BW: # %bb.0:
217 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
218 ; AVX512BW-NEXT: retq
220 ; AVX512VBMI-LABEL: var_shuffle_v32i16:
221 ; AVX512VBMI: # %bb.0:
222 ; AVX512VBMI-NEXT: vpermw %zmm0, %zmm1, %zmm0
223 ; AVX512VBMI-NEXT: retq
224 %index0 = extractelement <32 x i16> %indices, i32 0
225 %index1 = extractelement <32 x i16> %indices, i32 1
226 %index2 = extractelement <32 x i16> %indices, i32 2
227 %index3 = extractelement <32 x i16> %indices, i32 3
228 %index4 = extractelement <32 x i16> %indices, i32 4
229 %index5 = extractelement <32 x i16> %indices, i32 5
230 %index6 = extractelement <32 x i16> %indices, i32 6
231 %index7 = extractelement <32 x i16> %indices, i32 7
232 %index8 = extractelement <32 x i16> %indices, i32 8
233 %index9 = extractelement <32 x i16> %indices, i32 9
234 %index10 = extractelement <32 x i16> %indices, i32 10
235 %index11 = extractelement <32 x i16> %indices, i32 11
236 %index12 = extractelement <32 x i16> %indices, i32 12
237 %index13 = extractelement <32 x i16> %indices, i32 13
238 %index14 = extractelement <32 x i16> %indices, i32 14
239 %index15 = extractelement <32 x i16> %indices, i32 15
240 %index16 = extractelement <32 x i16> %indices, i32 16
241 %index17 = extractelement <32 x i16> %indices, i32 17
242 %index18 = extractelement <32 x i16> %indices, i32 18
243 %index19 = extractelement <32 x i16> %indices, i32 19
244 %index20 = extractelement <32 x i16> %indices, i32 20
245 %index21 = extractelement <32 x i16> %indices, i32 21
246 %index22 = extractelement <32 x i16> %indices, i32 22
247 %index23 = extractelement <32 x i16> %indices, i32 23
248 %index24 = extractelement <32 x i16> %indices, i32 24
249 %index25 = extractelement <32 x i16> %indices, i32 25
250 %index26 = extractelement <32 x i16> %indices, i32 26
251 %index27 = extractelement <32 x i16> %indices, i32 27
252 %index28 = extractelement <32 x i16> %indices, i32 28
253 %index29 = extractelement <32 x i16> %indices, i32 29
254 %index30 = extractelement <32 x i16> %indices, i32 30
255 %index31 = extractelement <32 x i16> %indices, i32 31
256 %v0 = extractelement <32 x i16> %v, i16 %index0
257 %v1 = extractelement <32 x i16> %v, i16 %index1
258 %v2 = extractelement <32 x i16> %v, i16 %index2
259 %v3 = extractelement <32 x i16> %v, i16 %index3
260 %v4 = extractelement <32 x i16> %v, i16 %index4
261 %v5 = extractelement <32 x i16> %v, i16 %index5
262 %v6 = extractelement <32 x i16> %v, i16 %index6
263 %v7 = extractelement <32 x i16> %v, i16 %index7
264 %v8 = extractelement <32 x i16> %v, i16 %index8
265 %v9 = extractelement <32 x i16> %v, i16 %index9
266 %v10 = extractelement <32 x i16> %v, i16 %index10
267 %v11 = extractelement <32 x i16> %v, i16 %index11
268 %v12 = extractelement <32 x i16> %v, i16 %index12
269 %v13 = extractelement <32 x i16> %v, i16 %index13
270 %v14 = extractelement <32 x i16> %v, i16 %index14
271 %v15 = extractelement <32 x i16> %v, i16 %index15
272 %v16 = extractelement <32 x i16> %v, i16 %index16
273 %v17 = extractelement <32 x i16> %v, i16 %index17
274 %v18 = extractelement <32 x i16> %v, i16 %index18
275 %v19 = extractelement <32 x i16> %v, i16 %index19
276 %v20 = extractelement <32 x i16> %v, i16 %index20
277 %v21 = extractelement <32 x i16> %v, i16 %index21
278 %v22 = extractelement <32 x i16> %v, i16 %index22
279 %v23 = extractelement <32 x i16> %v, i16 %index23
280 %v24 = extractelement <32 x i16> %v, i16 %index24
281 %v25 = extractelement <32 x i16> %v, i16 %index25
282 %v26 = extractelement <32 x i16> %v, i16 %index26
283 %v27 = extractelement <32 x i16> %v, i16 %index27
284 %v28 = extractelement <32 x i16> %v, i16 %index28
285 %v29 = extractelement <32 x i16> %v, i16 %index29
286 %v30 = extractelement <32 x i16> %v, i16 %index30
287 %v31 = extractelement <32 x i16> %v, i16 %index31
288 %ret0 = insertelement <32 x i16> undef, i16 %v0, i32 0
289 %ret1 = insertelement <32 x i16> %ret0, i16 %v1, i32 1
290 %ret2 = insertelement <32 x i16> %ret1, i16 %v2, i32 2
291 %ret3 = insertelement <32 x i16> %ret2, i16 %v3, i32 3
292 %ret4 = insertelement <32 x i16> %ret3, i16 %v4, i32 4
293 %ret5 = insertelement <32 x i16> %ret4, i16 %v5, i32 5
294 %ret6 = insertelement <32 x i16> %ret5, i16 %v6, i32 6
295 %ret7 = insertelement <32 x i16> %ret6, i16 %v7, i32 7
296 %ret8 = insertelement <32 x i16> %ret7, i16 %v8, i32 8
297 %ret9 = insertelement <32 x i16> %ret8, i16 %v9, i32 9
298 %ret10 = insertelement <32 x i16> %ret9, i16 %v10, i32 10
299 %ret11 = insertelement <32 x i16> %ret10, i16 %v11, i32 11
300 %ret12 = insertelement <32 x i16> %ret11, i16 %v12, i32 12
301 %ret13 = insertelement <32 x i16> %ret12, i16 %v13, i32 13
302 %ret14 = insertelement <32 x i16> %ret13, i16 %v14, i32 14
303 %ret15 = insertelement <32 x i16> %ret14, i16 %v15, i32 15
304 %ret16 = insertelement <32 x i16> %ret15, i16 %v16, i32 16
305 %ret17 = insertelement <32 x i16> %ret16, i16 %v17, i32 17
306 %ret18 = insertelement <32 x i16> %ret17, i16 %v18, i32 18
307 %ret19 = insertelement <32 x i16> %ret18, i16 %v19, i32 19
308 %ret20 = insertelement <32 x i16> %ret19, i16 %v20, i32 20
309 %ret21 = insertelement <32 x i16> %ret20, i16 %v21, i32 21
310 %ret22 = insertelement <32 x i16> %ret21, i16 %v22, i32 22
311 %ret23 = insertelement <32 x i16> %ret22, i16 %v23, i32 23
312 %ret24 = insertelement <32 x i16> %ret23, i16 %v24, i32 24
313 %ret25 = insertelement <32 x i16> %ret24, i16 %v25, i32 25
314 %ret26 = insertelement <32 x i16> %ret25, i16 %v26, i32 26
315 %ret27 = insertelement <32 x i16> %ret26, i16 %v27, i32 27
316 %ret28 = insertelement <32 x i16> %ret27, i16 %v28, i32 28
317 %ret29 = insertelement <32 x i16> %ret28, i16 %v29, i32 29
318 %ret30 = insertelement <32 x i16> %ret29, i16 %v30, i32 30
319 %ret31 = insertelement <32 x i16> %ret30, i16 %v31, i32 31
320 ret <32 x i16> %ret31
321 }
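; 8-bit elements: a full byte permute (vpermb) only arrives with AVX512VBMI, so
; both the AVX512F and AVX512BW runs use the stack-based expansion, masking each
; index with 63 and inserting bytes with vpinsrb.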
323 define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
324 ; AVX512F-LABEL: var_shuffle_v64i8:
325 ; AVX512F: # %bb.0:
326 ; AVX512F-NEXT: pushq %rbp
327 ; AVX512F-NEXT: movq %rsp, %rbp
328 ; AVX512F-NEXT: andq $-64, %rsp
329 ; AVX512F-NEXT: subq $128, %rsp
330 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
331 ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
332 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
333 ; AVX512F-NEXT: vpextrb $0, %xmm4, %eax
334 ; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
335 ; AVX512F-NEXT: andl $63, %eax
336 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
337 ; AVX512F-NEXT: vmovd %eax, %xmm0
338 ; AVX512F-NEXT: vpextrb $1, %xmm4, %eax
339 ; AVX512F-NEXT: andl $63, %eax
340 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
341 ; AVX512F-NEXT: vpextrb $2, %xmm4, %eax
342 ; AVX512F-NEXT: andl $63, %eax
343 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
344 ; AVX512F-NEXT: vpextrb $3, %xmm4, %eax
345 ; AVX512F-NEXT: andl $63, %eax
346 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
347 ; AVX512F-NEXT: vpextrb $4, %xmm4, %eax
348 ; AVX512F-NEXT: andl $63, %eax
349 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
350 ; AVX512F-NEXT: vpextrb $5, %xmm4, %eax
351 ; AVX512F-NEXT: andl $63, %eax
352 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
353 ; AVX512F-NEXT: vpextrb $6, %xmm4, %eax
354 ; AVX512F-NEXT: andl $63, %eax
355 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
356 ; AVX512F-NEXT: vpextrb $7, %xmm4, %eax
357 ; AVX512F-NEXT: andl $63, %eax
358 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
359 ; AVX512F-NEXT: vpextrb $8, %xmm4, %eax
360 ; AVX512F-NEXT: andl $63, %eax
361 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
362 ; AVX512F-NEXT: vpextrb $9, %xmm4, %eax
363 ; AVX512F-NEXT: andl $63, %eax
364 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
365 ; AVX512F-NEXT: vpextrb $10, %xmm4, %eax
366 ; AVX512F-NEXT: andl $63, %eax
367 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
368 ; AVX512F-NEXT: vpextrb $11, %xmm4, %eax
369 ; AVX512F-NEXT: andl $63, %eax
370 ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
371 ; AVX512F-NEXT: vpextrb $12, %xmm4, %eax
372 ; AVX512F-NEXT: andl $63, %eax
373 ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
374 ; AVX512F-NEXT: vpextrb $13, %xmm4, %eax
375 ; AVX512F-NEXT: andl $63, %eax
376 ; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
377 ; AVX512F-NEXT: vpextrb $14, %xmm4, %eax
378 ; AVX512F-NEXT: andl $63, %eax
379 ; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
380 ; AVX512F-NEXT: vpextrb $15, %xmm4, %eax
381 ; AVX512F-NEXT: andl $63, %eax
382 ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
383 ; AVX512F-NEXT: vpextrb $0, %xmm3, %eax
384 ; AVX512F-NEXT: andl $63, %eax
385 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
386 ; AVX512F-NEXT: vmovd %eax, %xmm4
387 ; AVX512F-NEXT: vpextrb $1, %xmm3, %eax
388 ; AVX512F-NEXT: andl $63, %eax
389 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
390 ; AVX512F-NEXT: vpextrb $2, %xmm3, %eax
391 ; AVX512F-NEXT: andl $63, %eax
392 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
393 ; AVX512F-NEXT: vpextrb $3, %xmm3, %eax
394 ; AVX512F-NEXT: andl $63, %eax
395 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
396 ; AVX512F-NEXT: vpextrb $4, %xmm3, %eax
397 ; AVX512F-NEXT: andl $63, %eax
398 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
399 ; AVX512F-NEXT: vpextrb $5, %xmm3, %eax
400 ; AVX512F-NEXT: andl $63, %eax
401 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
402 ; AVX512F-NEXT: vpextrb $6, %xmm3, %eax
403 ; AVX512F-NEXT: andl $63, %eax
404 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
405 ; AVX512F-NEXT: vpextrb $7, %xmm3, %eax
406 ; AVX512F-NEXT: andl $63, %eax
407 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
408 ; AVX512F-NEXT: vpextrb $8, %xmm3, %eax
409 ; AVX512F-NEXT: andl $63, %eax
410 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
411 ; AVX512F-NEXT: vpextrb $9, %xmm3, %eax
412 ; AVX512F-NEXT: andl $63, %eax
413 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
414 ; AVX512F-NEXT: vpextrb $10, %xmm3, %eax
415 ; AVX512F-NEXT: andl $63, %eax
416 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
417 ; AVX512F-NEXT: vpextrb $11, %xmm3, %eax
418 ; AVX512F-NEXT: andl $63, %eax
419 ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
420 ; AVX512F-NEXT: vpextrb $12, %xmm3, %eax
421 ; AVX512F-NEXT: andl $63, %eax
422 ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
423 ; AVX512F-NEXT: vpextrb $13, %xmm3, %eax
424 ; AVX512F-NEXT: andl $63, %eax
425 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
426 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
427 ; AVX512F-NEXT: vpextrb $14, %xmm3, %eax
428 ; AVX512F-NEXT: andl $63, %eax
429 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
430 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
431 ; AVX512F-NEXT: vpextrb $15, %xmm3, %eax
432 ; AVX512F-NEXT: andl $63, %eax
433 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
434 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
435 ; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
436 ; AVX512F-NEXT: andl $63, %eax
437 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
438 ; AVX512F-NEXT: vmovd %eax, %xmm4
439 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
440 ; AVX512F-NEXT: andl $63, %eax
441 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
442 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
443 ; AVX512F-NEXT: andl $63, %eax
444 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
445 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
446 ; AVX512F-NEXT: andl $63, %eax
447 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
448 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
449 ; AVX512F-NEXT: andl $63, %eax
450 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
451 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
452 ; AVX512F-NEXT: andl $63, %eax
453 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
454 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
455 ; AVX512F-NEXT: andl $63, %eax
456 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
457 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
458 ; AVX512F-NEXT: andl $63, %eax
459 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
460 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
461 ; AVX512F-NEXT: andl $63, %eax
462 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
463 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
464 ; AVX512F-NEXT: andl $63, %eax
465 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
466 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
467 ; AVX512F-NEXT: andl $63, %eax
468 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
469 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
470 ; AVX512F-NEXT: andl $63, %eax
471 ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
472 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
473 ; AVX512F-NEXT: andl $63, %eax
474 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
475 ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
476 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
477 ; AVX512F-NEXT: andl $63, %eax
478 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
479 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
480 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
481 ; AVX512F-NEXT: andl $63, %eax
482 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
483 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
484 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
485 ; AVX512F-NEXT: andl $63, %eax
486 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
487 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
488 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
489 ; AVX512F-NEXT: andl $63, %eax
490 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
491 ; AVX512F-NEXT: vmovd %eax, %xmm4
492 ; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
493 ; AVX512F-NEXT: andl $63, %eax
494 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
495 ; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
496 ; AVX512F-NEXT: andl $63, %eax
497 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
498 ; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
499 ; AVX512F-NEXT: andl $63, %eax
500 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
501 ; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
502 ; AVX512F-NEXT: andl $63, %eax
503 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
504 ; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
505 ; AVX512F-NEXT: andl $63, %eax
506 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
507 ; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
508 ; AVX512F-NEXT: andl $63, %eax
509 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
510 ; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
511 ; AVX512F-NEXT: andl $63, %eax
512 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
513 ; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
514 ; AVX512F-NEXT: andl $63, %eax
515 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
516 ; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
517 ; AVX512F-NEXT: andl $63, %eax
518 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
519 ; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
520 ; AVX512F-NEXT: andl $63, %eax
521 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
522 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
523 ; AVX512F-NEXT: andl $63, %eax
524 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
525 ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
526 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
527 ; AVX512F-NEXT: andl $63, %eax
528 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
529 ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
530 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
531 ; AVX512F-NEXT: andl $63, %eax
532 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
533 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
534 ; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
535 ; AVX512F-NEXT: andl $63, %eax
536 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
537 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
538 ; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
539 ; AVX512F-NEXT: andl $63, %eax
540 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
541 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
542 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
543 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
544 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
545 ; AVX512F-NEXT: movq %rbp, %rsp
546 ; AVX512F-NEXT: popq %rbp
547 ; AVX512F-NEXT: retq
548 ;
549 ; AVX512BW-LABEL: var_shuffle_v64i8:
550 ; AVX512BW: # %bb.0:
551 ; AVX512BW-NEXT: pushq %rbp
552 ; AVX512BW-NEXT: movq %rsp, %rbp
553 ; AVX512BW-NEXT: andq $-64, %rsp
554 ; AVX512BW-NEXT: subq $128, %rsp
555 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
556 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
557 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
558 ; AVX512BW-NEXT: vpextrb $0, %xmm4, %eax
559 ; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
560 ; AVX512BW-NEXT: andl $63, %eax
561 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
562 ; AVX512BW-NEXT: vmovd %eax, %xmm0
563 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
564 ; AVX512BW-NEXT: andl $63, %eax
565 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
566 ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
567 ; AVX512BW-NEXT: andl $63, %eax
568 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
569 ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
570 ; AVX512BW-NEXT: andl $63, %eax
571 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
572 ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
573 ; AVX512BW-NEXT: andl $63, %eax
574 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
575 ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
576 ; AVX512BW-NEXT: andl $63, %eax
577 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
578 ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
579 ; AVX512BW-NEXT: andl $63, %eax
580 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
581 ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
582 ; AVX512BW-NEXT: andl $63, %eax
583 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
584 ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
585 ; AVX512BW-NEXT: andl $63, %eax
586 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
587 ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
588 ; AVX512BW-NEXT: andl $63, %eax
589 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
590 ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
591 ; AVX512BW-NEXT: andl $63, %eax
592 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
593 ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
594 ; AVX512BW-NEXT: andl $63, %eax
595 ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
596 ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
597 ; AVX512BW-NEXT: andl $63, %eax
598 ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
599 ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
600 ; AVX512BW-NEXT: andl $63, %eax
601 ; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
602 ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
603 ; AVX512BW-NEXT: andl $63, %eax
604 ; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
605 ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
606 ; AVX512BW-NEXT: andl $63, %eax
607 ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
608 ; AVX512BW-NEXT: vpextrb $0, %xmm3, %eax
609 ; AVX512BW-NEXT: andl $63, %eax
610 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
611 ; AVX512BW-NEXT: vmovd %eax, %xmm4
612 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
613 ; AVX512BW-NEXT: andl $63, %eax
614 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
615 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
616 ; AVX512BW-NEXT: andl $63, %eax
617 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
618 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
619 ; AVX512BW-NEXT: andl $63, %eax
620 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
621 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
622 ; AVX512BW-NEXT: andl $63, %eax
623 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
624 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
625 ; AVX512BW-NEXT: andl $63, %eax
626 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
627 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
628 ; AVX512BW-NEXT: andl $63, %eax
629 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
630 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
631 ; AVX512BW-NEXT: andl $63, %eax
632 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
633 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
634 ; AVX512BW-NEXT: andl $63, %eax
635 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
636 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
637 ; AVX512BW-NEXT: andl $63, %eax
638 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
639 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
640 ; AVX512BW-NEXT: andl $63, %eax
641 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
642 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
643 ; AVX512BW-NEXT: andl $63, %eax
644 ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
645 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
646 ; AVX512BW-NEXT: andl $63, %eax
647 ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
648 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
649 ; AVX512BW-NEXT: andl $63, %eax
650 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
651 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
652 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
653 ; AVX512BW-NEXT: andl $63, %eax
654 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
655 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
656 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
657 ; AVX512BW-NEXT: andl $63, %eax
658 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
659 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
660 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
661 ; AVX512BW-NEXT: andl $63, %eax
662 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
663 ; AVX512BW-NEXT: vmovd %eax, %xmm4
664 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
665 ; AVX512BW-NEXT: andl $63, %eax
666 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
667 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
668 ; AVX512BW-NEXT: andl $63, %eax
669 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
670 ; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
671 ; AVX512BW-NEXT: andl $63, %eax
672 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
673 ; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
674 ; AVX512BW-NEXT: andl $63, %eax
675 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
676 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
677 ; AVX512BW-NEXT: andl $63, %eax
678 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
679 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
680 ; AVX512BW-NEXT: andl $63, %eax
681 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
682 ; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
683 ; AVX512BW-NEXT: andl $63, %eax
684 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
685 ; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
686 ; AVX512BW-NEXT: andl $63, %eax
687 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
688 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
689 ; AVX512BW-NEXT: andl $63, %eax
690 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
691 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
692 ; AVX512BW-NEXT: andl $63, %eax
693 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
694 ; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
695 ; AVX512BW-NEXT: andl $63, %eax
696 ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
697 ; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
698 ; AVX512BW-NEXT: andl $63, %eax
699 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
700 ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
701 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
702 ; AVX512BW-NEXT: andl $63, %eax
703 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
704 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
705 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
706 ; AVX512BW-NEXT: andl $63, %eax
707 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
708 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
709 ; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
710 ; AVX512BW-NEXT: andl $63, %eax
711 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
712 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
713 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
714 ; AVX512BW-NEXT: andl $63, %eax
715 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
716 ; AVX512BW-NEXT: vmovd %eax, %xmm4
717 ; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
718 ; AVX512BW-NEXT: andl $63, %eax
719 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
720 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
721 ; AVX512BW-NEXT: andl $63, %eax
722 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
723 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
724 ; AVX512BW-NEXT: andl $63, %eax
725 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
726 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
727 ; AVX512BW-NEXT: andl $63, %eax
728 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
729 ; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
730 ; AVX512BW-NEXT: andl $63, %eax
731 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
732 ; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
733 ; AVX512BW-NEXT: andl $63, %eax
734 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
735 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
736 ; AVX512BW-NEXT: andl $63, %eax
737 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
738 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
739 ; AVX512BW-NEXT: andl $63, %eax
740 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
741 ; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
742 ; AVX512BW-NEXT: andl $63, %eax
743 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
744 ; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
745 ; AVX512BW-NEXT: andl $63, %eax
746 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
747 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
748 ; AVX512BW-NEXT: andl $63, %eax
749 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
750 ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
751 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
752 ; AVX512BW-NEXT: andl $63, %eax
753 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
754 ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
755 ; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
756 ; AVX512BW-NEXT: andl $63, %eax
757 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
758 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
759 ; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
760 ; AVX512BW-NEXT: andl $63, %eax
761 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
762 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
763 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
764 ; AVX512BW-NEXT: andl $63, %eax
765 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
766 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
767 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
768 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
769 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
770 ; AVX512BW-NEXT: movq %rbp, %rsp
771 ; AVX512BW-NEXT: popq %rbp
772 ; AVX512BW-NEXT: retq
774 ; AVX512VBMI-LABEL: var_shuffle_v64i8:
775 ; AVX512VBMI: # %bb.0:
776 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
777 ; AVX512VBMI-NEXT: retq
778 %index0 = extractelement <64 x i8> %indices, i32 0
779 %index1 = extractelement <64 x i8> %indices, i32 1
780 %index2 = extractelement <64 x i8> %indices, i32 2
781 %index3 = extractelement <64 x i8> %indices, i32 3
782 %index4 = extractelement <64 x i8> %indices, i32 4
783 %index5 = extractelement <64 x i8> %indices, i32 5
784 %index6 = extractelement <64 x i8> %indices, i32 6
785 %index7 = extractelement <64 x i8> %indices, i32 7
786 %index8 = extractelement <64 x i8> %indices, i32 8
787 %index9 = extractelement <64 x i8> %indices, i32 9
788 %index10 = extractelement <64 x i8> %indices, i32 10
789 %index11 = extractelement <64 x i8> %indices, i32 11
790 %index12 = extractelement <64 x i8> %indices, i32 12
791 %index13 = extractelement <64 x i8> %indices, i32 13
792 %index14 = extractelement <64 x i8> %indices, i32 14
793 %index15 = extractelement <64 x i8> %indices, i32 15
794 %index16 = extractelement <64 x i8> %indices, i32 16
795 %index17 = extractelement <64 x i8> %indices, i32 17
796 %index18 = extractelement <64 x i8> %indices, i32 18
797 %index19 = extractelement <64 x i8> %indices, i32 19
798 %index20 = extractelement <64 x i8> %indices, i32 20
799 %index21 = extractelement <64 x i8> %indices, i32 21
800 %index22 = extractelement <64 x i8> %indices, i32 22
801 %index23 = extractelement <64 x i8> %indices, i32 23
802 %index24 = extractelement <64 x i8> %indices, i32 24
803 %index25 = extractelement <64 x i8> %indices, i32 25
804 %index26 = extractelement <64 x i8> %indices, i32 26
805 %index27 = extractelement <64 x i8> %indices, i32 27
806 %index28 = extractelement <64 x i8> %indices, i32 28
807 %index29 = extractelement <64 x i8> %indices, i32 29
808 %index30 = extractelement <64 x i8> %indices, i32 30
809 %index31 = extractelement <64 x i8> %indices, i32 31
810 %index32 = extractelement <64 x i8> %indices, i32 32
811 %index33 = extractelement <64 x i8> %indices, i32 33
812 %index34 = extractelement <64 x i8> %indices, i32 34
813 %index35 = extractelement <64 x i8> %indices, i32 35
814 %index36 = extractelement <64 x i8> %indices, i32 36
815 %index37 = extractelement <64 x i8> %indices, i32 37
816 %index38 = extractelement <64 x i8> %indices, i32 38
817 %index39 = extractelement <64 x i8> %indices, i32 39
818 %index40 = extractelement <64 x i8> %indices, i32 40
819 %index41 = extractelement <64 x i8> %indices, i32 41
820 %index42 = extractelement <64 x i8> %indices, i32 42
821 %index43 = extractelement <64 x i8> %indices, i32 43
822 %index44 = extractelement <64 x i8> %indices, i32 44
823 %index45 = extractelement <64 x i8> %indices, i32 45
824 %index46 = extractelement <64 x i8> %indices, i32 46
825 %index47 = extractelement <64 x i8> %indices, i32 47
826 %index48 = extractelement <64 x i8> %indices, i32 48
827 %index49 = extractelement <64 x i8> %indices, i32 49
828 %index50 = extractelement <64 x i8> %indices, i32 50
829 %index51 = extractelement <64 x i8> %indices, i32 51
830 %index52 = extractelement <64 x i8> %indices, i32 52
831 %index53 = extractelement <64 x i8> %indices, i32 53
832 %index54 = extractelement <64 x i8> %indices, i32 54
833 %index55 = extractelement <64 x i8> %indices, i32 55
834 %index56 = extractelement <64 x i8> %indices, i32 56
835 %index57 = extractelement <64 x i8> %indices, i32 57
836 %index58 = extractelement <64 x i8> %indices, i32 58
837 %index59 = extractelement <64 x i8> %indices, i32 59
838 %index60 = extractelement <64 x i8> %indices, i32 60
839 %index61 = extractelement <64 x i8> %indices, i32 61
840 %index62 = extractelement <64 x i8> %indices, i32 62
841 %index63 = extractelement <64 x i8> %indices, i32 63
842 %v0 = extractelement <64 x i8> %v, i8 %index0
843 %v1 = extractelement <64 x i8> %v, i8 %index1
844 %v2 = extractelement <64 x i8> %v, i8 %index2
845 %v3 = extractelement <64 x i8> %v, i8 %index3
846 %v4 = extractelement <64 x i8> %v, i8 %index4
847 %v5 = extractelement <64 x i8> %v, i8 %index5
848 %v6 = extractelement <64 x i8> %v, i8 %index6
849 %v7 = extractelement <64 x i8> %v, i8 %index7
850 %v8 = extractelement <64 x i8> %v, i8 %index8
851 %v9 = extractelement <64 x i8> %v, i8 %index9
852 %v10 = extractelement <64 x i8> %v, i8 %index10
853 %v11 = extractelement <64 x i8> %v, i8 %index11
854 %v12 = extractelement <64 x i8> %v, i8 %index12
855 %v13 = extractelement <64 x i8> %v, i8 %index13
856 %v14 = extractelement <64 x i8> %v, i8 %index14
857 %v15 = extractelement <64 x i8> %v, i8 %index15
858 %v16 = extractelement <64 x i8> %v, i8 %index16
859 %v17 = extractelement <64 x i8> %v, i8 %index17
860 %v18 = extractelement <64 x i8> %v, i8 %index18
861 %v19 = extractelement <64 x i8> %v, i8 %index19
862 %v20 = extractelement <64 x i8> %v, i8 %index20
863 %v21 = extractelement <64 x i8> %v, i8 %index21
864 %v22 = extractelement <64 x i8> %v, i8 %index22
865 %v23 = extractelement <64 x i8> %v, i8 %index23
866 %v24 = extractelement <64 x i8> %v, i8 %index24
867 %v25 = extractelement <64 x i8> %v, i8 %index25
868 %v26 = extractelement <64 x i8> %v, i8 %index26
869 %v27 = extractelement <64 x i8> %v, i8 %index27
870 %v28 = extractelement <64 x i8> %v, i8 %index28
871 %v29 = extractelement <64 x i8> %v, i8 %index29
872 %v30 = extractelement <64 x i8> %v, i8 %index30
873 %v31 = extractelement <64 x i8> %v, i8 %index31
874 %v32 = extractelement <64 x i8> %v, i8 %index32
875 %v33 = extractelement <64 x i8> %v, i8 %index33
876 %v34 = extractelement <64 x i8> %v, i8 %index34
877 %v35 = extractelement <64 x i8> %v, i8 %index35
878 %v36 = extractelement <64 x i8> %v, i8 %index36
879 %v37 = extractelement <64 x i8> %v, i8 %index37
880 %v38 = extractelement <64 x i8> %v, i8 %index38
881 %v39 = extractelement <64 x i8> %v, i8 %index39
882 %v40 = extractelement <64 x i8> %v, i8 %index40
883 %v41 = extractelement <64 x i8> %v, i8 %index41
884 %v42 = extractelement <64 x i8> %v, i8 %index42
885 %v43 = extractelement <64 x i8> %v, i8 %index43
886 %v44 = extractelement <64 x i8> %v, i8 %index44
887 %v45 = extractelement <64 x i8> %v, i8 %index45
888 %v46 = extractelement <64 x i8> %v, i8 %index46
889 %v47 = extractelement <64 x i8> %v, i8 %index47
890 %v48 = extractelement <64 x i8> %v, i8 %index48
891 %v49 = extractelement <64 x i8> %v, i8 %index49
892 %v50 = extractelement <64 x i8> %v, i8 %index50
893 %v51 = extractelement <64 x i8> %v, i8 %index51
894 %v52 = extractelement <64 x i8> %v, i8 %index52
895 %v53 = extractelement <64 x i8> %v, i8 %index53
896 %v54 = extractelement <64 x i8> %v, i8 %index54
897 %v55 = extractelement <64 x i8> %v, i8 %index55
898 %v56 = extractelement <64 x i8> %v, i8 %index56
899 %v57 = extractelement <64 x i8> %v, i8 %index57
900 %v58 = extractelement <64 x i8> %v, i8 %index58
901 %v59 = extractelement <64 x i8> %v, i8 %index59
902 %v60 = extractelement <64 x i8> %v, i8 %index60
903 %v61 = extractelement <64 x i8> %v, i8 %index61
904 %v62 = extractelement <64 x i8> %v, i8 %index62
905 %v63 = extractelement <64 x i8> %v, i8 %index63
906 %ret0 = insertelement <64 x i8> undef, i8 %v0, i32 0
907 %ret1 = insertelement <64 x i8> %ret0, i8 %v1, i32 1
908 %ret2 = insertelement <64 x i8> %ret1, i8 %v2, i32 2
909 %ret3 = insertelement <64 x i8> %ret2, i8 %v3, i32 3
910 %ret4 = insertelement <64 x i8> %ret3, i8 %v4, i32 4
911 %ret5 = insertelement <64 x i8> %ret4, i8 %v5, i32 5
912 %ret6 = insertelement <64 x i8> %ret5, i8 %v6, i32 6
913 %ret7 = insertelement <64 x i8> %ret6, i8 %v7, i32 7
914 %ret8 = insertelement <64 x i8> %ret7, i8 %v8, i32 8
915 %ret9 = insertelement <64 x i8> %ret8, i8 %v9, i32 9
916 %ret10 = insertelement <64 x i8> %ret9, i8 %v10, i32 10
917 %ret11 = insertelement <64 x i8> %ret10, i8 %v11, i32 11
918 %ret12 = insertelement <64 x i8> %ret11, i8 %v12, i32 12
919 %ret13 = insertelement <64 x i8> %ret12, i8 %v13, i32 13
920 %ret14 = insertelement <64 x i8> %ret13, i8 %v14, i32 14
921 %ret15 = insertelement <64 x i8> %ret14, i8 %v15, i32 15
922 %ret16 = insertelement <64 x i8> %ret15, i8 %v16, i32 16
923 %ret17 = insertelement <64 x i8> %ret16, i8 %v17, i32 17
924 %ret18 = insertelement <64 x i8> %ret17, i8 %v18, i32 18
925 %ret19 = insertelement <64 x i8> %ret18, i8 %v19, i32 19
926 %ret20 = insertelement <64 x i8> %ret19, i8 %v20, i32 20
927 %ret21 = insertelement <64 x i8> %ret20, i8 %v21, i32 21
928 %ret22 = insertelement <64 x i8> %ret21, i8 %v22, i32 22
929 %ret23 = insertelement <64 x i8> %ret22, i8 %v23, i32 23
930 %ret24 = insertelement <64 x i8> %ret23, i8 %v24, i32 24
931 %ret25 = insertelement <64 x i8> %ret24, i8 %v25, i32 25
932 %ret26 = insertelement <64 x i8> %ret25, i8 %v26, i32 26
933 %ret27 = insertelement <64 x i8> %ret26, i8 %v27, i32 27
934 %ret28 = insertelement <64 x i8> %ret27, i8 %v28, i32 28
935 %ret29 = insertelement <64 x i8> %ret28, i8 %v29, i32 29
936 %ret30 = insertelement <64 x i8> %ret29, i8 %v30, i32 30
937 %ret31 = insertelement <64 x i8> %ret30, i8 %v31, i32 31
938 %ret32 = insertelement <64 x i8> %ret31, i8 %v32, i32 32
939 %ret33 = insertelement <64 x i8> %ret32, i8 %v33, i32 33
940 %ret34 = insertelement <64 x i8> %ret33, i8 %v34, i32 34
941 %ret35 = insertelement <64 x i8> %ret34, i8 %v35, i32 35
942 %ret36 = insertelement <64 x i8> %ret35, i8 %v36, i32 36
943 %ret37 = insertelement <64 x i8> %ret36, i8 %v37, i32 37
944 %ret38 = insertelement <64 x i8> %ret37, i8 %v38, i32 38
945 %ret39 = insertelement <64 x i8> %ret38, i8 %v39, i32 39
946 %ret40 = insertelement <64 x i8> %ret39, i8 %v40, i32 40
947 %ret41 = insertelement <64 x i8> %ret40, i8 %v41, i32 41
948 %ret42 = insertelement <64 x i8> %ret41, i8 %v42, i32 42
949 %ret43 = insertelement <64 x i8> %ret42, i8 %v43, i32 43
950 %ret44 = insertelement <64 x i8> %ret43, i8 %v44, i32 44
951 %ret45 = insertelement <64 x i8> %ret44, i8 %v45, i32 45
952 %ret46 = insertelement <64 x i8> %ret45, i8 %v46, i32 46
953 %ret47 = insertelement <64 x i8> %ret46, i8 %v47, i32 47
954 %ret48 = insertelement <64 x i8> %ret47, i8 %v48, i32 48
955 %ret49 = insertelement <64 x i8> %ret48, i8 %v49, i32 49
956 %ret50 = insertelement <64 x i8> %ret49, i8 %v50, i32 50
957 %ret51 = insertelement <64 x i8> %ret50, i8 %v51, i32 51
958 %ret52 = insertelement <64 x i8> %ret51, i8 %v52, i32 52
959 %ret53 = insertelement <64 x i8> %ret52, i8 %v53, i32 53
960 %ret54 = insertelement <64 x i8> %ret53, i8 %v54, i32 54
961 %ret55 = insertelement <64 x i8> %ret54, i8 %v55, i32 55
962 %ret56 = insertelement <64 x i8> %ret55, i8 %v56, i32 56
963 %ret57 = insertelement <64 x i8> %ret56, i8 %v57, i32 57
964 %ret58 = insertelement <64 x i8> %ret57, i8 %v58, i32 58
965 %ret59 = insertelement <64 x i8> %ret58, i8 %v59, i32 59
966 %ret60 = insertelement <64 x i8> %ret59, i8 %v60, i32 60
967 %ret61 = insertelement <64 x i8> %ret60, i8 %v61, i32 61
968 %ret62 = insertelement <64 x i8> %ret61, i8 %v62, i32 62
969 %ret63 = insertelement <64 x i8> %ret62, i8 %v63, i32 63
970 ret <64 x i8> %ret63
971 }
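; The floating-point variants lower the same way as the 64- and 32-bit integer
; tests above: one vpermpd for <8 x double> and one vpermps for <16 x float>.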
973 define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
974 ; AVX512-LABEL: var_shuffle_v8f64:
975 ; AVX512: # %bb.0:
976 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
977 ; AVX512-NEXT: retq
978 %index0 = extractelement <8 x i64> %indices, i32 0
979 %index1 = extractelement <8 x i64> %indices, i32 1
980 %index2 = extractelement <8 x i64> %indices, i32 2
981 %index3 = extractelement <8 x i64> %indices, i32 3
982 %index4 = extractelement <8 x i64> %indices, i32 4
983 %index5 = extractelement <8 x i64> %indices, i32 5
984 %index6 = extractelement <8 x i64> %indices, i32 6
985 %index7 = extractelement <8 x i64> %indices, i32 7
986 %v0 = extractelement <8 x double> %v, i64 %index0
987 %v1 = extractelement <8 x double> %v, i64 %index1
988 %v2 = extractelement <8 x double> %v, i64 %index2
989 %v3 = extractelement <8 x double> %v, i64 %index3
990 %v4 = extractelement <8 x double> %v, i64 %index4
991 %v5 = extractelement <8 x double> %v, i64 %index5
992 %v6 = extractelement <8 x double> %v, i64 %index6
993 %v7 = extractelement <8 x double> %v, i64 %index7
994 %ret0 = insertelement <8 x double> undef, double %v0, i32 0
995 %ret1 = insertelement <8 x double> %ret0, double %v1, i32 1
996 %ret2 = insertelement <8 x double> %ret1, double %v2, i32 2
997 %ret3 = insertelement <8 x double> %ret2, double %v3, i32 3
998 %ret4 = insertelement <8 x double> %ret3, double %v4, i32 4
999 %ret5 = insertelement <8 x double> %ret4, double %v5, i32 5
1000 %ret6 = insertelement <8 x double> %ret5, double %v6, i32 6
1001 %ret7 = insertelement <8 x double> %ret6, double %v7, i32 7
1002 ret <8 x double> %ret7
1003 }
1005 define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
1006 ; AVX512-LABEL: var_shuffle_v16f32:
1007 ; AVX512: # %bb.0:
1008 ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
1009 ; AVX512-NEXT: retq
1010 %index0 = extractelement <16 x i32> %indices, i32 0
1011 %index1 = extractelement <16 x i32> %indices, i32 1
1012 %index2 = extractelement <16 x i32> %indices, i32 2
1013 %index3 = extractelement <16 x i32> %indices, i32 3
1014 %index4 = extractelement <16 x i32> %indices, i32 4
1015 %index5 = extractelement <16 x i32> %indices, i32 5
1016 %index6 = extractelement <16 x i32> %indices, i32 6
1017 %index7 = extractelement <16 x i32> %indices, i32 7
1018 %index8 = extractelement <16 x i32> %indices, i32 8
1019 %index9 = extractelement <16 x i32> %indices, i32 9
1020 %index10 = extractelement <16 x i32> %indices, i32 10
1021 %index11 = extractelement <16 x i32> %indices, i32 11
1022 %index12 = extractelement <16 x i32> %indices, i32 12
1023 %index13 = extractelement <16 x i32> %indices, i32 13
1024 %index14 = extractelement <16 x i32> %indices, i32 14
1025 %index15 = extractelement <16 x i32> %indices, i32 15
1026 %v0 = extractelement <16 x float> %v, i32 %index0
1027 %v1 = extractelement <16 x float> %v, i32 %index1
1028 %v2 = extractelement <16 x float> %v, i32 %index2
1029 %v3 = extractelement <16 x float> %v, i32 %index3
1030 %v4 = extractelement <16 x float> %v, i32 %index4
1031 %v5 = extractelement <16 x float> %v, i32 %index5
1032 %v6 = extractelement <16 x float> %v, i32 %index6
1033 %v7 = extractelement <16 x float> %v, i32 %index7
1034 %v8 = extractelement <16 x float> %v, i32 %index8
1035 %v9 = extractelement <16 x float> %v, i32 %index9
1036 %v10 = extractelement <16 x float> %v, i32 %index10
1037 %v11 = extractelement <16 x float> %v, i32 %index11
1038 %v12 = extractelement <16 x float> %v, i32 %index12
1039 %v13 = extractelement <16 x float> %v, i32 %index13
1040 %v14 = extractelement <16 x float> %v, i32 %index14
1041 %v15 = extractelement <16 x float> %v, i32 %index15
1042 %ret0 = insertelement <16 x float> undef, float %v0, i32 0
1043 %ret1 = insertelement <16 x float> %ret0, float %v1, i32 1
1044 %ret2 = insertelement <16 x float> %ret1, float %v2, i32 2
1045 %ret3 = insertelement <16 x float> %ret2, float %v3, i32 3
1046 %ret4 = insertelement <16 x float> %ret3, float %v4, i32 4
1047 %ret5 = insertelement <16 x float> %ret4, float %v5, i32 5
1048 %ret6 = insertelement <16 x float> %ret5, float %v6, i32 6
1049 %ret7 = insertelement <16 x float> %ret6, float %v7, i32 7
1050 %ret8 = insertelement <16 x float> %ret7, float %v8, i32 8
1051 %ret9 = insertelement <16 x float> %ret8, float %v9, i32 9
1052 %ret10 = insertelement <16 x float> %ret9, float %v10, i32 10
1053 %ret11 = insertelement <16 x float> %ret10, float %v11, i32 11
1054 %ret12 = insertelement <16 x float> %ret11, float %v12, i32 12
1055 %ret13 = insertelement <16 x float> %ret12, float %v13, i32 13
1056 %ret14 = insertelement <16 x float> %ret13, float %v14, i32 14
1057 %ret15 = insertelement <16 x float> %ret14, float %v15, i32 15
1058 ret <16 x float> %ret15
1059 }
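; Variable shuffle feeding a conversion, with indices derived from the scalar %b.
; In the AVX512F lowering shown below, %b is broadcast, constant offset vectors
; are added to form the per-lane indices, and the selected bytes are gathered
; from a stack spill of %src.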
1061 define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) nounwind {
1062 ; AVX512F-LABEL: var_cvt_shuffle_v64f32_v64i8_idx:
1063 ; AVX512F: # %bb.0:
1064 ; AVX512F-NEXT: pushq %rbp
1065 ; AVX512F-NEXT: movq %rsp, %rbp
1066 ; AVX512F-NEXT: andq $-64, %rsp
1067 ; AVX512F-NEXT: subq $128, %rsp
1068 ; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
1069 ; AVX512F-NEXT: vpbroadcastd %esi, %zmm2
1070 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
1071 ; AVX512F-NEXT: vmovd %xmm1, %eax
1072 ; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
1073 ; AVX512F-NEXT: andl $63, %eax
1074 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1075 ; AVX512F-NEXT: vmovd %eax, %xmm0
1076 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
1077 ; AVX512F-NEXT: andl $63, %eax
1078 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
1079 ; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
1080 ; AVX512F-NEXT: andl $63, %eax
1081 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
1082 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
1083 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
1084 ; AVX512F-NEXT: andl $63, %eax
1085 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
1086 ; AVX512F-NEXT: vmovd %xmm3, %eax
1087 ; AVX512F-NEXT: andl $63, %eax
1088 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
1089 ; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
1090 ; AVX512F-NEXT: andl $63, %eax
1091 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
1092 ; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
1093 ; AVX512F-NEXT: andl $63, %eax
1094 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
1095 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
1096 ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
1097 ; AVX512F-NEXT: andl $63, %eax
1098 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
1099 ; AVX512F-NEXT: vmovd %xmm5, %eax
1100 ; AVX512F-NEXT: andl $63, %eax
1101 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
1102 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
1103 ; AVX512F-NEXT: andl $63, %eax
1104 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
1105 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
1106 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
1107 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
1108 ; AVX512F-NEXT: andl $63, %esi
1109 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0
1110 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
1111 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1
1112 ; AVX512F-NEXT: andl $63, %eax
1113 ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
1114 ; AVX512F-NEXT: vmovd %xmm1, %eax
1115 ; AVX512F-NEXT: andl $63, %eax
1116 ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
1117 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
1118 ; AVX512F-NEXT: andl $63, %eax
1119 ; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
1120 ; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
1121 ; AVX512F-NEXT: andl $63, %eax
1122 ; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
1123 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
1124 ; AVX512F-NEXT: andl $63, %eax
1125 ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
1126 ; AVX512F-NEXT: vmovd %xmm4, %eax
1127 ; AVX512F-NEXT: andl $63, %eax
1128 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1129 ; AVX512F-NEXT: vmovd %eax, %xmm1
1130 ; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
1131 ; AVX512F-NEXT: andl $63, %eax
1132 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1
1133 ; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
1134 ; AVX512F-NEXT: andl $63, %eax
1135 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1
1136 ; AVX512F-NEXT: vpextrd $3, %xmm4, %eax
1137 ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
1138 ; AVX512F-NEXT: andl $63, %eax
1139 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1
1140 ; AVX512F-NEXT: vmovd %xmm5, %eax
1141 ; AVX512F-NEXT: andl $63, %eax
1142 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1
1143 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
1144 ; AVX512F-NEXT: andl $63, %eax
1145 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1
1146 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
1147 ; AVX512F-NEXT: andl $63, %eax
1148 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1
1149 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
1150 ; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5
1151 ; AVX512F-NEXT: andl $63, %eax
1152 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1
1153 ; AVX512F-NEXT: vmovd %xmm5, %eax
1154 ; AVX512F-NEXT: andl $63, %eax
1155 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1
1156 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
1157 ; AVX512F-NEXT: andl $63, %eax
1158 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1
1159 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
1160 ; AVX512F-NEXT: andl $63, %eax
1161 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6
1162 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
1163 ; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1
1164 ; AVX512F-NEXT: andl $63, %eax
1165 ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4
1166 ; AVX512F-NEXT: vmovd %xmm1, %eax
1167 ; AVX512F-NEXT: andl $63, %eax
1168 ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
1169 ; AVX512F-NEXT: vmovd %xmm3, %eax
1170 ; AVX512F-NEXT: andl $63, %eax
1171 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1172 ; AVX512F-NEXT: vmovd %eax, %xmm5
1173 ; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
1174 ; AVX512F-NEXT: andl $63, %eax
1175 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5
1176 ; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
1177 ; AVX512F-NEXT: andl $63, %eax
1178 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5
1179 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
1180 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
1181 ; AVX512F-NEXT: andl $63, %eax
1182 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5
1183 ; AVX512F-NEXT: vmovd %xmm6, %eax
1184 ; AVX512F-NEXT: andl $63, %eax
1185 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5
1186 ; AVX512F-NEXT: vpextrd $1, %xmm6, %eax
1187 ; AVX512F-NEXT: andl $63, %eax
1188 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5
1189 ; AVX512F-NEXT: vpextrd $2, %xmm6, %eax
1190 ; AVX512F-NEXT: andl $63, %eax
1191 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7
1192 ; AVX512F-NEXT: vpextrd $3, %xmm6, %eax
1193 ; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5
1194 ; AVX512F-NEXT: andl $63, %eax
1195 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6
1196 ; AVX512F-NEXT: vmovd %xmm5, %eax
1197 ; AVX512F-NEXT: andl $63, %eax
1198 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6
1199 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
1200 ; AVX512F-NEXT: andl $63, %eax
1201 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6
1202 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
1203 ; AVX512F-NEXT: andl $63, %eax
1204 ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6
1205 ; AVX512F-NEXT: vmovd %xmm2, %eax
1206 ; AVX512F-NEXT: andl $63, %eax
1207 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1208 ; AVX512F-NEXT: vmovd %eax, %xmm7
1209 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
1210 ; AVX512F-NEXT: andl $63, %eax
1211 ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7
1212 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
1213 ; AVX512F-NEXT: andl $63, %eax
1214 ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7
1215 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
1216 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm8
1217 ; AVX512F-NEXT: andl $63, %eax
1218 ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7
1219 ; AVX512F-NEXT: vmovd %xmm8, %eax
1220 ; AVX512F-NEXT: andl $63, %eax
1221 ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7
1222 ; AVX512F-NEXT: vpextrd $1, %xmm8, %eax
1223 ; AVX512F-NEXT: andl $63, %eax
1224 ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7
1225 ; AVX512F-NEXT: vpextrd $2, %xmm8, %eax
1226 ; AVX512F-NEXT: andl $63, %eax
1227 ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7
1228 ; AVX512F-NEXT: vpextrd $3, %xmm8, %eax
1229 ; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm8
1230 ; AVX512F-NEXT: andl $63, %eax
1231 ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7
1232 ; AVX512F-NEXT: vmovd %xmm8, %eax
1233 ; AVX512F-NEXT: andl $63, %eax
1234 ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7
1235 ; AVX512F-NEXT: vpextrd $1, %xmm8, %eax
1236 ; AVX512F-NEXT: andl $63, %eax
1237 ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7
1238 ; AVX512F-NEXT: vpextrd $2, %xmm8, %eax
1239 ; AVX512F-NEXT: andl $63, %eax
1240 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1241 ; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
1242 ; AVX512F-NEXT: vpextrd $3, %xmm8, %eax
1243 ; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2
1244 ; AVX512F-NEXT: andl $63, %eax
1245 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1246 ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7
1247 ; AVX512F-NEXT: vmovd %xmm2, %eax
1248 ; AVX512F-NEXT: andl $63, %eax
1249 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1250 ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7
1251 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
1252 ; AVX512F-NEXT: andl $63, %eax
1253 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1254 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7
1255 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
1256 ; AVX512F-NEXT: andl $63, %eax
1257 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1258 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7
1259 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
1260 ; AVX512F-NEXT: andl $63, %eax
1261 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1262 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2
1263 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
1264 ; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3
1265 ; AVX512F-NEXT: andl $63, %eax
1266 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1267 ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5
1268 ; AVX512F-NEXT: vmovd %xmm3, %eax
1269 ; AVX512F-NEXT: andl $63, %eax
1270 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1271 ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
1272 ; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
1273 ; AVX512F-NEXT: andl $63, %eax
1274 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1275 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
1276 ; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
1277 ; AVX512F-NEXT: andl $63, %eax
1278 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1279 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
1280 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
1281 ; AVX512F-NEXT: andl $63, %eax
1282 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1283 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
1284 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
1285 ; AVX512F-NEXT: andl $63, %eax
1286 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1287 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
1288 ; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
1289 ; AVX512F-NEXT: andl $63, %eax
1290 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1291 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
1292 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
1293 ; AVX512F-NEXT: andl $63, %eax
1294 ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
1295 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
1296 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
1297 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2
1298 ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
1299 ; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3
1300 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
1301 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1
1302 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
1303 ; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0
1304 ; AVX512F-NEXT: vmovaps %zmm0, 192(%rdi)
1305 ; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi)
1306 ; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi)
1307 ; AVX512F-NEXT: vmovaps %zmm2, (%rdi)
1308 ; AVX512F-NEXT: movq %rbp, %rsp
1309 ; AVX512F-NEXT: popq %rbp
1310 ; AVX512F-NEXT: vzeroupper
1311 ; AVX512F-NEXT: retq
1312 ;
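; vpermb (the 512-bit variable byte shuffle) requires AVX512VBMI, so the AVX512BW
; lowering below follows the same pattern as the AVX512F output above: spill the
; source vector to the stack, clamp each 32-bit index to 6 bits with andl $63,
; and rebuild the result one byte at a time with movzbl/vpinsrb.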
1313 ; AVX512BW-LABEL: var_cvt_shuffle_v64f32_v64i8_idx:
1314 ; AVX512BW: # %bb.0:
1315 ; AVX512BW-NEXT: pushq %rbp
1316 ; AVX512BW-NEXT: movq %rsp, %rbp
1317 ; AVX512BW-NEXT: andq $-64, %rsp
1318 ; AVX512BW-NEXT: subq $128, %rsp
1319 ; AVX512BW-NEXT: # kill: def $esi killed $esi def $rsi
1320 ; AVX512BW-NEXT: vpbroadcastd %esi, %zmm2
1321 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
1322 ; AVX512BW-NEXT: vmovd %xmm1, %eax
1323 ; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
1324 ; AVX512BW-NEXT: andl $63, %eax
1325 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1326 ; AVX512BW-NEXT: vmovd %eax, %xmm0
1327 ; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax
1328 ; AVX512BW-NEXT: andl $63, %eax
1329 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
1330 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
1331 ; AVX512BW-NEXT: andl $63, %eax
1332 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
1333 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
1334 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3
1335 ; AVX512BW-NEXT: andl $63, %eax
1336 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
1337 ; AVX512BW-NEXT: vmovd %xmm3, %eax
1338 ; AVX512BW-NEXT: andl $63, %eax
1339 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
1340 ; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax
1341 ; AVX512BW-NEXT: andl $63, %eax
1342 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
1343 ; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax
1344 ; AVX512BW-NEXT: andl $63, %eax
1345 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
1346 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax
1347 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
1348 ; AVX512BW-NEXT: andl $63, %eax
1349 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
1350 ; AVX512BW-NEXT: vmovd %xmm5, %eax
1351 ; AVX512BW-NEXT: andl $63, %eax
1352 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
1353 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax
1354 ; AVX512BW-NEXT: andl $63, %eax
1355 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
1356 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
1357 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
1358 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
1359 ; AVX512BW-NEXT: andl $63, %esi
1360 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0
1361 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax
1362 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
1363 ; AVX512BW-NEXT: andl $63, %eax
1364 ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
1365 ; AVX512BW-NEXT: vmovd %xmm1, %eax
1366 ; AVX512BW-NEXT: andl $63, %eax
1367 ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
1368 ; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax
1369 ; AVX512BW-NEXT: andl $63, %eax
1370 ; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
1371 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
1372 ; AVX512BW-NEXT: andl $63, %eax
1373 ; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
1374 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
1375 ; AVX512BW-NEXT: andl $63, %eax
1376 ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
1377 ; AVX512BW-NEXT: vmovd %xmm4, %eax
1378 ; AVX512BW-NEXT: andl $63, %eax
1379 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1380 ; AVX512BW-NEXT: vmovd %eax, %xmm1
1381 ; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax
1382 ; AVX512BW-NEXT: andl $63, %eax
1383 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1
1384 ; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax
1385 ; AVX512BW-NEXT: andl $63, %eax
1386 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1
1387 ; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax
1388 ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5
1389 ; AVX512BW-NEXT: andl $63, %eax
1390 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1
1391 ; AVX512BW-NEXT: vmovd %xmm5, %eax
1392 ; AVX512BW-NEXT: andl $63, %eax
1393 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1
1394 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax
1395 ; AVX512BW-NEXT: andl $63, %eax
1396 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1
1397 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax
1398 ; AVX512BW-NEXT: andl $63, %eax
1399 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1
1400 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax
1401 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5
1402 ; AVX512BW-NEXT: andl $63, %eax
1403 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1
1404 ; AVX512BW-NEXT: vmovd %xmm5, %eax
1405 ; AVX512BW-NEXT: andl $63, %eax
1406 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1
1407 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax
1408 ; AVX512BW-NEXT: andl $63, %eax
1409 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1
1410 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax
1411 ; AVX512BW-NEXT: andl $63, %eax
1412 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6
1413 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax
1414 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1
1415 ; AVX512BW-NEXT: andl $63, %eax
1416 ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4
1417 ; AVX512BW-NEXT: vmovd %xmm1, %eax
1418 ; AVX512BW-NEXT: andl $63, %eax
1419 ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
1420 ; AVX512BW-NEXT: vmovd %xmm3, %eax
1421 ; AVX512BW-NEXT: andl $63, %eax
1422 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1423 ; AVX512BW-NEXT: vmovd %eax, %xmm5
1424 ; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax
1425 ; AVX512BW-NEXT: andl $63, %eax
1426 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5
1427 ; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax
1428 ; AVX512BW-NEXT: andl $63, %eax
1429 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5
1430 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax
1431 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm6
1432 ; AVX512BW-NEXT: andl $63, %eax
1433 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5
1434 ; AVX512BW-NEXT: vmovd %xmm6, %eax
1435 ; AVX512BW-NEXT: andl $63, %eax
1436 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5
1437 ; AVX512BW-NEXT: vpextrd $1, %xmm6, %eax
1438 ; AVX512BW-NEXT: andl $63, %eax
1439 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5
1440 ; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax
1441 ; AVX512BW-NEXT: andl $63, %eax
1442 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7
1443 ; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax
1444 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5
1445 ; AVX512BW-NEXT: andl $63, %eax
1446 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6
1447 ; AVX512BW-NEXT: vmovd %xmm5, %eax
1448 ; AVX512BW-NEXT: andl $63, %eax
1449 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6
1450 ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax
1451 ; AVX512BW-NEXT: andl $63, %eax
1452 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6
1453 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax
1454 ; AVX512BW-NEXT: andl $63, %eax
1455 ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6
1456 ; AVX512BW-NEXT: vmovd %xmm2, %eax
1457 ; AVX512BW-NEXT: andl $63, %eax
1458 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1459 ; AVX512BW-NEXT: vmovd %eax, %xmm7
1460 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax
1461 ; AVX512BW-NEXT: andl $63, %eax
1462 ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7
1463 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax
1464 ; AVX512BW-NEXT: andl $63, %eax
1465 ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7
1466 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax
1467 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm8
1468 ; AVX512BW-NEXT: andl $63, %eax
1469 ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7
1470 ; AVX512BW-NEXT: vmovd %xmm8, %eax
1471 ; AVX512BW-NEXT: andl $63, %eax
1472 ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7
1473 ; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax
1474 ; AVX512BW-NEXT: andl $63, %eax
1475 ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7
1476 ; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax
1477 ; AVX512BW-NEXT: andl $63, %eax
1478 ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7
1479 ; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax
1480 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm8
1481 ; AVX512BW-NEXT: andl $63, %eax
1482 ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7
1483 ; AVX512BW-NEXT: vmovd %xmm8, %eax
1484 ; AVX512BW-NEXT: andl $63, %eax
1485 ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7
1486 ; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax
1487 ; AVX512BW-NEXT: andl $63, %eax
1488 ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7
1489 ; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax
1490 ; AVX512BW-NEXT: andl $63, %eax
1491 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1492 ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
1493 ; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax
1494 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2
1495 ; AVX512BW-NEXT: andl $63, %eax
1496 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1497 ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7
1498 ; AVX512BW-NEXT: vmovd %xmm2, %eax
1499 ; AVX512BW-NEXT: andl $63, %eax
1500 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1501 ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7
1502 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax
1503 ; AVX512BW-NEXT: andl $63, %eax
1504 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1505 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7
1506 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax
1507 ; AVX512BW-NEXT: andl $63, %eax
1508 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1509 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7
1510 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax
1511 ; AVX512BW-NEXT: andl $63, %eax
1512 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1513 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2
1514 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax
1515 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3
1516 ; AVX512BW-NEXT: andl $63, %eax
1517 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1518 ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5
1519 ; AVX512BW-NEXT: vmovd %xmm3, %eax
1520 ; AVX512BW-NEXT: andl $63, %eax
1521 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1522 ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
1523 ; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax
1524 ; AVX512BW-NEXT: andl $63, %eax
1525 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1526 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
1527 ; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax
1528 ; AVX512BW-NEXT: andl $63, %eax
1529 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1530 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
1531 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax
1532 ; AVX512BW-NEXT: andl $63, %eax
1533 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1534 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
1535 ; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax
1536 ; AVX512BW-NEXT: andl $63, %eax
1537 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1538 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
1539 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
1540 ; AVX512BW-NEXT: andl $63, %eax
1541 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1542 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
1543 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
1544 ; AVX512BW-NEXT: andl $63, %eax
1545 ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
1546 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
1547 ; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2
1548 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2
1549 ; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3
1550 ; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3
1551 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1
1552 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1
1553 ; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0
1554 ; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0
1555 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdi)
1556 ; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi)
1557 ; AVX512BW-NEXT: vmovaps %zmm3, 64(%rdi)
1558 ; AVX512BW-NEXT: vmovaps %zmm2, (%rdi)
1559 ; AVX512BW-NEXT: movq %rbp, %rsp
1560 ; AVX512BW-NEXT: popq %rbp
1561 ; AVX512BW-NEXT: vzeroupper
1562 ; AVX512BW-NEXT: retq
1563 ;
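; With AVX512VBMI, vpermb handles the variable byte shuffle directly: three of the
; four 16-element groups are formed with vpmovdb + vpermb + vpmovsxbd in the block
; below, and only the group containing element 58 (whose index is the scalar %b
; itself) is still assembled through the stack with movzbl/vpinsrb.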
1564 ; AVX512VBMI-LABEL: var_cvt_shuffle_v64f32_v64i8_idx:
1565 ; AVX512VBMI: # %bb.0:
1566 ; AVX512VBMI-NEXT: pushq %rbp
1567 ; AVX512VBMI-NEXT: movq %rsp, %rbp
1568 ; AVX512VBMI-NEXT: andq $-64, %rsp
1569 ; AVX512VBMI-NEXT: subq $128, %rsp
1570 ; AVX512VBMI-NEXT: # kill: def $esi killed $esi def $rsi
1571 ; AVX512VBMI-NEXT: vpbroadcastd %esi, %zmm1
1572 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2
1573 ; AVX512VBMI-NEXT: vmovd %xmm2, %eax
1574 ; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsp)
1575 ; AVX512VBMI-NEXT: andl $63, %eax
1576 ; AVX512VBMI-NEXT: movzbl (%rsp,%rax), %eax
1577 ; AVX512VBMI-NEXT: vmovd %eax, %xmm3
1578 ; AVX512VBMI-NEXT: vpextrd $1, %xmm2, %eax
1579 ; AVX512VBMI-NEXT: andl $63, %eax
1580 ; AVX512VBMI-NEXT: vpinsrb $1, (%rsp,%rax), %xmm3, %xmm3
1581 ; AVX512VBMI-NEXT: vpextrd $2, %xmm2, %eax
1582 ; AVX512VBMI-NEXT: andl $63, %eax
1583 ; AVX512VBMI-NEXT: vpinsrb $2, (%rsp,%rax), %xmm3, %xmm3
1584 ; AVX512VBMI-NEXT: vpextrd $3, %xmm2, %eax
1585 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm2, %xmm4
1586 ; AVX512VBMI-NEXT: andl $63, %eax
1587 ; AVX512VBMI-NEXT: vpinsrb $3, (%rsp,%rax), %xmm3, %xmm3
1588 ; AVX512VBMI-NEXT: vmovd %xmm4, %eax
1589 ; AVX512VBMI-NEXT: andl $63, %eax
1590 ; AVX512VBMI-NEXT: vpinsrb $4, (%rsp,%rax), %xmm3, %xmm3
1591 ; AVX512VBMI-NEXT: vpextrd $1, %xmm4, %eax
1592 ; AVX512VBMI-NEXT: andl $63, %eax
1593 ; AVX512VBMI-NEXT: vpinsrb $5, (%rsp,%rax), %xmm3, %xmm3
1594 ; AVX512VBMI-NEXT: vpextrd $2, %xmm4, %eax
1595 ; AVX512VBMI-NEXT: andl $63, %eax
1596 ; AVX512VBMI-NEXT: vpinsrb $6, (%rsp,%rax), %xmm3, %xmm3
1597 ; AVX512VBMI-NEXT: vpextrd $3, %xmm4, %eax
1598 ; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm2, %xmm4
1599 ; AVX512VBMI-NEXT: andl $63, %eax
1600 ; AVX512VBMI-NEXT: vpinsrb $7, (%rsp,%rax), %xmm3, %xmm3
1601 ; AVX512VBMI-NEXT: vmovd %xmm4, %eax
1602 ; AVX512VBMI-NEXT: andl $63, %eax
1603 ; AVX512VBMI-NEXT: vpinsrb $8, (%rsp,%rax), %xmm3, %xmm3
1604 ; AVX512VBMI-NEXT: vpextrd $1, %xmm4, %eax
1605 ; AVX512VBMI-NEXT: andl $63, %eax
1606 ; AVX512VBMI-NEXT: vpinsrb $9, (%rsp,%rax), %xmm3, %xmm3
1607 ; AVX512VBMI-NEXT: andl $63, %esi
1608 ; AVX512VBMI-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm3, %xmm3
1609 ; AVX512VBMI-NEXT: vpextrd $3, %xmm4, %eax
1610 ; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm2, %xmm2
1611 ; AVX512VBMI-NEXT: andl $63, %eax
1612 ; AVX512VBMI-NEXT: vpinsrb $11, (%rsp,%rax), %xmm3, %xmm3
1613 ; AVX512VBMI-NEXT: vmovd %xmm2, %eax
1614 ; AVX512VBMI-NEXT: andl $63, %eax
1615 ; AVX512VBMI-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3
1616 ; AVX512VBMI-NEXT: vpextrd $1, %xmm2, %eax
1617 ; AVX512VBMI-NEXT: andl $63, %eax
1618 ; AVX512VBMI-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3
1619 ; AVX512VBMI-NEXT: vpextrd $2, %xmm2, %eax
1620 ; AVX512VBMI-NEXT: andl $63, %eax
1621 ; AVX512VBMI-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3
1622 ; AVX512VBMI-NEXT: vpextrd $3, %xmm2, %eax
1623 ; AVX512VBMI-NEXT: andl $63, %eax
1624 ; AVX512VBMI-NEXT: vpinsrb $15, (%rsp,%rax), %xmm3, %xmm2
1625 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
1626 ; AVX512VBMI-NEXT: vpmovdb %zmm3, %xmm3
1627 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
1628 ; AVX512VBMI-NEXT: vpmovdb %zmm4, %xmm4
1629 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1630 ; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
1631 ; AVX512VBMI-NEXT: vpmovsxbd %xmm2, %zmm2
1632 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm2, %zmm2
1633 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
1634 ; AVX512VBMI-NEXT: vpmovsxbd %xmm1, %zmm1
1635 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm1, %zmm1
1636 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm4, %zmm4
1637 ; AVX512VBMI-NEXT: vpmovsxbd %xmm4, %zmm4
1638 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm4, %zmm4
1639 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm3, %zmm0
1640 ; AVX512VBMI-NEXT: vpmovsxbd %xmm0, %zmm0
1641 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm0, %zmm0
1642 ; AVX512VBMI-NEXT: vmovaps %zmm0, 128(%rdi)
1643 ; AVX512VBMI-NEXT: vmovaps %zmm4, 64(%rdi)
1644 ; AVX512VBMI-NEXT: vmovaps %zmm1, (%rdi)
1645 ; AVX512VBMI-NEXT: vmovaps %zmm2, 192(%rdi)
1646 ; AVX512VBMI-NEXT: movq %rbp, %rsp
1647 ; AVX512VBMI-NEXT: popq %rbp
1648 ; AVX512VBMI-NEXT: vzeroupper
1649 ; AVX512VBMI-NEXT: retq
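; The IR below broadcasts the scalar %b to a <64 x i32> vector, adds the constant
; offsets <58, 57, ..., 1, undef, -1, ..., -5> to form per-element indices, extracts
; one byte of %src per index (element 58 reads %src[%b] directly), reinserts the
; bytes into a <64 x i8> value, converts it to <64 x float> with sitofp, and stores
; the result to %dst.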
1650 %b_broadcast_init = insertelement <64 x i32> undef, i32 %b, i32 0
1651 %b_broadcast = shufflevector <64 x i32> %b_broadcast_init, <64 x i32> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0>
1652 %sub_add__b_broadcast_ = add <64 x i32> %b_broadcast, <i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef, i32 -1, i32 -2, i32 -3, i32 -4, i32 -5>
1653 %index_0.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 0
1654 %index_1.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 1
1655 %index_2.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 2
1656 %index_3.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 3
1657 %index_4.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 4
1658 %index_5.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 5
1659 %index_6.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 6
1660 %index_7.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 7
1661 %index_8.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 8
1662 %index_9.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 9
1663 %index_10.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 10
1664 %index_11.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 11
1665 %index_12.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 12
1666 %index_13.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 13
1667 %index_14.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 14
1668 %index_15.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 15
1669 %index_16.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 16
1670 %index_17.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 17
1671 %index_18.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 18
1672 %index_19.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 19
1673 %index_20.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 20
1674 %index_21.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 21
1675 %index_22.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 22
1676 %index_23.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 23
1677 %index_24.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 24
1678 %index_25.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 25
1679 %index_26.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 26
1680 %index_27.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 27
1681 %index_28.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 28
1682 %index_29.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 29
1683 %index_30.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 30
1684 %index_31.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 31
1685 %index_32.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 32
1686 %index_33.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 33
1687 %index_34.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 34
1688 %index_35.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 35
1689 %index_36.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 36
1690 %index_37.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 37
1691 %index_38.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 38
1692 %index_39.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 39
1693 %index_40.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 40
1694 %index_41.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 41
1695 %index_42.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 42
1696 %index_43.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 43
1697 %index_44.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 44
1698 %index_45.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 45
1699 %index_46.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 46
1700 %index_47.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 47
1701 %index_48.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 48
1702 %index_49.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 49
1703 %index_50.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 50
1704 %index_51.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 51
1705 %index_52.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 52
1706 %index_53.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 53
1707 %index_54.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 54
1708 %index_55.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 55
1709 %index_56.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 56
1710 %index_57.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 57
1711 %index_59.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 59
1712 %index_60.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 60
1713 %index_61.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 61
1714 %index_62.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 62
1715 %index_63.i.i = extractelement <64 x i32> %sub_add__b_broadcast_, i32 63
1716 %v_0.i.i = extractelement <64 x i8> %src, i32 %index_0.i.i
1717 %v_1.i.i = extractelement <64 x i8> %src, i32 %index_1.i.i
1718 %v_2.i.i = extractelement <64 x i8> %src, i32 %index_2.i.i
1719 %v_3.i.i = extractelement <64 x i8> %src, i32 %index_3.i.i
1720 %v_4.i.i = extractelement <64 x i8> %src, i32 %index_4.i.i
1721 %v_5.i.i = extractelement <64 x i8> %src, i32 %index_5.i.i
1722 %v_6.i.i = extractelement <64 x i8> %src, i32 %index_6.i.i
1723 %v_7.i.i = extractelement <64 x i8> %src, i32 %index_7.i.i
1724 %v_8.i.i = extractelement <64 x i8> %src, i32 %index_8.i.i
1725 %v_9.i.i = extractelement <64 x i8> %src, i32 %index_9.i.i
1726 %v_10.i.i = extractelement <64 x i8> %src, i32 %index_10.i.i
1727 %v_11.i.i = extractelement <64 x i8> %src, i32 %index_11.i.i
1728 %v_12.i.i = extractelement <64 x i8> %src, i32 %index_12.i.i
1729 %v_13.i.i = extractelement <64 x i8> %src, i32 %index_13.i.i
1730 %v_14.i.i = extractelement <64 x i8> %src, i32 %index_14.i.i
1731 %v_15.i.i = extractelement <64 x i8> %src, i32 %index_15.i.i
1732 %v_16.i.i = extractelement <64 x i8> %src, i32 %index_16.i.i
1733 %v_17.i.i = extractelement <64 x i8> %src, i32 %index_17.i.i
1734 %v_18.i.i = extractelement <64 x i8> %src, i32 %index_18.i.i
1735 %v_19.i.i = extractelement <64 x i8> %src, i32 %index_19.i.i
1736 %v_20.i.i = extractelement <64 x i8> %src, i32 %index_20.i.i
1737 %v_21.i.i = extractelement <64 x i8> %src, i32 %index_21.i.i
1738 %v_22.i.i = extractelement <64 x i8> %src, i32 %index_22.i.i
1739 %v_23.i.i = extractelement <64 x i8> %src, i32 %index_23.i.i
1740 %v_24.i.i = extractelement <64 x i8> %src, i32 %index_24.i.i
1741 %v_25.i.i = extractelement <64 x i8> %src, i32 %index_25.i.i
1742 %v_26.i.i = extractelement <64 x i8> %src, i32 %index_26.i.i
1743 %v_27.i.i = extractelement <64 x i8> %src, i32 %index_27.i.i
1744 %v_28.i.i = extractelement <64 x i8> %src, i32 %index_28.i.i
1745 %v_29.i.i = extractelement <64 x i8> %src, i32 %index_29.i.i
1746 %v_30.i.i = extractelement <64 x i8> %src, i32 %index_30.i.i
1747 %v_31.i.i = extractelement <64 x i8> %src, i32 %index_31.i.i
1748 %v_32.i.i = extractelement <64 x i8> %src, i32 %index_32.i.i
1749 %v_33.i.i = extractelement <64 x i8> %src, i32 %index_33.i.i
1750 %v_34.i.i = extractelement <64 x i8> %src, i32 %index_34.i.i
1751 %v_35.i.i = extractelement <64 x i8> %src, i32 %index_35.i.i
1752 %v_36.i.i = extractelement <64 x i8> %src, i32 %index_36.i.i
1753 %v_37.i.i = extractelement <64 x i8> %src, i32 %index_37.i.i
1754 %v_38.i.i = extractelement <64 x i8> %src, i32 %index_38.i.i
1755 %v_39.i.i = extractelement <64 x i8> %src, i32 %index_39.i.i
1756 %v_40.i.i = extractelement <64 x i8> %src, i32 %index_40.i.i
1757 %v_41.i.i = extractelement <64 x i8> %src, i32 %index_41.i.i
1758 %v_42.i.i = extractelement <64 x i8> %src, i32 %index_42.i.i
1759 %v_43.i.i = extractelement <64 x i8> %src, i32 %index_43.i.i
1760 %v_44.i.i = extractelement <64 x i8> %src, i32 %index_44.i.i
1761 %v_45.i.i = extractelement <64 x i8> %src, i32 %index_45.i.i
1762 %v_46.i.i = extractelement <64 x i8> %src, i32 %index_46.i.i
1763 %v_47.i.i = extractelement <64 x i8> %src, i32 %index_47.i.i
1764 %v_48.i.i = extractelement <64 x i8> %src, i32 %index_48.i.i
1765 %v_49.i.i = extractelement <64 x i8> %src, i32 %index_49.i.i
1766 %v_50.i.i = extractelement <64 x i8> %src, i32 %index_50.i.i
1767 %v_51.i.i = extractelement <64 x i8> %src, i32 %index_51.i.i
1768 %v_52.i.i = extractelement <64 x i8> %src, i32 %index_52.i.i
1769 %v_53.i.i = extractelement <64 x i8> %src, i32 %index_53.i.i
1770 %v_54.i.i = extractelement <64 x i8> %src, i32 %index_54.i.i
1771 %v_55.i.i = extractelement <64 x i8> %src, i32 %index_55.i.i
1772 %v_56.i.i = extractelement <64 x i8> %src, i32 %index_56.i.i
1773 %v_57.i.i = extractelement <64 x i8> %src, i32 %index_57.i.i
1774 %v_58.i.i = extractelement <64 x i8> %src, i32 %b
1775 %v_59.i.i = extractelement <64 x i8> %src, i32 %index_59.i.i
1776 %v_60.i.i = extractelement <64 x i8> %src, i32 %index_60.i.i
1777 %v_61.i.i = extractelement <64 x i8> %src, i32 %index_61.i.i
1778 %v_62.i.i = extractelement <64 x i8> %src, i32 %index_62.i.i
1779 %v_63.i.i = extractelement <64 x i8> %src, i32 %index_63.i.i
1780 %dst_0.i.i = insertelement <64 x i8> undef, i8 %v_0.i.i, i32 0
1781 %dst_1.i.i = insertelement <64 x i8> %dst_0.i.i, i8 %v_1.i.i, i32 1
1782 %dst_2.i.i = insertelement <64 x i8> %dst_1.i.i, i8 %v_2.i.i, i32 2
1783 %dst_3.i.i = insertelement <64 x i8> %dst_2.i.i, i8 %v_3.i.i, i32 3
1784 %dst_4.i.i = insertelement <64 x i8> %dst_3.i.i, i8 %v_4.i.i, i32 4
1785 %dst_5.i.i = insertelement <64 x i8> %dst_4.i.i, i8 %v_5.i.i, i32 5
1786 %dst_6.i.i = insertelement <64 x i8> %dst_5.i.i, i8 %v_6.i.i, i32 6
1787 %dst_7.i.i = insertelement <64 x i8> %dst_6.i.i, i8 %v_7.i.i, i32 7
1788 %dst_8.i.i = insertelement <64 x i8> %dst_7.i.i, i8 %v_8.i.i, i32 8
1789 %dst_9.i.i = insertelement <64 x i8> %dst_8.i.i, i8 %v_9.i.i, i32 9
1790 %dst_10.i.i = insertelement <64 x i8> %dst_9.i.i, i8 %v_10.i.i, i32 10
1791 %dst_11.i.i = insertelement <64 x i8> %dst_10.i.i, i8 %v_11.i.i, i32 11
1792 %dst_12.i.i = insertelement <64 x i8> %dst_11.i.i, i8 %v_12.i.i, i32 12
1793 %dst_13.i.i = insertelement <64 x i8> %dst_12.i.i, i8 %v_13.i.i, i32 13
1794 %dst_14.i.i = insertelement <64 x i8> %dst_13.i.i, i8 %v_14.i.i, i32 14
1795 %dst_15.i.i = insertelement <64 x i8> %dst_14.i.i, i8 %v_15.i.i, i32 15
1796 %dst_16.i.i = insertelement <64 x i8> %dst_15.i.i, i8 %v_16.i.i, i32 16
1797 %dst_17.i.i = insertelement <64 x i8> %dst_16.i.i, i8 %v_17.i.i, i32 17
1798 %dst_18.i.i = insertelement <64 x i8> %dst_17.i.i, i8 %v_18.i.i, i32 18
1799 %dst_19.i.i = insertelement <64 x i8> %dst_18.i.i, i8 %v_19.i.i, i32 19
1800 %dst_20.i.i = insertelement <64 x i8> %dst_19.i.i, i8 %v_20.i.i, i32 20
1801 %dst_21.i.i = insertelement <64 x i8> %dst_20.i.i, i8 %v_21.i.i, i32 21
1802 %dst_22.i.i = insertelement <64 x i8> %dst_21.i.i, i8 %v_22.i.i, i32 22
1803 %dst_23.i.i = insertelement <64 x i8> %dst_22.i.i, i8 %v_23.i.i, i32 23
1804 %dst_24.i.i = insertelement <64 x i8> %dst_23.i.i, i8 %v_24.i.i, i32 24
1805 %dst_25.i.i = insertelement <64 x i8> %dst_24.i.i, i8 %v_25.i.i, i32 25
1806 %dst_26.i.i = insertelement <64 x i8> %dst_25.i.i, i8 %v_26.i.i, i32 26
1807 %dst_27.i.i = insertelement <64 x i8> %dst_26.i.i, i8 %v_27.i.i, i32 27
1808 %dst_28.i.i = insertelement <64 x i8> %dst_27.i.i, i8 %v_28.i.i, i32 28
1809 %dst_29.i.i = insertelement <64 x i8> %dst_28.i.i, i8 %v_29.i.i, i32 29
1810 %dst_30.i.i = insertelement <64 x i8> %dst_29.i.i, i8 %v_30.i.i, i32 30
1811 %dst_31.i.i = insertelement <64 x i8> %dst_30.i.i, i8 %v_31.i.i, i32 31
1812 %dst_32.i.i = insertelement <64 x i8> %dst_31.i.i, i8 %v_32.i.i, i32 32
1813 %dst_33.i.i = insertelement <64 x i8> %dst_32.i.i, i8 %v_33.i.i, i32 33
1814 %dst_34.i.i = insertelement <64 x i8> %dst_33.i.i, i8 %v_34.i.i, i32 34
1815 %dst_35.i.i = insertelement <64 x i8> %dst_34.i.i, i8 %v_35.i.i, i32 35
1816 %dst_36.i.i = insertelement <64 x i8> %dst_35.i.i, i8 %v_36.i.i, i32 36
1817 %dst_37.i.i = insertelement <64 x i8> %dst_36.i.i, i8 %v_37.i.i, i32 37
1818 %dst_38.i.i = insertelement <64 x i8> %dst_37.i.i, i8 %v_38.i.i, i32 38
1819 %dst_39.i.i = insertelement <64 x i8> %dst_38.i.i, i8 %v_39.i.i, i32 39
1820 %dst_40.i.i = insertelement <64 x i8> %dst_39.i.i, i8 %v_40.i.i, i32 40
1821 %dst_41.i.i = insertelement <64 x i8> %dst_40.i.i, i8 %v_41.i.i, i32 41
1822 %dst_42.i.i = insertelement <64 x i8> %dst_41.i.i, i8 %v_42.i.i, i32 42
1823 %dst_43.i.i = insertelement <64 x i8> %dst_42.i.i, i8 %v_43.i.i, i32 43
1824 %dst_44.i.i = insertelement <64 x i8> %dst_43.i.i, i8 %v_44.i.i, i32 44
1825 %dst_45.i.i = insertelement <64 x i8> %dst_44.i.i, i8 %v_45.i.i, i32 45
1826 %dst_46.i.i = insertelement <64 x i8> %dst_45.i.i, i8 %v_46.i.i, i32 46
1827 %dst_47.i.i = insertelement <64 x i8> %dst_46.i.i, i8 %v_47.i.i, i32 47
1828 %dst_48.i.i = insertelement <64 x i8> %dst_47.i.i, i8 %v_48.i.i, i32 48
1829 %dst_49.i.i = insertelement <64 x i8> %dst_48.i.i, i8 %v_49.i.i, i32 49
1830 %dst_50.i.i = insertelement <64 x i8> %dst_49.i.i, i8 %v_50.i.i, i32 50
1831 %dst_51.i.i = insertelement <64 x i8> %dst_50.i.i, i8 %v_51.i.i, i32 51
1832 %dst_52.i.i = insertelement <64 x i8> %dst_51.i.i, i8 %v_52.i.i, i32 52
1833 %dst_53.i.i = insertelement <64 x i8> %dst_52.i.i, i8 %v_53.i.i, i32 53
1834 %dst_54.i.i = insertelement <64 x i8> %dst_53.i.i, i8 %v_54.i.i, i32 54
1835 %dst_55.i.i = insertelement <64 x i8> %dst_54.i.i, i8 %v_55.i.i, i32 55
1836 %dst_56.i.i = insertelement <64 x i8> %dst_55.i.i, i8 %v_56.i.i, i32 56
1837 %dst_57.i.i = insertelement <64 x i8> %dst_56.i.i, i8 %v_57.i.i, i32 57
1838 %dst_58.i.i = insertelement <64 x i8> %dst_57.i.i, i8 %v_58.i.i, i32 58
1839 %dst_59.i.i = insertelement <64 x i8> %dst_58.i.i, i8 %v_59.i.i, i32 59
1840 %dst_60.i.i = insertelement <64 x i8> %dst_59.i.i, i8 %v_60.i.i, i32 60
1841 %dst_61.i.i = insertelement <64 x i8> %dst_60.i.i, i8 %v_61.i.i, i32 61
1842 %dst_62.i.i = insertelement <64 x i8> %dst_61.i.i, i8 %v_62.i.i, i32 62
1843 %dst_63.i.i = insertelement <64 x i8> %dst_62.i.i, i8 %v_63.i.i, i32 63
1844 %shuf_load_to_float = sitofp <64 x i8> %dst_63.i.i to <64 x float>
1845 store <64 x float> %shuf_load_to_float, ptr %dst