1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW,VBMI
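; Fully variable 256-bit shuffles: the shuffle indices arrive in a second vector
; argument, so they are only known at run time. Each result element is
; %v[%indices[i]], written out with scalar extractelement/insertelement.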
9 define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
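; i64 elements: targets without AVX512VL spill %v and assemble the result with
; scalar loads; targets with AVX512VL lower the whole thing to a single vpermpd.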
10 ; AVX1-LABEL: var_shuffle_v4i64:
; AVX1: # BB#0:
12 ; AVX1-NEXT: pushq %rbp
13 ; AVX1-NEXT: movq %rsp, %rbp
14 ; AVX1-NEXT: andq $-32, %rsp
15 ; AVX1-NEXT: subq $64, %rsp
16 ; AVX1-NEXT: vmovq %xmm1, %rax
17 ; AVX1-NEXT: andl $3, %eax
18 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
19 ; AVX1-NEXT: andl $3, %ecx
20 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
21 ; AVX1-NEXT: vmovq %xmm1, %rdx
22 ; AVX1-NEXT: andl $3, %edx
23 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
24 ; AVX1-NEXT: andl $3, %esi
25 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
26 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
27 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
28 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
29 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
30 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
31 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
32 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
33 ; AVX1-NEXT: movq %rbp, %rsp
34 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
37 ; AVX2-LABEL: var_shuffle_v4i64:
; AVX2: # BB#0:
39 ; AVX2-NEXT: pushq %rbp
40 ; AVX2-NEXT: movq %rsp, %rbp
41 ; AVX2-NEXT: andq $-32, %rsp
42 ; AVX2-NEXT: subq $64, %rsp
43 ; AVX2-NEXT: vmovq %xmm1, %rax
44 ; AVX2-NEXT: andl $3, %eax
45 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
46 ; AVX2-NEXT: andl $3, %ecx
47 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
48 ; AVX2-NEXT: vmovq %xmm1, %rdx
49 ; AVX2-NEXT: andl $3, %edx
50 ; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
51 ; AVX2-NEXT: andl $3, %esi
52 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
53 ; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
54 ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
55 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
56 ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
57 ; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
58 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
59 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
60 ; AVX2-NEXT: movq %rbp, %rsp
61 ; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
64 ; AVX512F-LABEL: var_shuffle_v4i64:
; AVX512F: # BB#0:
66 ; AVX512F-NEXT: pushq %rbp
67 ; AVX512F-NEXT: movq %rsp, %rbp
68 ; AVX512F-NEXT: andq $-32, %rsp
69 ; AVX512F-NEXT: subq $64, %rsp
70 ; AVX512F-NEXT: vmovq %xmm1, %rax
71 ; AVX512F-NEXT: andl $3, %eax
72 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
73 ; AVX512F-NEXT: andl $3, %ecx
74 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
75 ; AVX512F-NEXT: vmovq %xmm1, %rdx
76 ; AVX512F-NEXT: andl $3, %edx
77 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
78 ; AVX512F-NEXT: andl $3, %esi
79 ; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
80 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
81 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
82 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
83 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
84 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
85 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
86 ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
87 ; AVX512F-NEXT: movq %rbp, %rsp
88 ; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
91 ; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL: # BB#0:
93 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
96 ; AVX512VLBW-LABEL: var_shuffle_v4i64:
; AVX512VLBW: # BB#0:
98 ; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
99 ; AVX512VLBW-NEXT: retq
100 %index0 = extractelement <4 x i64> %indices, i32 0
101 %index1 = extractelement <4 x i64> %indices, i32 1
102 %index2 = extractelement <4 x i64> %indices, i32 2
103 %index3 = extractelement <4 x i64> %indices, i32 3
104 %v0 = extractelement <4 x i64> %v, i64 %index0
105 %v1 = extractelement <4 x i64> %v, i64 %index1
106 %v2 = extractelement <4 x i64> %v, i64 %index2
107 %v3 = extractelement <4 x i64> %v, i64 %index3
108 %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
109 %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
110 %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
111 %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
ret <4 x i64> %ret3
}

115 define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
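; i32 elements: AVX2 and the AVX512 targets (INT256) use a single vpermps;
; AVX1 goes through the stack.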
116 ; AVX1-LABEL: var_shuffle_v8i32:
; AVX1: # BB#0:
118 ; AVX1-NEXT: pushq %rbp
119 ; AVX1-NEXT: movq %rsp, %rbp
120 ; AVX1-NEXT: andq $-32, %rsp
121 ; AVX1-NEXT: subq $64, %rsp
122 ; AVX1-NEXT: vpextrq $1, %xmm1, %r8
123 ; AVX1-NEXT: movq %r8, %rcx
124 ; AVX1-NEXT: shrq $30, %rcx
125 ; AVX1-NEXT: vmovq %xmm1, %r9
126 ; AVX1-NEXT: movq %r9, %rsi
127 ; AVX1-NEXT: shrq $30, %rsi
128 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
129 ; AVX1-NEXT: vpextrq $1, %xmm1, %r10
130 ; AVX1-NEXT: movq %r10, %rdi
131 ; AVX1-NEXT: shrq $30, %rdi
132 ; AVX1-NEXT: vmovq %xmm1, %rax
133 ; AVX1-NEXT: movq %rax, %rdx
134 ; AVX1-NEXT: shrq $30, %rdx
135 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
136 ; AVX1-NEXT: andl $7, %r9d
137 ; AVX1-NEXT: andl $28, %esi
138 ; AVX1-NEXT: andl $7, %r8d
139 ; AVX1-NEXT: andl $28, %ecx
140 ; AVX1-NEXT: andl $7, %eax
141 ; AVX1-NEXT: andl $28, %edx
142 ; AVX1-NEXT: andl $7, %r10d
143 ; AVX1-NEXT: andl $28, %edi
144 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
145 ; AVX1-NEXT: movq %rsp, %rax
146 ; AVX1-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
147 ; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
148 ; AVX1-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
149 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
150 ; AVX1-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
151 ; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
152 ; AVX1-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
153 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
154 ; AVX1-NEXT: movq %rbp, %rsp
155 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
158 ; INT256-LABEL: var_shuffle_v8i32:
; INT256: # BB#0:
160 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
162 %index0 = extractelement <8 x i32> %indices, i32 0
163 %index1 = extractelement <8 x i32> %indices, i32 1
164 %index2 = extractelement <8 x i32> %indices, i32 2
165 %index3 = extractelement <8 x i32> %indices, i32 3
166 %index4 = extractelement <8 x i32> %indices, i32 4
167 %index5 = extractelement <8 x i32> %indices, i32 5
168 %index6 = extractelement <8 x i32> %indices, i32 6
169 %index7 = extractelement <8 x i32> %indices, i32 7
170 %v0 = extractelement <8 x i32> %v, i32 %index0
171 %v1 = extractelement <8 x i32> %v, i32 %index1
172 %v2 = extractelement <8 x i32> %v, i32 %index2
173 %v3 = extractelement <8 x i32> %v, i32 %index3
174 %v4 = extractelement <8 x i32> %v, i32 %index4
175 %v5 = extractelement <8 x i32> %v, i32 %index5
176 %v6 = extractelement <8 x i32> %v, i32 %index6
177 %v7 = extractelement <8 x i32> %v, i32 %index7
178 %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
179 %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
180 %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
181 %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
182 %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
183 %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
184 %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
185 %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
ret <8 x i32> %ret7
}

189 define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
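; i16 elements: only AVX512BW+VL has vpermw; the other targets extract and
; reinsert all 16 lanes through a stack slot.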
190 ; AVX1-LABEL: var_shuffle_v16i16:
; AVX1: # BB#0:
192 ; AVX1-NEXT: pushq %rbp
193 ; AVX1-NEXT: movq %rsp, %rbp
194 ; AVX1-NEXT: andq $-32, %rsp
195 ; AVX1-NEXT: subq $64, %rsp
196 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
197 ; AVX1-NEXT: vmovd %xmm2, %eax
198 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
199 ; AVX1-NEXT: andl $15, %eax
200 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
201 ; AVX1-NEXT: vmovd %eax, %xmm0
202 ; AVX1-NEXT: vpextrw $1, %xmm2, %eax
203 ; AVX1-NEXT: andl $15, %eax
204 ; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
205 ; AVX1-NEXT: vpextrw $2, %xmm2, %eax
206 ; AVX1-NEXT: andl $15, %eax
207 ; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
208 ; AVX1-NEXT: vpextrw $3, %xmm2, %eax
209 ; AVX1-NEXT: andl $15, %eax
210 ; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
211 ; AVX1-NEXT: vpextrw $4, %xmm2, %eax
212 ; AVX1-NEXT: andl $15, %eax
213 ; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
214 ; AVX1-NEXT: vpextrw $5, %xmm2, %eax
215 ; AVX1-NEXT: andl $15, %eax
216 ; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
217 ; AVX1-NEXT: vpextrw $6, %xmm2, %eax
218 ; AVX1-NEXT: andl $15, %eax
219 ; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
220 ; AVX1-NEXT: vpextrw $7, %xmm2, %eax
221 ; AVX1-NEXT: andl $15, %eax
222 ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
223 ; AVX1-NEXT: vmovd %xmm1, %eax
224 ; AVX1-NEXT: andl $15, %eax
225 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
226 ; AVX1-NEXT: vmovd %eax, %xmm2
227 ; AVX1-NEXT: vpextrw $1, %xmm1, %eax
228 ; AVX1-NEXT: andl $15, %eax
229 ; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
230 ; AVX1-NEXT: vpextrw $2, %xmm1, %eax
231 ; AVX1-NEXT: andl $15, %eax
232 ; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
233 ; AVX1-NEXT: vpextrw $3, %xmm1, %eax
234 ; AVX1-NEXT: andl $15, %eax
235 ; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
236 ; AVX1-NEXT: vpextrw $4, %xmm1, %eax
237 ; AVX1-NEXT: andl $15, %eax
238 ; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
239 ; AVX1-NEXT: vpextrw $5, %xmm1, %eax
240 ; AVX1-NEXT: andl $15, %eax
241 ; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
242 ; AVX1-NEXT: vpextrw $6, %xmm1, %eax
243 ; AVX1-NEXT: andl $15, %eax
244 ; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
245 ; AVX1-NEXT: vpextrw $7, %xmm1, %eax
246 ; AVX1-NEXT: andl $15, %eax
247 ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
248 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
249 ; AVX1-NEXT: movq %rbp, %rsp
250 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
253 ; AVX2-LABEL: var_shuffle_v16i16:
; AVX2: # BB#0:
255 ; AVX2-NEXT: pushq %rbp
256 ; AVX2-NEXT: movq %rsp, %rbp
257 ; AVX2-NEXT: andq $-32, %rsp
258 ; AVX2-NEXT: subq $64, %rsp
259 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
260 ; AVX2-NEXT: vmovd %xmm2, %eax
261 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
262 ; AVX2-NEXT: andl $15, %eax
263 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
264 ; AVX2-NEXT: vmovd %eax, %xmm0
265 ; AVX2-NEXT: vpextrw $1, %xmm2, %eax
266 ; AVX2-NEXT: andl $15, %eax
267 ; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
268 ; AVX2-NEXT: vpextrw $2, %xmm2, %eax
269 ; AVX2-NEXT: andl $15, %eax
270 ; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
271 ; AVX2-NEXT: vpextrw $3, %xmm2, %eax
272 ; AVX2-NEXT: andl $15, %eax
273 ; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
274 ; AVX2-NEXT: vpextrw $4, %xmm2, %eax
275 ; AVX2-NEXT: andl $15, %eax
276 ; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
277 ; AVX2-NEXT: vpextrw $5, %xmm2, %eax
278 ; AVX2-NEXT: andl $15, %eax
279 ; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
280 ; AVX2-NEXT: vpextrw $6, %xmm2, %eax
281 ; AVX2-NEXT: andl $15, %eax
282 ; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
283 ; AVX2-NEXT: vpextrw $7, %xmm2, %eax
284 ; AVX2-NEXT: andl $15, %eax
285 ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
286 ; AVX2-NEXT: vmovd %xmm1, %eax
287 ; AVX2-NEXT: andl $15, %eax
288 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
289 ; AVX2-NEXT: vmovd %eax, %xmm2
290 ; AVX2-NEXT: vpextrw $1, %xmm1, %eax
291 ; AVX2-NEXT: andl $15, %eax
292 ; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
293 ; AVX2-NEXT: vpextrw $2, %xmm1, %eax
294 ; AVX2-NEXT: andl $15, %eax
295 ; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
296 ; AVX2-NEXT: vpextrw $3, %xmm1, %eax
297 ; AVX2-NEXT: andl $15, %eax
298 ; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
299 ; AVX2-NEXT: vpextrw $4, %xmm1, %eax
300 ; AVX2-NEXT: andl $15, %eax
301 ; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
302 ; AVX2-NEXT: vpextrw $5, %xmm1, %eax
303 ; AVX2-NEXT: andl $15, %eax
304 ; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
305 ; AVX2-NEXT: vpextrw $6, %xmm1, %eax
306 ; AVX2-NEXT: andl $15, %eax
307 ; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
308 ; AVX2-NEXT: vpextrw $7, %xmm1, %eax
309 ; AVX2-NEXT: andl $15, %eax
310 ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
311 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
312 ; AVX2-NEXT: movq %rbp, %rsp
313 ; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
316 ; AVX512F-LABEL: var_shuffle_v16i16:
; AVX512F: # BB#0:
318 ; AVX512F-NEXT: pushq %rbp
319 ; AVX512F-NEXT: movq %rsp, %rbp
320 ; AVX512F-NEXT: andq $-32, %rsp
321 ; AVX512F-NEXT: subq $64, %rsp
322 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
323 ; AVX512F-NEXT: vmovd %xmm2, %eax
324 ; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
325 ; AVX512F-NEXT: andl $15, %eax
326 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
327 ; AVX512F-NEXT: vmovd %eax, %xmm0
328 ; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
329 ; AVX512F-NEXT: andl $15, %eax
330 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
331 ; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
332 ; AVX512F-NEXT: andl $15, %eax
333 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
334 ; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
335 ; AVX512F-NEXT: andl $15, %eax
336 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
337 ; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
338 ; AVX512F-NEXT: andl $15, %eax
339 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
340 ; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
341 ; AVX512F-NEXT: andl $15, %eax
342 ; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
343 ; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
344 ; AVX512F-NEXT: andl $15, %eax
345 ; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
346 ; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
347 ; AVX512F-NEXT: andl $15, %eax
348 ; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
349 ; AVX512F-NEXT: vmovd %xmm1, %eax
350 ; AVX512F-NEXT: andl $15, %eax
351 ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
352 ; AVX512F-NEXT: vmovd %eax, %xmm2
353 ; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
354 ; AVX512F-NEXT: andl $15, %eax
355 ; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
356 ; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
357 ; AVX512F-NEXT: andl $15, %eax
358 ; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
359 ; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
360 ; AVX512F-NEXT: andl $15, %eax
361 ; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
362 ; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
363 ; AVX512F-NEXT: andl $15, %eax
364 ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
365 ; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
366 ; AVX512F-NEXT: andl $15, %eax
367 ; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
368 ; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
369 ; AVX512F-NEXT: andl $15, %eax
370 ; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
371 ; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
372 ; AVX512F-NEXT: andl $15, %eax
373 ; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
374 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
375 ; AVX512F-NEXT: movq %rbp, %rsp
376 ; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
379 ; AVX512VL-LABEL: var_shuffle_v16i16:
; AVX512VL: # BB#0:
381 ; AVX512VL-NEXT: pushq %rbp
382 ; AVX512VL-NEXT: movq %rsp, %rbp
383 ; AVX512VL-NEXT: andq $-32, %rsp
384 ; AVX512VL-NEXT: subq $64, %rsp
385 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
386 ; AVX512VL-NEXT: vmovd %xmm2, %eax
387 ; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
388 ; AVX512VL-NEXT: andl $15, %eax
389 ; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
390 ; AVX512VL-NEXT: vmovd %eax, %xmm0
391 ; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
392 ; AVX512VL-NEXT: andl $15, %eax
393 ; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
394 ; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
395 ; AVX512VL-NEXT: andl $15, %eax
396 ; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
397 ; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
398 ; AVX512VL-NEXT: andl $15, %eax
399 ; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
400 ; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
401 ; AVX512VL-NEXT: andl $15, %eax
402 ; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
403 ; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
404 ; AVX512VL-NEXT: andl $15, %eax
405 ; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
406 ; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
407 ; AVX512VL-NEXT: andl $15, %eax
408 ; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
409 ; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
410 ; AVX512VL-NEXT: andl $15, %eax
411 ; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
412 ; AVX512VL-NEXT: vmovd %xmm1, %eax
413 ; AVX512VL-NEXT: andl $15, %eax
414 ; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
415 ; AVX512VL-NEXT: vmovd %eax, %xmm2
416 ; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
417 ; AVX512VL-NEXT: andl $15, %eax
418 ; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
419 ; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
420 ; AVX512VL-NEXT: andl $15, %eax
421 ; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
422 ; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
423 ; AVX512VL-NEXT: andl $15, %eax
424 ; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
425 ; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
426 ; AVX512VL-NEXT: andl $15, %eax
427 ; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
428 ; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
429 ; AVX512VL-NEXT: andl $15, %eax
430 ; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
431 ; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
432 ; AVX512VL-NEXT: andl $15, %eax
433 ; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
434 ; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
435 ; AVX512VL-NEXT: andl $15, %eax
436 ; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
437 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
438 ; AVX512VL-NEXT: movq %rbp, %rsp
439 ; AVX512VL-NEXT: popq %rbp
440 ; AVX512VL-NEXT: retq
;
442 ; AVX512VLBW-LABEL: var_shuffle_v16i16:
443 ; AVX512VLBW: # BB#0:
444 ; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
445 ; AVX512VLBW-NEXT: retq
446 %index0 = extractelement <16 x i16> %indices, i32 0
447 %index1 = extractelement <16 x i16> %indices, i32 1
448 %index2 = extractelement <16 x i16> %indices, i32 2
449 %index3 = extractelement <16 x i16> %indices, i32 3
450 %index4 = extractelement <16 x i16> %indices, i32 4
451 %index5 = extractelement <16 x i16> %indices, i32 5
452 %index6 = extractelement <16 x i16> %indices, i32 6
453 %index7 = extractelement <16 x i16> %indices, i32 7
454 %index8 = extractelement <16 x i16> %indices, i32 8
455 %index9 = extractelement <16 x i16> %indices, i32 9
456 %index10 = extractelement <16 x i16> %indices, i32 10
457 %index11 = extractelement <16 x i16> %indices, i32 11
458 %index12 = extractelement <16 x i16> %indices, i32 12
459 %index13 = extractelement <16 x i16> %indices, i32 13
460 %index14 = extractelement <16 x i16> %indices, i32 14
461 %index15 = extractelement <16 x i16> %indices, i32 15
462 %v0 = extractelement <16 x i16> %v, i16 %index0
463 %v1 = extractelement <16 x i16> %v, i16 %index1
464 %v2 = extractelement <16 x i16> %v, i16 %index2
465 %v3 = extractelement <16 x i16> %v, i16 %index3
466 %v4 = extractelement <16 x i16> %v, i16 %index4
467 %v5 = extractelement <16 x i16> %v, i16 %index5
468 %v6 = extractelement <16 x i16> %v, i16 %index6
469 %v7 = extractelement <16 x i16> %v, i16 %index7
470 %v8 = extractelement <16 x i16> %v, i16 %index8
471 %v9 = extractelement <16 x i16> %v, i16 %index9
472 %v10 = extractelement <16 x i16> %v, i16 %index10
473 %v11 = extractelement <16 x i16> %v, i16 %index11
474 %v12 = extractelement <16 x i16> %v, i16 %index12
475 %v13 = extractelement <16 x i16> %v, i16 %index13
476 %v14 = extractelement <16 x i16> %v, i16 %index14
477 %v15 = extractelement <16 x i16> %v, i16 %index15
478 %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
479 %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
480 %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
481 %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
482 %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
483 %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
484 %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
485 %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
486 %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
487 %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
488 %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
489 %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
490 %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
491 %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
492 %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
493 %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
494 ret <16 x i16> %ret15
}

497 define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
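; i8 elements: only AVX512VBMI has vpermb; targets without it extract and
; reinsert all 32 lanes through a stack slot.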
498 ; AVX1-LABEL: var_shuffle_v32i8:
; AVX1: # BB#0:
500 ; AVX1-NEXT: pushq %rbp
501 ; AVX1-NEXT: movq %rsp, %rbp
502 ; AVX1-NEXT: andq $-32, %rsp
503 ; AVX1-NEXT: subq $64, %rsp
504 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
505 ; AVX1-NEXT: vpextrb $0, %xmm2, %eax
506 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
507 ; AVX1-NEXT: andl $31, %eax
508 ; AVX1-NEXT: movq %rsp, %rcx
509 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
510 ; AVX1-NEXT: vmovd %eax, %xmm0
511 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax
512 ; AVX1-NEXT: andl $31, %eax
513 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
514 ; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
515 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax
516 ; AVX1-NEXT: andl $31, %eax
517 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
518 ; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
519 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax
520 ; AVX1-NEXT: andl $31, %eax
521 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
522 ; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
523 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax
524 ; AVX1-NEXT: andl $31, %eax
525 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
526 ; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
527 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax
528 ; AVX1-NEXT: andl $31, %eax
529 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
530 ; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
531 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax
532 ; AVX1-NEXT: andl $31, %eax
533 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
534 ; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
535 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax
536 ; AVX1-NEXT: andl $31, %eax
537 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
538 ; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
539 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax
540 ; AVX1-NEXT: andl $31, %eax
541 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
542 ; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
543 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax
544 ; AVX1-NEXT: andl $31, %eax
545 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
546 ; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
547 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax
548 ; AVX1-NEXT: andl $31, %eax
549 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
550 ; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
551 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax
552 ; AVX1-NEXT: andl $31, %eax
553 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
554 ; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
555 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax
556 ; AVX1-NEXT: andl $31, %eax
557 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
558 ; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
559 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax
560 ; AVX1-NEXT: andl $31, %eax
561 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
562 ; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
563 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax
564 ; AVX1-NEXT: andl $31, %eax
565 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
566 ; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
567 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax
568 ; AVX1-NEXT: andl $31, %eax
569 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
570 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
571 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax
572 ; AVX1-NEXT: andl $31, %eax
573 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
574 ; AVX1-NEXT: vmovd %eax, %xmm2
575 ; AVX1-NEXT: vpextrb $1, %xmm1, %eax
576 ; AVX1-NEXT: andl $31, %eax
577 ; AVX1-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
578 ; AVX1-NEXT: vpextrb $2, %xmm1, %eax
579 ; AVX1-NEXT: andl $31, %eax
580 ; AVX1-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
581 ; AVX1-NEXT: vpextrb $3, %xmm1, %eax
582 ; AVX1-NEXT: andl $31, %eax
583 ; AVX1-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
584 ; AVX1-NEXT: vpextrb $4, %xmm1, %eax
585 ; AVX1-NEXT: andl $31, %eax
586 ; AVX1-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
587 ; AVX1-NEXT: vpextrb $5, %xmm1, %eax
588 ; AVX1-NEXT: andl $31, %eax
589 ; AVX1-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
590 ; AVX1-NEXT: vpextrb $6, %xmm1, %eax
591 ; AVX1-NEXT: andl $31, %eax
592 ; AVX1-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
593 ; AVX1-NEXT: vpextrb $7, %xmm1, %eax
594 ; AVX1-NEXT: andl $31, %eax
595 ; AVX1-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
596 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
597 ; AVX1-NEXT: andl $31, %eax
598 ; AVX1-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
599 ; AVX1-NEXT: vpextrb $9, %xmm1, %eax
600 ; AVX1-NEXT: andl $31, %eax
601 ; AVX1-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
602 ; AVX1-NEXT: vpextrb $10, %xmm1, %eax
603 ; AVX1-NEXT: andl $31, %eax
604 ; AVX1-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
605 ; AVX1-NEXT: vpextrb $11, %xmm1, %eax
606 ; AVX1-NEXT: andl $31, %eax
607 ; AVX1-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
608 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
609 ; AVX1-NEXT: andl $31, %eax
610 ; AVX1-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
611 ; AVX1-NEXT: vpextrb $13, %xmm1, %eax
612 ; AVX1-NEXT: andl $31, %eax
613 ; AVX1-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
614 ; AVX1-NEXT: vpextrb $14, %xmm1, %eax
615 ; AVX1-NEXT: andl $31, %eax
616 ; AVX1-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
617 ; AVX1-NEXT: vpextrb $15, %xmm1, %eax
618 ; AVX1-NEXT: andl $31, %eax
619 ; AVX1-NEXT: movzbl (%rax,%rcx), %eax
620 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
621 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
622 ; AVX1-NEXT: movq %rbp, %rsp
623 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
626 ; AVX2-LABEL: var_shuffle_v32i8:
; AVX2: # BB#0:
628 ; AVX2-NEXT: pushq %rbp
629 ; AVX2-NEXT: movq %rsp, %rbp
630 ; AVX2-NEXT: andq $-32, %rsp
631 ; AVX2-NEXT: subq $64, %rsp
632 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
633 ; AVX2-NEXT: vpextrb $0, %xmm2, %eax
634 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
635 ; AVX2-NEXT: andl $31, %eax
636 ; AVX2-NEXT: movq %rsp, %rcx
637 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
638 ; AVX2-NEXT: vmovd %eax, %xmm0
639 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax
640 ; AVX2-NEXT: andl $31, %eax
641 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
642 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
643 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax
644 ; AVX2-NEXT: andl $31, %eax
645 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
646 ; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
647 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax
648 ; AVX2-NEXT: andl $31, %eax
649 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
650 ; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
651 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax
652 ; AVX2-NEXT: andl $31, %eax
653 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
654 ; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
655 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax
656 ; AVX2-NEXT: andl $31, %eax
657 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
658 ; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
659 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax
660 ; AVX2-NEXT: andl $31, %eax
661 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
662 ; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
663 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax
664 ; AVX2-NEXT: andl $31, %eax
665 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
666 ; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
667 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax
668 ; AVX2-NEXT: andl $31, %eax
669 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
670 ; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
671 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax
672 ; AVX2-NEXT: andl $31, %eax
673 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
674 ; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
675 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax
676 ; AVX2-NEXT: andl $31, %eax
677 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
678 ; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
679 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax
680 ; AVX2-NEXT: andl $31, %eax
681 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
682 ; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
683 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax
684 ; AVX2-NEXT: andl $31, %eax
685 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
686 ; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
687 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax
688 ; AVX2-NEXT: andl $31, %eax
689 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
690 ; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
691 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax
692 ; AVX2-NEXT: andl $31, %eax
693 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
694 ; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
695 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax
696 ; AVX2-NEXT: andl $31, %eax
697 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
698 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
699 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax
700 ; AVX2-NEXT: andl $31, %eax
701 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
702 ; AVX2-NEXT: vmovd %eax, %xmm2
703 ; AVX2-NEXT: vpextrb $1, %xmm1, %eax
704 ; AVX2-NEXT: andl $31, %eax
705 ; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
706 ; AVX2-NEXT: vpextrb $2, %xmm1, %eax
707 ; AVX2-NEXT: andl $31, %eax
708 ; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
709 ; AVX2-NEXT: vpextrb $3, %xmm1, %eax
710 ; AVX2-NEXT: andl $31, %eax
711 ; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
712 ; AVX2-NEXT: vpextrb $4, %xmm1, %eax
713 ; AVX2-NEXT: andl $31, %eax
714 ; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
715 ; AVX2-NEXT: vpextrb $5, %xmm1, %eax
716 ; AVX2-NEXT: andl $31, %eax
717 ; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
718 ; AVX2-NEXT: vpextrb $6, %xmm1, %eax
719 ; AVX2-NEXT: andl $31, %eax
720 ; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
721 ; AVX2-NEXT: vpextrb $7, %xmm1, %eax
722 ; AVX2-NEXT: andl $31, %eax
723 ; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
724 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
725 ; AVX2-NEXT: andl $31, %eax
726 ; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
727 ; AVX2-NEXT: vpextrb $9, %xmm1, %eax
728 ; AVX2-NEXT: andl $31, %eax
729 ; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
730 ; AVX2-NEXT: vpextrb $10, %xmm1, %eax
731 ; AVX2-NEXT: andl $31, %eax
732 ; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
733 ; AVX2-NEXT: vpextrb $11, %xmm1, %eax
734 ; AVX2-NEXT: andl $31, %eax
735 ; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
736 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
737 ; AVX2-NEXT: andl $31, %eax
738 ; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
739 ; AVX2-NEXT: vpextrb $13, %xmm1, %eax
740 ; AVX2-NEXT: andl $31, %eax
741 ; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
742 ; AVX2-NEXT: vpextrb $14, %xmm1, %eax
743 ; AVX2-NEXT: andl $31, %eax
744 ; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
745 ; AVX2-NEXT: vpextrb $15, %xmm1, %eax
746 ; AVX2-NEXT: andl $31, %eax
747 ; AVX2-NEXT: movzbl (%rax,%rcx), %eax
748 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
749 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
750 ; AVX2-NEXT: movq %rbp, %rsp
751 ; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
754 ; AVX512F-LABEL: var_shuffle_v32i8:
; AVX512F: # BB#0:
756 ; AVX512F-NEXT: pushq %rbp
757 ; AVX512F-NEXT: movq %rsp, %rbp
758 ; AVX512F-NEXT: andq $-32, %rsp
759 ; AVX512F-NEXT: subq $64, %rsp
760 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
761 ; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
762 ; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
763 ; AVX512F-NEXT: andl $31, %eax
764 ; AVX512F-NEXT: movq %rsp, %rcx
765 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
766 ; AVX512F-NEXT: vmovd %eax, %xmm0
767 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
768 ; AVX512F-NEXT: andl $31, %eax
769 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
770 ; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
771 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
772 ; AVX512F-NEXT: andl $31, %eax
773 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
774 ; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
775 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
776 ; AVX512F-NEXT: andl $31, %eax
777 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
778 ; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
779 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
780 ; AVX512F-NEXT: andl $31, %eax
781 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
782 ; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
783 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
784 ; AVX512F-NEXT: andl $31, %eax
785 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
786 ; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
787 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
788 ; AVX512F-NEXT: andl $31, %eax
789 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
790 ; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
791 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
792 ; AVX512F-NEXT: andl $31, %eax
793 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
794 ; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
795 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
796 ; AVX512F-NEXT: andl $31, %eax
797 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
798 ; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
799 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
800 ; AVX512F-NEXT: andl $31, %eax
801 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
802 ; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
803 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
804 ; AVX512F-NEXT: andl $31, %eax
805 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
806 ; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
807 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
808 ; AVX512F-NEXT: andl $31, %eax
809 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
810 ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
811 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
812 ; AVX512F-NEXT: andl $31, %eax
813 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
814 ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
815 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
816 ; AVX512F-NEXT: andl $31, %eax
817 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
818 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
819 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
820 ; AVX512F-NEXT: andl $31, %eax
821 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
822 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
823 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
824 ; AVX512F-NEXT: andl $31, %eax
825 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
826 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
827 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
828 ; AVX512F-NEXT: andl $31, %eax
829 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
830 ; AVX512F-NEXT: vmovd %eax, %xmm2
831 ; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
832 ; AVX512F-NEXT: andl $31, %eax
833 ; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
834 ; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
835 ; AVX512F-NEXT: andl $31, %eax
836 ; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
837 ; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
838 ; AVX512F-NEXT: andl $31, %eax
839 ; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
840 ; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
841 ; AVX512F-NEXT: andl $31, %eax
842 ; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
843 ; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
844 ; AVX512F-NEXT: andl $31, %eax
845 ; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
846 ; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
847 ; AVX512F-NEXT: andl $31, %eax
848 ; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
849 ; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
850 ; AVX512F-NEXT: andl $31, %eax
851 ; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
852 ; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
853 ; AVX512F-NEXT: andl $31, %eax
854 ; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
855 ; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
856 ; AVX512F-NEXT: andl $31, %eax
857 ; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
858 ; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
859 ; AVX512F-NEXT: andl $31, %eax
860 ; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
861 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
862 ; AVX512F-NEXT: andl $31, %eax
863 ; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
864 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
865 ; AVX512F-NEXT: andl $31, %eax
866 ; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
867 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
868 ; AVX512F-NEXT: andl $31, %eax
869 ; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
870 ; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
871 ; AVX512F-NEXT: andl $31, %eax
872 ; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
873 ; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
874 ; AVX512F-NEXT: andl $31, %eax
875 ; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
876 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
877 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
878 ; AVX512F-NEXT: movq %rbp, %rsp
879 ; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
882 ; AVX512VL-LABEL: var_shuffle_v32i8:
; AVX512VL: # BB#0:
884 ; AVX512VL-NEXT: pushq %rbp
885 ; AVX512VL-NEXT: movq %rsp, %rbp
886 ; AVX512VL-NEXT: andq $-32, %rsp
887 ; AVX512VL-NEXT: subq $64, %rsp
888 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
889 ; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
890 ; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
891 ; AVX512VL-NEXT: andl $31, %eax
892 ; AVX512VL-NEXT: movq %rsp, %rcx
893 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
894 ; AVX512VL-NEXT: vmovd %eax, %xmm0
895 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
896 ; AVX512VL-NEXT: andl $31, %eax
897 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
898 ; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
899 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
900 ; AVX512VL-NEXT: andl $31, %eax
901 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
902 ; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
903 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
904 ; AVX512VL-NEXT: andl $31, %eax
905 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
906 ; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
907 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
908 ; AVX512VL-NEXT: andl $31, %eax
909 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
910 ; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
911 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
912 ; AVX512VL-NEXT: andl $31, %eax
913 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
914 ; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
915 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
916 ; AVX512VL-NEXT: andl $31, %eax
917 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
918 ; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
919 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
920 ; AVX512VL-NEXT: andl $31, %eax
921 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
922 ; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
923 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
924 ; AVX512VL-NEXT: andl $31, %eax
925 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
926 ; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
927 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
928 ; AVX512VL-NEXT: andl $31, %eax
929 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
930 ; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
931 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
932 ; AVX512VL-NEXT: andl $31, %eax
933 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
934 ; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
935 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
936 ; AVX512VL-NEXT: andl $31, %eax
937 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
938 ; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
939 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
940 ; AVX512VL-NEXT: andl $31, %eax
941 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
942 ; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
943 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
944 ; AVX512VL-NEXT: andl $31, %eax
945 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
946 ; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
947 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
948 ; AVX512VL-NEXT: andl $31, %eax
949 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
950 ; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
951 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
952 ; AVX512VL-NEXT: andl $31, %eax
953 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
954 ; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
955 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
956 ; AVX512VL-NEXT: andl $31, %eax
957 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
958 ; AVX512VL-NEXT: vmovd %eax, %xmm2
959 ; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
960 ; AVX512VL-NEXT: andl $31, %eax
961 ; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
962 ; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
963 ; AVX512VL-NEXT: andl $31, %eax
964 ; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
965 ; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
966 ; AVX512VL-NEXT: andl $31, %eax
967 ; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
968 ; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
969 ; AVX512VL-NEXT: andl $31, %eax
970 ; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
971 ; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
972 ; AVX512VL-NEXT: andl $31, %eax
973 ; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
974 ; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
975 ; AVX512VL-NEXT: andl $31, %eax
976 ; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
977 ; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
978 ; AVX512VL-NEXT: andl $31, %eax
979 ; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
980 ; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
981 ; AVX512VL-NEXT: andl $31, %eax
982 ; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
983 ; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
984 ; AVX512VL-NEXT: andl $31, %eax
985 ; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
986 ; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
987 ; AVX512VL-NEXT: andl $31, %eax
988 ; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
989 ; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
990 ; AVX512VL-NEXT: andl $31, %eax
991 ; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
992 ; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
993 ; AVX512VL-NEXT: andl $31, %eax
994 ; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
995 ; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
996 ; AVX512VL-NEXT: andl $31, %eax
997 ; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
998 ; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
999 ; AVX512VL-NEXT: andl $31, %eax
1000 ; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
1001 ; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
1002 ; AVX512VL-NEXT: andl $31, %eax
1003 ; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
1004 ; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
1005 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1006 ; AVX512VL-NEXT: movq %rbp, %rsp
1007 ; AVX512VL-NEXT: popq %rbp
1008 ; AVX512VL-NEXT: retq
;
1010 ; VBMI-LABEL: var_shuffle_v32i8:
; VBMI: # BB#0:
1012 ; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VBMI-NEXT: retq
1014 %index0 = extractelement <32 x i8> %indices, i32 0
1015 %index1 = extractelement <32 x i8> %indices, i32 1
1016 %index2 = extractelement <32 x i8> %indices, i32 2
1017 %index3 = extractelement <32 x i8> %indices, i32 3
1018 %index4 = extractelement <32 x i8> %indices, i32 4
1019 %index5 = extractelement <32 x i8> %indices, i32 5
1020 %index6 = extractelement <32 x i8> %indices, i32 6
1021 %index7 = extractelement <32 x i8> %indices, i32 7
1022 %index8 = extractelement <32 x i8> %indices, i32 8
1023 %index9 = extractelement <32 x i8> %indices, i32 9
1024 %index10 = extractelement <32 x i8> %indices, i32 10
1025 %index11 = extractelement <32 x i8> %indices, i32 11
1026 %index12 = extractelement <32 x i8> %indices, i32 12
1027 %index13 = extractelement <32 x i8> %indices, i32 13
1028 %index14 = extractelement <32 x i8> %indices, i32 14
1029 %index15 = extractelement <32 x i8> %indices, i32 15
1030 %index16 = extractelement <32 x i8> %indices, i32 16
1031 %index17 = extractelement <32 x i8> %indices, i32 17
1032 %index18 = extractelement <32 x i8> %indices, i32 18
1033 %index19 = extractelement <32 x i8> %indices, i32 19
1034 %index20 = extractelement <32 x i8> %indices, i32 20
1035 %index21 = extractelement <32 x i8> %indices, i32 21
1036 %index22 = extractelement <32 x i8> %indices, i32 22
1037 %index23 = extractelement <32 x i8> %indices, i32 23
1038 %index24 = extractelement <32 x i8> %indices, i32 24
1039 %index25 = extractelement <32 x i8> %indices, i32 25
1040 %index26 = extractelement <32 x i8> %indices, i32 26
1041 %index27 = extractelement <32 x i8> %indices, i32 27
1042 %index28 = extractelement <32 x i8> %indices, i32 28
1043 %index29 = extractelement <32 x i8> %indices, i32 29
1044 %index30 = extractelement <32 x i8> %indices, i32 30
1045 %index31 = extractelement <32 x i8> %indices, i32 31
1046 %v0 = extractelement <32 x i8> %v, i8 %index0
1047 %v1 = extractelement <32 x i8> %v, i8 %index1
1048 %v2 = extractelement <32 x i8> %v, i8 %index2
1049 %v3 = extractelement <32 x i8> %v, i8 %index3
1050 %v4 = extractelement <32 x i8> %v, i8 %index4
1051 %v5 = extractelement <32 x i8> %v, i8 %index5
1052 %v6 = extractelement <32 x i8> %v, i8 %index6
1053 %v7 = extractelement <32 x i8> %v, i8 %index7
1054 %v8 = extractelement <32 x i8> %v, i8 %index8
1055 %v9 = extractelement <32 x i8> %v, i8 %index9
1056 %v10 = extractelement <32 x i8> %v, i8 %index10
1057 %v11 = extractelement <32 x i8> %v, i8 %index11
1058 %v12 = extractelement <32 x i8> %v, i8 %index12
1059 %v13 = extractelement <32 x i8> %v, i8 %index13
1060 %v14 = extractelement <32 x i8> %v, i8 %index14
1061 %v15 = extractelement <32 x i8> %v, i8 %index15
1062 %v16 = extractelement <32 x i8> %v, i8 %index16
1063 %v17 = extractelement <32 x i8> %v, i8 %index17
1064 %v18 = extractelement <32 x i8> %v, i8 %index18
1065 %v19 = extractelement <32 x i8> %v, i8 %index19
1066 %v20 = extractelement <32 x i8> %v, i8 %index20
1067 %v21 = extractelement <32 x i8> %v, i8 %index21
1068 %v22 = extractelement <32 x i8> %v, i8 %index22
1069 %v23 = extractelement <32 x i8> %v, i8 %index23
1070 %v24 = extractelement <32 x i8> %v, i8 %index24
1071 %v25 = extractelement <32 x i8> %v, i8 %index25
1072 %v26 = extractelement <32 x i8> %v, i8 %index26
1073 %v27 = extractelement <32 x i8> %v, i8 %index27
1074 %v28 = extractelement <32 x i8> %v, i8 %index28
1075 %v29 = extractelement <32 x i8> %v, i8 %index29
1076 %v30 = extractelement <32 x i8> %v, i8 %index30
1077 %v31 = extractelement <32 x i8> %v, i8 %index31
1078 %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
1079 %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
1080 %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
1081 %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
1082 %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
1083 %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
1084 %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
1085 %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
1086 %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
1087 %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
1088 %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
1089 %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
1090 %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
1091 %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
1092 %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
1093 %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
1094 %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
1095 %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
1096 %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
1097 %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
1098 %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
1099 %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
1100 %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
1101 %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
1102 %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
1103 %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
1104 %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
1105 %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
1106 %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
1107 %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
1108 %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
1109 %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
1110 ret <32 x i8> %ret31
}

1113 define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
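; Same pattern as var_shuffle_v4i64, but with double elements; AVX512VL uses vpermpd.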
1114 ; AVX1-LABEL: var_shuffle_v4f64:
; AVX1: # BB#0:
1116 ; AVX1-NEXT: pushq %rbp
1117 ; AVX1-NEXT: movq %rsp, %rbp
1118 ; AVX1-NEXT: andq $-32, %rsp
1119 ; AVX1-NEXT: subq $64, %rsp
1120 ; AVX1-NEXT: vmovq %xmm1, %rax
1121 ; AVX1-NEXT: andl $3, %eax
1122 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
1123 ; AVX1-NEXT: andl $3, %ecx
1124 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1125 ; AVX1-NEXT: vmovq %xmm1, %rdx
1126 ; AVX1-NEXT: andl $3, %edx
1127 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
1128 ; AVX1-NEXT: andl $3, %esi
1129 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
1130 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1131 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1132 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1133 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1134 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1135 ; AVX1-NEXT: movq %rbp, %rsp
1136 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
1139 ; AVX2-LABEL: var_shuffle_v4f64:
; AVX2: # BB#0:
1141 ; AVX2-NEXT: pushq %rbp
1142 ; AVX2-NEXT: movq %rsp, %rbp
1143 ; AVX2-NEXT: andq $-32, %rsp
1144 ; AVX2-NEXT: subq $64, %rsp
1145 ; AVX2-NEXT: vmovq %xmm1, %rax
1146 ; AVX2-NEXT: andl $3, %eax
1147 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
1148 ; AVX2-NEXT: andl $3, %ecx
1149 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1150 ; AVX2-NEXT: vmovq %xmm1, %rdx
1151 ; AVX2-NEXT: andl $3, %edx
1152 ; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
1153 ; AVX2-NEXT: andl $3, %esi
1154 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
1155 ; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1156 ; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1157 ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1158 ; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1159 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1160 ; AVX2-NEXT: movq %rbp, %rsp
1161 ; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
1164 ; AVX512F-LABEL: var_shuffle_v4f64:
; AVX512F: # BB#0:
1166 ; AVX512F-NEXT: pushq %rbp
1167 ; AVX512F-NEXT: movq %rsp, %rbp
1168 ; AVX512F-NEXT: andq $-32, %rsp
1169 ; AVX512F-NEXT: subq $64, %rsp
1170 ; AVX512F-NEXT: vmovq %xmm1, %rax
1171 ; AVX512F-NEXT: andl $3, %eax
1172 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
1173 ; AVX512F-NEXT: andl $3, %ecx
1174 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
1175 ; AVX512F-NEXT: vmovq %xmm1, %rdx
1176 ; AVX512F-NEXT: andl $3, %edx
1177 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
1178 ; AVX512F-NEXT: andl $3, %esi
1179 ; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
1180 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1181 ; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1182 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1183 ; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1184 ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1185 ; AVX512F-NEXT: movq %rbp, %rsp
1186 ; AVX512F-NEXT: popq %rbp
1187 ; AVX512F-NEXT: retq
;
1189 ; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL: # BB#0:
1191 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
1192 ; AVX512VL-NEXT: retq
;
1194 ; AVX512VLBW-LABEL: var_shuffle_v4f64:
1195 ; AVX512VLBW: # BB#0:
1196 ; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
1197 ; AVX512VLBW-NEXT: retq
1198 %index0 = extractelement <4 x i64> %indices, i32 0
1199 %index1 = extractelement <4 x i64> %indices, i32 1
1200 %index2 = extractelement <4 x i64> %indices, i32 2
1201 %index3 = extractelement <4 x i64> %indices, i32 3
1202 %v0 = extractelement <4 x double> %v, i64 %index0
1203 %v1 = extractelement <4 x double> %v, i64 %index1
1204 %v2 = extractelement <4 x double> %v, i64 %index2
1205 %v3 = extractelement <4 x double> %v, i64 %index3
1206 %ret0 = insertelement <4 x double> undef, double %v0, i32 0
1207 %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
1208 %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
1209 %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
1210 ret <4 x double> %ret3
}

1213 define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
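; Same pattern as var_shuffle_v8i32, but with float elements; AVX2 and later (INT256) use vpermps.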
1214 ; AVX1-LABEL: var_shuffle_v8f32:
; AVX1: # BB#0:
1216 ; AVX1-NEXT: pushq %rbp
1217 ; AVX1-NEXT: movq %rsp, %rbp
1218 ; AVX1-NEXT: andq $-32, %rsp
1219 ; AVX1-NEXT: subq $64, %rsp
1220 ; AVX1-NEXT: vpextrq $1, %xmm1, %r8
1221 ; AVX1-NEXT: movq %r8, %rcx
1222 ; AVX1-NEXT: shrq $30, %rcx
1223 ; AVX1-NEXT: vmovq %xmm1, %r9
1224 ; AVX1-NEXT: movq %r9, %rdx
1225 ; AVX1-NEXT: shrq $30, %rdx
1226 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1227 ; AVX1-NEXT: vpextrq $1, %xmm1, %r10
1228 ; AVX1-NEXT: movq %r10, %rdi
1229 ; AVX1-NEXT: shrq $30, %rdi
1230 ; AVX1-NEXT: vmovq %xmm1, %rax
1231 ; AVX1-NEXT: movq %rax, %rsi
1232 ; AVX1-NEXT: shrq $30, %rsi
1233 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
1234 ; AVX1-NEXT: andl $7, %r9d
1235 ; AVX1-NEXT: andl $28, %edx
1236 ; AVX1-NEXT: andl $7, %r8d
1237 ; AVX1-NEXT: andl $28, %ecx
1238 ; AVX1-NEXT: andl $7, %eax
1239 ; AVX1-NEXT: andl $28, %esi
1240 ; AVX1-NEXT: andl $7, %r10d
1241 ; AVX1-NEXT: andl $28, %edi
1242 ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1243 ; AVX1-NEXT: movq %rsp, %rax
1244 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1245 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1246 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1247 ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1248 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
1249 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
1250 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
1251 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1252 ; AVX1-NEXT: movq %rbp, %rsp
1253 ; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
1256 ; INT256-LABEL: var_shuffle_v8f32:
; INT256: # BB#0:
1258 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
1260 %index0 = extractelement <8 x i32> %indices, i32 0
1261 %index1 = extractelement <8 x i32> %indices, i32 1
1262 %index2 = extractelement <8 x i32> %indices, i32 2
1263 %index3 = extractelement <8 x i32> %indices, i32 3
1264 %index4 = extractelement <8 x i32> %indices, i32 4
1265 %index5 = extractelement <8 x i32> %indices, i32 5
1266 %index6 = extractelement <8 x i32> %indices, i32 6
1267 %index7 = extractelement <8 x i32> %indices, i32 7
1268 %v0 = extractelement <8 x float> %v, i32 %index0
1269 %v1 = extractelement <8 x float> %v, i32 %index1
1270 %v2 = extractelement <8 x float> %v, i32 %index2
1271 %v3 = extractelement <8 x float> %v, i32 %index3
1272 %v4 = extractelement <8 x float> %v, i32 %index4
1273 %v5 = extractelement <8 x float> %v, i32 %index5
1274 %v6 = extractelement <8 x float> %v, i32 %index6
1275 %v7 = extractelement <8 x float> %v, i32 %index7
1276 %ret0 = insertelement <8 x float> undef, float %v0, i32 0
1277 %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
1278 %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
1279 %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
1280 %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
1281 %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
1282 %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
1283 %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
1284 ret <8 x float> %ret7
}