1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; Unary shuffle indices from registers
9 define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; Variable shuffle of <4 x double> with all four lanes selected by i64 register
; indices. Expected codegen (autogenerated checks - regenerate with
; utils/update_llc_test_checks.py rather than hand-editing) spills the vector
; to a 32-byte-aligned stack slot, masks each index to 0-3, reloads lane pairs
; with vmovsd/vmovhps, and recombines the halves with vinsertf128.
10 ; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
12 ; ALL-NEXT: pushq %rbp
13 ; ALL-NEXT: movq %rsp, %rbp
14 ; ALL-NEXT: andq $-32, %rsp
15 ; ALL-NEXT: subq $64, %rsp
16 ; ALL-NEXT: andl $3, %esi
17 ; ALL-NEXT: andl $3, %edi
18 ; ALL-NEXT: andl $3, %ecx
19 ; ALL-NEXT: andl $3, %edx
20 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
21 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
22 ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
23 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
24 ; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
25 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
26 ; ALL-NEXT: movq %rbp, %rsp
; IR body: extract each dynamically-indexed lane, then rebuild the vector.
29 %x0 = extractelement <4 x double> %x, i64 %i0
30 %x1 = extractelement <4 x double> %x, i64 %i1
31 %x2 = extractelement <4 x double> %x, i64 %i2
32 %x3 = extractelement <4 x double> %x, i64 %i3
33 %r0 = insertelement <4 x double> undef, double %x0, i32 0
34 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
35 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
36 %r3 = insertelement <4 x double> %r2, double %x3, i32 3
40 define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; Variant where lane 0 is undef and lane 3 is the constant 0.0, so only two
; indices (%i1 in esi, %i2 in edx) survive into the autogenerated expected
; codegen below - the other extracts are dead and folded away by llc.
41 ; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
43 ; ALL-NEXT: pushq %rbp
44 ; ALL-NEXT: movq %rsp, %rbp
45 ; ALL-NEXT: andq $-32, %rsp
46 ; ALL-NEXT: subq $64, %rsp
47 ; ALL-NEXT: andl $3, %edx
48 ; ALL-NEXT: andl $3, %esi
49 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
50 ; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
51 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
52 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
53 ; ALL-NEXT: movq %rbp, %rsp
56 %x0 = extractelement <4 x double> %x, i64 %i0
57 %x1 = extractelement <4 x double> %x, i64 %i1
58 %x2 = extractelement <4 x double> %x, i64 %i2
59 %x3 = extractelement <4 x double> %x, i64 %i3
60 %r0 = insertelement <4 x double> undef, double undef, i32 0
61 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
62 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
63 %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
67 define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; <4 x double> result built from a <2 x double> source: indices are masked to
; 0-1 and the 16-byte spill goes below %rsp, so no 32-byte stack-realignment
; frame appears in the autogenerated checks.
68 ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
70 ; ALL-NEXT: andl $1, %esi
71 ; ALL-NEXT: andl $1, %edi
72 ; ALL-NEXT: andl $1, %ecx
73 ; ALL-NEXT: andl $1, %edx
74 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
75 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
76 ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
77 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
78 ; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
79 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
81 %x0 = extractelement <2 x double> %x, i64 %i0
82 %x1 = extractelement <2 x double> %x, i64 %i1
83 %x2 = extractelement <2 x double> %x, i64 %i2
84 %x3 = extractelement <2 x double> %x, i64 %i3
85 %r0 = insertelement <4 x double> undef, double %x0, i32 0
86 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
87 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
88 %r3 = insertelement <4 x double> %r2, double %x3, i32 3
92 define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; Same shuffle pattern as the f64 case but on <4 x i64>; the autogenerated
; expected codegen gathers lanes with vmovsd and pairs them via vmovlhps
; before the final vinsertf128.
93 ; ALL-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
95 ; ALL-NEXT: pushq %rbp
96 ; ALL-NEXT: movq %rsp, %rbp
97 ; ALL-NEXT: andq $-32, %rsp
98 ; ALL-NEXT: subq $64, %rsp
99 ; ALL-NEXT: andl $3, %edi
100 ; ALL-NEXT: andl $3, %esi
101 ; ALL-NEXT: andl $3, %edx
102 ; ALL-NEXT: andl $3, %ecx
103 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
104 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
105 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
106 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
107 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
108 ; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
109 ; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
110 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
111 ; ALL-NEXT: movq %rbp, %rsp
112 ; ALL-NEXT: popq %rbp
114 %x0 = extractelement <4 x i64> %x, i64 %i0
115 %x1 = extractelement <4 x i64> %x, i64 %i1
116 %x2 = extractelement <4 x i64> %x, i64 %i2
117 %x3 = extractelement <4 x i64> %x, i64 %i3
118 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
119 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
120 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
121 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
125 define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; Lanes 2 and 3 are constant zero, so the autogenerated checks only expect the
; two low-lane loads; writing xmm0 implicitly zeroes the upper ymm half, which
; supplies the zero lanes for free.
126 ; ALL-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
128 ; ALL-NEXT: pushq %rbp
129 ; ALL-NEXT: movq %rsp, %rbp
130 ; ALL-NEXT: andq $-32, %rsp
131 ; ALL-NEXT: subq $64, %rsp
132 ; ALL-NEXT: andl $3, %edi
133 ; ALL-NEXT: andl $3, %esi
134 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
135 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
136 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
137 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
138 ; ALL-NEXT: movq %rbp, %rsp
139 ; ALL-NEXT: popq %rbp
141 %x0 = extractelement <4 x i64> %x, i64 %i0
142 %x1 = extractelement <4 x i64> %x, i64 %i1
143 %x2 = extractelement <4 x i64> %x, i64 %i2
144 %x3 = extractelement <4 x i64> %x, i64 %i3
145 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
146 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
147 %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
148 %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
152 define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; <4 x i64> built from a <2 x i64> source: indices masked to 0-1, 16-byte
; spill below %rsp (no realignment frame in the autogenerated checks).
153 ; ALL-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
155 ; ALL-NEXT: andl $1, %edi
156 ; ALL-NEXT: andl $1, %esi
157 ; ALL-NEXT: andl $1, %edx
158 ; ALL-NEXT: andl $1, %ecx
159 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
160 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
161 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
162 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
163 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
164 ; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
165 ; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
166 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
168 %x0 = extractelement <2 x i64> %x, i64 %i0
169 %x1 = extractelement <2 x i64> %x, i64 %i1
170 %x2 = extractelement <2 x i64> %x, i64 %i2
171 %x3 = extractelement <2 x i64> %x, i64 %i3
172 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
173 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
174 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
175 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
179 define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; Eight i32 indices; six arrive in 32-bit registers (the kill comments mark
; their implicit zero-extension to 64-bit) and the last two are read from the
; caller's stack at 16(%rbp)/24(%rbp). Lanes are gathered with vmovss and
; merged by a chain of vinsertps. Checks are autogenerated.
180 ; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
182 ; ALL-NEXT: pushq %rbp
183 ; ALL-NEXT: movq %rsp, %rbp
184 ; ALL-NEXT: andq $-32, %rsp
185 ; ALL-NEXT: subq $64, %rsp
186 ; ALL-NEXT: # kill: def $r9d killed $r9d def $r9
187 ; ALL-NEXT: # kill: def $r8d killed $r8d def $r8
188 ; ALL-NEXT: # kill: def $ecx killed $ecx def $rcx
189 ; ALL-NEXT: # kill: def $edx killed $edx def $rdx
190 ; ALL-NEXT: # kill: def $esi killed $esi def $rsi
191 ; ALL-NEXT: # kill: def $edi killed $edi def $rdi
192 ; ALL-NEXT: movl 24(%rbp), %r10d
193 ; ALL-NEXT: andl $7, %r10d
194 ; ALL-NEXT: movl 16(%rbp), %eax
195 ; ALL-NEXT: andl $7, %eax
196 ; ALL-NEXT: andl $7, %edi
197 ; ALL-NEXT: andl $7, %esi
198 ; ALL-NEXT: andl $7, %edx
199 ; ALL-NEXT: andl $7, %ecx
200 ; ALL-NEXT: andl $7, %r8d
201 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
202 ; ALL-NEXT: andl $7, %r9d
203 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
204 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
205 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
206 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
207 ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
208 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
209 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
210 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
211 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
212 ; ALL-NEXT: movq %rbp, %rsp
213 ; ALL-NEXT: popq %rbp
215 %x0 = extractelement <8 x float> %x, i32 %i0
216 %x1 = extractelement <8 x float> %x, i32 %i1
217 %x2 = extractelement <8 x float> %x, i32 %i2
218 %x3 = extractelement <8 x float> %x, i32 %i3
219 %x4 = extractelement <8 x float> %x, i32 %i4
220 %x5 = extractelement <8 x float> %x, i32 %i5
221 %x6 = extractelement <8 x float> %x, i32 %i6
222 %x7 = extractelement <8 x float> %x, i32 %i7
223 %r0 = insertelement <8 x float> undef, float %x0, i32 0
224 %r1 = insertelement <8 x float> %r0, float %x1, i32 1
225 %r2 = insertelement <8 x float> %r1, float %x2, i32 2
226 %r3 = insertelement <8 x float> %r2, float %x3, i32 3
227 %r4 = insertelement <8 x float> %r3, float %x4, i32 4
228 %r5 = insertelement <8 x float> %r4, float %x5, i32 5
229 %r6 = insertelement <8 x float> %r5, float %x6, i32 6
230 %r7 = insertelement <8 x float> %r6, float %x7, i32 7
234 define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; <8 x float> built from a <4 x float> source: indices masked to 0-3, 16-byte
; spill below %rsp (no frame), stack-passed indices matched with the
; {{[0-9]+}}(%rsp) regex since their offsets are frameless. Autogenerated.
235 ; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
237 ; ALL-NEXT: # kill: def $r9d killed $r9d def $r9
238 ; ALL-NEXT: # kill: def $r8d killed $r8d def $r8
239 ; ALL-NEXT: # kill: def $ecx killed $ecx def $rcx
240 ; ALL-NEXT: # kill: def $edx killed $edx def $rdx
241 ; ALL-NEXT: # kill: def $esi killed $esi def $rsi
242 ; ALL-NEXT: # kill: def $edi killed $edi def $rdi
243 ; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
244 ; ALL-NEXT: andl $3, %r10d
245 ; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax
246 ; ALL-NEXT: andl $3, %eax
247 ; ALL-NEXT: andl $3, %edi
248 ; ALL-NEXT: andl $3, %esi
249 ; ALL-NEXT: andl $3, %edx
250 ; ALL-NEXT: andl $3, %ecx
251 ; ALL-NEXT: andl $3, %r8d
252 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
253 ; ALL-NEXT: andl $3, %r9d
254 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
255 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
256 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
257 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
258 ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
259 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
260 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
261 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
262 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
264 %x0 = extractelement <4 x float> %x, i32 %i0
265 %x1 = extractelement <4 x float> %x, i32 %i1
266 %x2 = extractelement <4 x float> %x, i32 %i2
267 %x3 = extractelement <4 x float> %x, i32 %i3
268 %x4 = extractelement <4 x float> %x, i32 %i4
269 %x5 = extractelement <4 x float> %x, i32 %i5
270 %x6 = extractelement <4 x float> %x, i32 %i6
271 %x7 = extractelement <4 x float> %x, i32 %i7
272 %r0 = insertelement <8 x float> undef, float %x0, i32 0
273 %r1 = insertelement <8 x float> %r0, float %x1, i32 1
274 %r2 = insertelement <8 x float> %r1, float %x2, i32 2
275 %r3 = insertelement <8 x float> %r2, float %x3, i32 3
276 %r4 = insertelement <8 x float> %r3, float %x4, i32 4
277 %r5 = insertelement <8 x float> %r4, float %x5, i32 5
278 %r6 = insertelement <8 x float> %r5, float %x6, i32 6
279 %r7 = insertelement <8 x float> %r6, float %x7, i32 7
283 define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; Sixteen i32 indices (6 in registers, 10 on the caller's stack at fixed
; offsets from %rbp). Word lanes are gathered with movzwl/vpinsrw from the
; 32-byte spill slot. The AVX1 and AVX2 autogenerated expectations differ only
; in the final 128-bit join (vinsertf128 vs vinserti128), hence the two
; per-prefix check blocks below.
284 ; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
286 ; AVX1-NEXT: pushq %rbp
287 ; AVX1-NEXT: movq %rsp, %rbp
288 ; AVX1-NEXT: andq $-32, %rsp
289 ; AVX1-NEXT: subq $64, %rsp
290 ; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9
291 ; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8
292 ; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx
293 ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx
294 ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi
295 ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
296 ; AVX1-NEXT: andl $15, %edi
297 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
298 ; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
299 ; AVX1-NEXT: vmovd %eax, %xmm0
300 ; AVX1-NEXT: andl $15, %esi
301 ; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
302 ; AVX1-NEXT: andl $15, %edx
303 ; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
304 ; AVX1-NEXT: andl $15, %ecx
305 ; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
306 ; AVX1-NEXT: andl $15, %r8d
307 ; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
308 ; AVX1-NEXT: andl $15, %r9d
309 ; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
310 ; AVX1-NEXT: movl 16(%rbp), %eax
311 ; AVX1-NEXT: andl $15, %eax
312 ; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
313 ; AVX1-NEXT: movl 24(%rbp), %eax
314 ; AVX1-NEXT: andl $15, %eax
315 ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
316 ; AVX1-NEXT: movl 32(%rbp), %eax
317 ; AVX1-NEXT: andl $15, %eax
318 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
319 ; AVX1-NEXT: vmovd %eax, %xmm1
320 ; AVX1-NEXT: movl 40(%rbp), %eax
321 ; AVX1-NEXT: andl $15, %eax
322 ; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
323 ; AVX1-NEXT: movl 48(%rbp), %eax
324 ; AVX1-NEXT: andl $15, %eax
325 ; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
326 ; AVX1-NEXT: movl 56(%rbp), %eax
327 ; AVX1-NEXT: andl $15, %eax
328 ; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
329 ; AVX1-NEXT: movl 64(%rbp), %eax
330 ; AVX1-NEXT: andl $15, %eax
331 ; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
332 ; AVX1-NEXT: movl 72(%rbp), %eax
333 ; AVX1-NEXT: andl $15, %eax
334 ; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
335 ; AVX1-NEXT: movl 80(%rbp), %eax
336 ; AVX1-NEXT: andl $15, %eax
337 ; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
338 ; AVX1-NEXT: movl 88(%rbp), %eax
339 ; AVX1-NEXT: andl $15, %eax
340 ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
341 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
342 ; AVX1-NEXT: movq %rbp, %rsp
343 ; AVX1-NEXT: popq %rbp
346 ; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
348 ; AVX2-NEXT: pushq %rbp
349 ; AVX2-NEXT: movq %rsp, %rbp
350 ; AVX2-NEXT: andq $-32, %rsp
351 ; AVX2-NEXT: subq $64, %rsp
352 ; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
353 ; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
354 ; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
355 ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
356 ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
357 ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
358 ; AVX2-NEXT: andl $15, %edi
359 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
360 ; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
361 ; AVX2-NEXT: vmovd %eax, %xmm0
362 ; AVX2-NEXT: andl $15, %esi
363 ; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
364 ; AVX2-NEXT: andl $15, %edx
365 ; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
366 ; AVX2-NEXT: andl $15, %ecx
367 ; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
368 ; AVX2-NEXT: andl $15, %r8d
369 ; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
370 ; AVX2-NEXT: andl $15, %r9d
371 ; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
372 ; AVX2-NEXT: movl 16(%rbp), %eax
373 ; AVX2-NEXT: andl $15, %eax
374 ; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
375 ; AVX2-NEXT: movl 24(%rbp), %eax
376 ; AVX2-NEXT: andl $15, %eax
377 ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
378 ; AVX2-NEXT: movl 32(%rbp), %eax
379 ; AVX2-NEXT: andl $15, %eax
380 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
381 ; AVX2-NEXT: vmovd %eax, %xmm1
382 ; AVX2-NEXT: movl 40(%rbp), %eax
383 ; AVX2-NEXT: andl $15, %eax
384 ; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
385 ; AVX2-NEXT: movl 48(%rbp), %eax
386 ; AVX2-NEXT: andl $15, %eax
387 ; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
388 ; AVX2-NEXT: movl 56(%rbp), %eax
389 ; AVX2-NEXT: andl $15, %eax
390 ; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
391 ; AVX2-NEXT: movl 64(%rbp), %eax
392 ; AVX2-NEXT: andl $15, %eax
393 ; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
394 ; AVX2-NEXT: movl 72(%rbp), %eax
395 ; AVX2-NEXT: andl $15, %eax
396 ; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
397 ; AVX2-NEXT: movl 80(%rbp), %eax
398 ; AVX2-NEXT: andl $15, %eax
399 ; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
400 ; AVX2-NEXT: movl 88(%rbp), %eax
401 ; AVX2-NEXT: andl $15, %eax
402 ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
403 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
404 ; AVX2-NEXT: movq %rbp, %rsp
405 ; AVX2-NEXT: popq %rbp
407 %x0 = extractelement <16 x i16> %x, i32 %i0
408 %x1 = extractelement <16 x i16> %x, i32 %i1
409 %x2 = extractelement <16 x i16> %x, i32 %i2
410 %x3 = extractelement <16 x i16> %x, i32 %i3
411 %x4 = extractelement <16 x i16> %x, i32 %i4
412 %x5 = extractelement <16 x i16> %x, i32 %i5
413 %x6 = extractelement <16 x i16> %x, i32 %i6
414 %x7 = extractelement <16 x i16> %x, i32 %i7
415 %x8 = extractelement <16 x i16> %x, i32 %i8
416 %x9 = extractelement <16 x i16> %x, i32 %i9
417 %x10 = extractelement <16 x i16> %x, i32 %i10
418 %x11 = extractelement <16 x i16> %x, i32 %i11
419 %x12 = extractelement <16 x i16> %x, i32 %i12
420 %x13 = extractelement <16 x i16> %x, i32 %i13
421 %x14 = extractelement <16 x i16> %x, i32 %i14
422 %x15 = extractelement <16 x i16> %x, i32 %i15
423 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
424 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
425 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
426 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
427 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
428 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
429 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
430 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
431 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
432 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
433 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
434 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
435 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
436 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
437 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
438 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
442 define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; <16 x i16> built from an <8 x i16> source: indices masked to 0-7, the
; 16-byte spill lives at -24(%rsp) (frameless, so stack-passed indices are
; matched via the {{[0-9]+}}(%rsp) regex). AVX1/AVX2 autogenerated
; expectations again differ only in vinsertf128 vs vinserti128.
443 ; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
445 ; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9
446 ; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8
447 ; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx
448 ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx
449 ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi
450 ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
451 ; AVX1-NEXT: andl $7, %edi
452 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
453 ; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
454 ; AVX1-NEXT: vmovd %eax, %xmm0
455 ; AVX1-NEXT: andl $7, %esi
456 ; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
457 ; AVX1-NEXT: andl $7, %edx
458 ; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
459 ; AVX1-NEXT: andl $7, %ecx
460 ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
461 ; AVX1-NEXT: andl $7, %r8d
462 ; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
463 ; AVX1-NEXT: andl $7, %r9d
464 ; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
465 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
466 ; AVX1-NEXT: andl $7, %eax
467 ; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
468 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
469 ; AVX1-NEXT: andl $7, %eax
470 ; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
471 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
472 ; AVX1-NEXT: andl $7, %eax
473 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
474 ; AVX1-NEXT: vmovd %eax, %xmm1
475 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
476 ; AVX1-NEXT: andl $7, %eax
477 ; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
478 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
479 ; AVX1-NEXT: andl $7, %eax
480 ; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
481 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
482 ; AVX1-NEXT: andl $7, %eax
483 ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
484 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
485 ; AVX1-NEXT: andl $7, %eax
486 ; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
487 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
488 ; AVX1-NEXT: andl $7, %eax
489 ; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
490 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
491 ; AVX1-NEXT: andl $7, %eax
492 ; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
493 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
494 ; AVX1-NEXT: andl $7, %eax
495 ; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
496 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
499 ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
501 ; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
502 ; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
503 ; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
504 ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
505 ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
506 ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
507 ; AVX2-NEXT: andl $7, %edi
508 ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
509 ; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
510 ; AVX2-NEXT: vmovd %eax, %xmm0
511 ; AVX2-NEXT: andl $7, %esi
512 ; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
513 ; AVX2-NEXT: andl $7, %edx
514 ; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
515 ; AVX2-NEXT: andl $7, %ecx
516 ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
517 ; AVX2-NEXT: andl $7, %r8d
518 ; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
519 ; AVX2-NEXT: andl $7, %r9d
520 ; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
521 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
522 ; AVX2-NEXT: andl $7, %eax
523 ; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
524 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
525 ; AVX2-NEXT: andl $7, %eax
526 ; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
527 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
528 ; AVX2-NEXT: andl $7, %eax
529 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
530 ; AVX2-NEXT: vmovd %eax, %xmm1
531 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
532 ; AVX2-NEXT: andl $7, %eax
533 ; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
534 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
535 ; AVX2-NEXT: andl $7, %eax
536 ; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
537 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
538 ; AVX2-NEXT: andl $7, %eax
539 ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
540 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
541 ; AVX2-NEXT: andl $7, %eax
542 ; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
543 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
544 ; AVX2-NEXT: andl $7, %eax
545 ; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
546 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
547 ; AVX2-NEXT: andl $7, %eax
548 ; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
549 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
550 ; AVX2-NEXT: andl $7, %eax
551 ; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
552 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
554 %x0 = extractelement <8 x i16> %x, i32 %i0
555 %x1 = extractelement <8 x i16> %x, i32 %i1
556 %x2 = extractelement <8 x i16> %x, i32 %i2
557 %x3 = extractelement <8 x i16> %x, i32 %i3
558 %x4 = extractelement <8 x i16> %x, i32 %i4
559 %x5 = extractelement <8 x i16> %x, i32 %i5
560 %x6 = extractelement <8 x i16> %x, i32 %i6
561 %x7 = extractelement <8 x i16> %x, i32 %i7
562 %x8 = extractelement <8 x i16> %x, i32 %i8
563 %x9 = extractelement <8 x i16> %x, i32 %i9
564 %x10 = extractelement <8 x i16> %x, i32 %i10
565 %x11 = extractelement <8 x i16> %x, i32 %i11
566 %x12 = extractelement <8 x i16> %x, i32 %i12
567 %x13 = extractelement <8 x i16> %x, i32 %i13
568 %x14 = extractelement <8 x i16> %x, i32 %i14
569 %x15 = extractelement <8 x i16> %x, i32 %i15
570 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
571 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
572 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
573 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
574 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
575 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
576 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
577 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
578 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
579 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
580 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
581 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
582 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
583 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
584 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
585 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
590 ; Unary shuffle indices from memory
593 define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; Memory-sourced variant: the four i64 indices are loaded from i64* %i
; (only the low 2 bits matter, hence the 32-bit andl after each 64-bit load
; in the autogenerated checks), then the same spill/reload lowering applies.
594 ; ALL-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
596 ; ALL-NEXT: pushq %rbp
597 ; ALL-NEXT: movq %rsp, %rbp
598 ; ALL-NEXT: andq $-32, %rsp
599 ; ALL-NEXT: subq $64, %rsp
600 ; ALL-NEXT: movq (%rdi), %rax
601 ; ALL-NEXT: movq 8(%rdi), %rcx
602 ; ALL-NEXT: andl $3, %eax
603 ; ALL-NEXT: andl $3, %ecx
604 ; ALL-NEXT: movq 16(%rdi), %rdx
605 ; ALL-NEXT: andl $3, %edx
606 ; ALL-NEXT: movq 24(%rdi), %rsi
607 ; ALL-NEXT: andl $3, %esi
608 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
609 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
610 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
611 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
612 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
613 ; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
614 ; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
615 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
616 ; ALL-NEXT: movq %rbp, %rsp
617 ; ALL-NEXT: popq %rbp
619 %p0 = getelementptr inbounds i64, i64* %i, i32 0
620 %p1 = getelementptr inbounds i64, i64* %i, i32 1
621 %p2 = getelementptr inbounds i64, i64* %i, i32 2
622 %p3 = getelementptr inbounds i64, i64* %i, i32 3
623 %i0 = load i64, i64* %p0, align 4
624 %i1 = load i64, i64* %p1, align 4
625 %i2 = load i64, i64* %p2, align 4
626 %i3 = load i64, i64* %p3, align 4
627 %x0 = extractelement <4 x i64> %x, i64 %i0
628 %x1 = extractelement <4 x i64> %x, i64 %i1
629 %x2 = extractelement <4 x i64> %x, i64 %i2
630 %x3 = extractelement <4 x i64> %x, i64 %i3
631 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
632 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
633 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
634 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
638 define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; Memory-sourced indices with a <2 x i64> source vector: indices masked to
; 0-1, 16-byte spill below %rsp (no realignment frame in the autogenerated
; checks).
639 ; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
641 ; ALL-NEXT: movq (%rdi), %rax
642 ; ALL-NEXT: movq 8(%rdi), %rcx
643 ; ALL-NEXT: andl $1, %eax
644 ; ALL-NEXT: andl $1, %ecx
645 ; ALL-NEXT: movq 16(%rdi), %rdx
646 ; ALL-NEXT: andl $1, %edx
647 ; ALL-NEXT: movq 24(%rdi), %rsi
648 ; ALL-NEXT: andl $1, %esi
649 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
650 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
651 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
652 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
653 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
654 ; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
655 ; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
656 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
658 %p0 = getelementptr inbounds i64, i64* %i, i32 0
659 %p1 = getelementptr inbounds i64, i64* %i, i32 1
660 %p2 = getelementptr inbounds i64, i64* %i, i32 2
661 %p3 = getelementptr inbounds i64, i64* %i, i32 3
662 %i0 = load i64, i64* %p0, align 4
663 %i1 = load i64, i64* %p1, align 4
664 %i2 = load i64, i64* %p2, align 4
665 %i3 = load i64, i64* %p3, align 4
666 %x0 = extractelement <2 x i64> %x, i64 %i0
667 %x1 = extractelement <2 x i64> %x, i64 %i1
668 %x2 = extractelement <2 x i64> %x, i64 %i2
669 %x3 = extractelement <2 x i64> %x, i64 %i3
670 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
671 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
672 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
673 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3