; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI

define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}

define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE3-NEXT: pand %xmm4, %xmm3
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE3-NEXT: por %xmm3, %xmm2
; SSE3-NEXT: por %xmm2, %xmm1
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: pandn %xmm0, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_zero_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: pandn %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: por %xmm3, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpeqq %xmm1, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: pandn %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_zero_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
  %idx0 = extractelement <2 x i64> %or, i64 0
  %idx1 = extractelement <2 x i64> %or, i64 1
  %elt0 = extractelement <2 x i64> %v, i64 %idx0
  %elt1 = extractelement <2 x i64> %v, i64 %idx1
  %vec0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
  %vec1 = insertelement <2 x i64> %vec0, i64 %elt1, i64 1
  %res = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vec1
  ret <2 x i64> %res
}

define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: andl $3, %edx
; SSE3-NEXT: andl $3, %esi
; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}

define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE3-NEXT: pxor %xmm1, %xmm0
; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; SSE3-NEXT: movd %xmm3, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm3, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: movaps %xmm2, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: andl $3, %edx
; SSE3-NEXT: andl $3, %esi
; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: pandn %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_zero_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT: vpmaxud %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
  %idx0 = extractelement <4 x i32> %or, i64 0
  %idx1 = extractelement <4 x i32> %or, i64 1
  %idx2 = extractelement <4 x i32> %or, i64 2
  %idx3 = extractelement <4 x i32> %or, i64 3
  %elt0 = extractelement <4 x i32> %v, i32 %idx0
  %elt1 = extractelement <4 x i32> %v, i32 %idx1
  %elt2 = extractelement <4 x i32> %v, i32 %idx2
  %elt3 = extractelement <4 x i32> %v, i32 %idx3
  %vec0 = insertelement <4 x i32> poison, i32 %elt0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
  %res = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vec3
  ret <4 x i32> %res
}

define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: pextrw $0, %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: pextrw $2, %xmm1, %edx
; SSE3-NEXT: pextrw $3, %xmm1, %esi
; SSE3-NEXT: pextrw $4, %xmm1, %edi
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %r9d
; SSE3-NEXT: pextrw $7, %xmm1, %r10d
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $7, %eax
; SSE3-NEXT: andl $7, %ecx
; SSE3-NEXT: andl $7, %edx
; SSE3-NEXT: andl $7, %esi
; SSE3-NEXT: andl $7, %edi
; SSE3-NEXT: andl $7, %r8d
; SSE3-NEXT: andl $7, %r9d
; SSE3-NEXT: andl $7, %r10d
; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSE3-NEXT: movd %r10d, %xmm0
; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d
; SSE3-NEXT: movd %r9d, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSE3-NEXT: movd %edi, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT: movd %ecx, %xmm3
; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVXNOVLBW-LABEL: var_shuffle_v8i16:
; AVXNOVLBW: # %bb.0:
; AVXNOVLBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVXNOVLBW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVXNOVLBW-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <8 x i16> %indices, i32 0
  %index1 = extractelement <8 x i16> %indices, i32 1
  %index2 = extractelement <8 x i16> %indices, i32 2
  %index3 = extractelement <8 x i16> %indices, i32 3
  %index4 = extractelement <8 x i16> %indices, i32 4
  %index5 = extractelement <8 x i16> %indices, i32 5
  %index6 = extractelement <8 x i16> %indices, i32 6
  %index7 = extractelement <8 x i16> %indices, i32 7
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
  ret <8 x i16> %ret7
}

define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
; SSE3-NEXT: psubusw %xmm1, %xmm3
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: pcmpeqw %xmm3, %xmm0
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: pextrw $0, %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: pextrw $2, %xmm1, %edx
; SSE3-NEXT: pextrw $3, %xmm1, %esi
; SSE3-NEXT: pextrw $4, %xmm1, %edi
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %r9d
; SSE3-NEXT: pextrw $7, %xmm1, %r10d
; SSE3-NEXT: movdqa %xmm2, -24(%rsp)
; SSE3-NEXT: andl $7, %eax
; SSE3-NEXT: andl $7, %ecx
; SSE3-NEXT: andl $7, %edx
; SSE3-NEXT: andl $7, %esi
; SSE3-NEXT: andl $7, %edi
; SSE3-NEXT: andl $7, %r8d
; SSE3-NEXT: andl $7, %r9d
; SSE3-NEXT: andl $7, %r10d
; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d
; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d
; SSE3-NEXT: movd %r8d, %xmm1
; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSE3-NEXT: movd %edi, %xmm3
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT: movd %ecx, %xmm1
; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE3-NEXT: pandn %xmm4, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_zero_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
; SSSE3-NEXT: psubusw %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpeqw %xmm2, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
; SSE41-NEXT: pmaxuw %xmm1, %xmm2
; SSE41-NEXT: pcmpeqw %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpor %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
  %idx0 = extractelement <8 x i16> %or, i64 0
  %idx1 = extractelement <8 x i16> %or, i64 1
  %idx2 = extractelement <8 x i16> %or, i64 2
  %idx3 = extractelement <8 x i16> %or, i64 3
  %idx4 = extractelement <8 x i16> %or, i64 4
  %idx5 = extractelement <8 x i16> %or, i64 5
  %idx6 = extractelement <8 x i16> %or, i64 6
  %idx7 = extractelement <8 x i16> %or, i64 7
  %elt0 = extractelement <8 x i16> %v, i16 %idx0
  %elt1 = extractelement <8 x i16> %v, i16 %idx1
  %elt2 = extractelement <8 x i16> %v, i16 %idx2
  %elt3 = extractelement <8 x i16> %v, i16 %idx3
  %elt4 = extractelement <8 x i16> %v, i16 %idx4
  %elt5 = extractelement <8 x i16> %v, i16 %idx5
  %elt6 = extractelement <8 x i16> %v, i16 %idx6
  %elt7 = extractelement <8 x i16> %v, i16 %idx7
  %vec0 = insertelement <8 x i16> poison, i16 %elt0, i64 0
  %vec1 = insertelement <8 x i16> %vec0, i16 %elt1, i64 1
  %vec2 = insertelement <8 x i16> %vec1, i16 %elt2, i64 2
  %vec3 = insertelement <8 x i16> %vec2, i16 %elt3, i64 3
  %vec4 = insertelement <8 x i16> %vec3, i16 %elt4, i64 4
  %vec5 = insertelement <8 x i16> %vec4, i16 %elt5, i64 5
  %vec6 = insertelement <8 x i16> %vec5, i16 %elt6, i64 6
  %vec7 = insertelement <8 x i16> %vec6, i16 %elt7, i64 7
  %res = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vec7
  ret <8 x i16> %res
}

define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm1, -40(%rsp)
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movzbl -25(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movzbl -26(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movzbl -27(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: movzbl -28(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: movzbl -29(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: movzbl -30(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
; SSE3-NEXT: movzbl -31(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: movzbl -32(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: movzbl -33(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: movzbl -34(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: movzbl -35(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movzbl -36(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: movzbl -37(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: movzbl -38(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: movzbl -39(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movzbl -40(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE3-NEXT: pmaxub %xmm1, %xmm0
; SSE3-NEXT: pcmpeqb %xmm1, %xmm0
; SSE3-NEXT: por %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, -40(%rsp)
; SSE3-NEXT: movaps %xmm2, -24(%rsp)
; SSE3-NEXT: movzbl -25(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movzbl -26(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movzbl -27(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: movzbl -28(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: movzbl -29(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: movzbl -30(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
; SSE3-NEXT: movzbl -31(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: movzbl -32(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: movzbl -33(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: movzbl -34(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: movzbl -35(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movzbl -36(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: movzbl -37(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: movzbl -38(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: movzbl -39(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movzbl -40(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: pandn %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_zero_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSSE3-NEXT: pmaxub %xmm1, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE41-NEXT: pmaxub %xmm1, %xmm2
; SSE41-NEXT: pcmpeqb %xmm1, %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
  %or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
  %idx0 = extractelement <16 x i8> %or, i64 0
  %idx1 = extractelement <16 x i8> %or, i64 1
  %idx2 = extractelement <16 x i8> %or, i64 2
  %idx3 = extractelement <16 x i8> %or, i64 3
  %idx4 = extractelement <16 x i8> %or, i64 4
  %idx5 = extractelement <16 x i8> %or, i64 5
  %idx6 = extractelement <16 x i8> %or, i64 6
  %idx7 = extractelement <16 x i8> %or, i64 7
  %idx8 = extractelement <16 x i8> %or, i64 8
  %idx9 = extractelement <16 x i8> %or, i64 9
  %idxA = extractelement <16 x i8> %or, i64 10
  %idxB = extractelement <16 x i8> %or, i64 11
  %idxC = extractelement <16 x i8> %or, i64 12
  %idxD = extractelement <16 x i8> %or, i64 13
  %idxE = extractelement <16 x i8> %or, i64 14
  %idxF = extractelement <16 x i8> %or, i64 15
  %elt0 = extractelement <16 x i8> %v, i8 %idx0
  %elt1 = extractelement <16 x i8> %v, i8 %idx1
  %elt2 = extractelement <16 x i8> %v, i8 %idx2
  %elt3 = extractelement <16 x i8> %v, i8 %idx3
  %elt4 = extractelement <16 x i8> %v, i8 %idx4
  %elt5 = extractelement <16 x i8> %v, i8 %idx5
  %elt6 = extractelement <16 x i8> %v, i8 %idx6
  %elt7 = extractelement <16 x i8> %v, i8 %idx7
  %elt8 = extractelement <16 x i8> %v, i8 %idx8
  %elt9 = extractelement <16 x i8> %v, i8 %idx9
  %eltA = extractelement <16 x i8> %v, i8 %idxA
  %eltB = extractelement <16 x i8> %v, i8 %idxB
  %eltC = extractelement <16 x i8> %v, i8 %idxC
  %eltD = extractelement <16 x i8> %v, i8 %idxD
  %eltE = extractelement <16 x i8> %v, i8 %idxE
  %eltF = extractelement <16 x i8> %v, i8 %idxF
  %vec0 = insertelement <16 x i8> poison, i8 %elt0, i64 0
  %vec1 = insertelement <16 x i8> %vec0, i8 %elt1, i64 1
  %vec2 = insertelement <16 x i8> %vec1, i8 %elt2, i64 2
  %vec3 = insertelement <16 x i8> %vec2, i8 %elt3, i64 3
  %vec4 = insertelement <16 x i8> %vec3, i8 %elt4, i64 4
  %vec5 = insertelement <16 x i8> %vec4, i8 %elt5, i64 5
  %vec6 = insertelement <16 x i8> %vec5, i8 %elt6, i64 6
  %vec7 = insertelement <16 x i8> %vec6, i8 %elt7, i64 7
  %vec8 = insertelement <16 x i8> %vec7, i8 %elt8, i64 8
  %vec9 = insertelement <16 x i8> %vec8, i8 %elt9, i64 9
  %vecA = insertelement <16 x i8> %vec9, i8 %eltA, i64 10
  %vecB = insertelement <16 x i8> %vecA, i8 %eltB, i64 11
  %vecC = insertelement <16 x i8> %vecB, i8 %eltC, i64 12
  %vecD = insertelement <16 x i8> %vecC, i8 %eltD, i64 13
  %vecE = insertelement <16 x i8> %vecD, i8 %eltE, i64 14
  %vecF = insertelement <16 x i8> %vecE, i8 %eltF, i64 15
  %res = select <16 x i1> %cmp, <16 x i8> zeroinitializer, <16 x i8> %vecF
  ret <16 x i8> %res
}

define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2f64:
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %ret0 = insertelement <2 x double> undef, double %v0, i32 0
  %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
  ret <2 x double> %ret1
}

define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v2f64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE3-NEXT: pand %xmm4, %xmm3
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE3-NEXT: por %xmm3, %xmm2
; SSE3-NEXT: por %xmm2, %xmm1
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT: pandn %xmm0, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_zero_v2f64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT: pandn %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v2f64:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_zero_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
  %idx0 = extractelement <2 x i64> %or, i64 0
  %idx1 = extractelement <2 x i64> %or, i64 1
  %elt0 = extractelement <2 x double> %v, i64 %idx0
  %elt1 = extractelement <2 x double> %v, i64 %idx1
  %vec0 = insertelement <2 x double> poison, double %elt0, i64 0
  %vec1 = insertelement <2 x double> %vec0, double %elt1, i64 1
  %res = select <2 x i1> %cmp, <2 x double> zeroinitializer, <2 x double> %vec1
  ret <2 x double> %res
}

define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4f32:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: andl $3, %edx
; SSE3-NEXT: andl $3, %esi
; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x float> %v, i32 %index0
  %v1 = extractelement <4 x float> %v, i32 %index1
  %v2 = extractelement <4 x float> %v, i32 %index2
  %v3 = extractelement <4 x float> %v, i32 %index3
  %ret0 = insertelement <4 x float> undef, float %v0, i32 0
  %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
  ret <4 x float> %ret3
}
1237 define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
1238 ; SSE3-LABEL: var_shuffle_zero_v4f32:
1240 ; SSE3-NEXT: movaps %xmm0, %xmm2
1241 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
1242 ; SSE3-NEXT: pxor %xmm1, %xmm0
1243 ; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1244 ; SSE3-NEXT: por %xmm0, %xmm1
1245 ; SSE3-NEXT: movd %xmm1, %eax
1246 ; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
1247 ; SSE3-NEXT: movd %xmm3, %ecx
1248 ; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1249 ; SSE3-NEXT: movd %xmm3, %edx
1250 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1251 ; SSE3-NEXT: movd %xmm1, %esi
1252 ; SSE3-NEXT: movaps %xmm2, -24(%rsp)
1253 ; SSE3-NEXT: andl $3, %eax
1254 ; SSE3-NEXT: andl $3, %ecx
1255 ; SSE3-NEXT: andl $3, %edx
1256 ; SSE3-NEXT: andl $3, %esi
1257 ; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
1258 ; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
1259 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1260 ; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
1261 ; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
1262 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1263 ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1264 ; SSE3-NEXT: pandn %xmm1, %xmm0
; SSSE3-LABEL: var_shuffle_zero_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v4f32:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_zero_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_zero_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT: vpmaxud %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
  %idx0 = extractelement <4 x i32> %or, i64 0
  %idx1 = extractelement <4 x i32> %or, i64 1
  %idx2 = extractelement <4 x i32> %or, i64 2
  %idx3 = extractelement <4 x i32> %or, i64 3
  %elt0 = extractelement <4 x float> %v, i32 %idx0
  %elt1 = extractelement <4 x float> %v, i32 %idx1
  %elt2 = extractelement <4 x float> %v, i32 %idx2
  %elt3 = extractelement <4 x float> %v, i32 %idx3
  %vec0 = insertelement <4 x float> poison, float %elt0, i64 0
  %vec1 = insertelement <4 x float> %vec0, float %elt1, i64 1
  %vec2 = insertelement <4 x float> %vec1, float %elt2, i64 2
  %vec3 = insertelement <4 x float> %vec2, float %elt3, i64 3
  %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> %vec3
  ret <4 x float> %res
}
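
; The index vector is twice as wide as the 16-byte source; only the low
; 16 index bytes are actually used, so SSSE3 and later can lower the
; whole sequence to a single (v)pshufb.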
define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm1, -40(%rsp)
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movzbl -25(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movzbl -26(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movzbl -27(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: movzbl -28(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: movzbl -29(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: movzbl -30(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
; SSE3-NEXT: movzbl -31(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: movzbl -32(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: movzbl -33(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: movzbl -34(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: movzbl -35(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movzbl -36(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: movzbl -37(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: movzbl -38(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: movzbl -39(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movzbl -40(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}
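
; The reverse case: 16 index bytes selecting from a 32-byte source.
; SSE targets scalarise through the stack; AVX1 and later pshufb each
; 16-byte half and blend on the index MSB; XOP's vpperm and
; AVX512VBMI's vpermb handle it in a single instruction.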
define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8
; SSE3-NEXT: movaps %xmm2, -128(%rsp)
; SSE3-NEXT: movaps %xmm1, 400(%rsp)
; SSE3-NEXT: movaps %xmm0, 384(%rsp)
; SSE3-NEXT: movzbl -128(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, 368(%rsp)
; SSE3-NEXT: movaps %xmm0, 352(%rsp)
; SSE3-NEXT: movzbl -127(%rsp), %ecx
; SSE3-NEXT: movaps %xmm1, 336(%rsp)
; SSE3-NEXT: movaps %xmm0, 320(%rsp)
; SSE3-NEXT: movzbl -126(%rsp), %edx
; SSE3-NEXT: movaps %xmm1, 304(%rsp)
; SSE3-NEXT: movaps %xmm0, 288(%rsp)
; SSE3-NEXT: movzbl -125(%rsp), %esi
; SSE3-NEXT: movaps %xmm1, 272(%rsp)
; SSE3-NEXT: movaps %xmm0, 256(%rsp)
; SSE3-NEXT: movzbl -124(%rsp), %edi
; SSE3-NEXT: movaps %xmm1, 240(%rsp)
; SSE3-NEXT: movaps %xmm0, 224(%rsp)
; SSE3-NEXT: movzbl -123(%rsp), %r8d
; SSE3-NEXT: movaps %xmm1, 208(%rsp)
; SSE3-NEXT: movaps %xmm0, 192(%rsp)
; SSE3-NEXT: movzbl -122(%rsp), %r9d
; SSE3-NEXT: movaps %xmm1, 176(%rsp)
; SSE3-NEXT: movaps %xmm0, 160(%rsp)
; SSE3-NEXT: movzbl -121(%rsp), %r10d
; SSE3-NEXT: movaps %xmm1, 144(%rsp)
; SSE3-NEXT: movaps %xmm0, 128(%rsp)
; SSE3-NEXT: movzbl -120(%rsp), %r11d
; SSE3-NEXT: movaps %xmm1, 112(%rsp)
; SSE3-NEXT: movaps %xmm0, 96(%rsp)
; SSE3-NEXT: movzbl -119(%rsp), %ebx
; SSE3-NEXT: movaps %xmm1, 80(%rsp)
; SSE3-NEXT: movaps %xmm0, 64(%rsp)
; SSE3-NEXT: movzbl -118(%rsp), %r14d
; SSE3-NEXT: movaps %xmm1, 48(%rsp)
; SSE3-NEXT: movaps %xmm0, 32(%rsp)
; SSE3-NEXT: movzbl -117(%rsp), %r15d
; SSE3-NEXT: movaps %xmm1, 16(%rsp)
; SSE3-NEXT: movaps %xmm0, (%rsp)
; SSE3-NEXT: movzbl -116(%rsp), %r12d
; SSE3-NEXT: movaps %xmm1, -16(%rsp)
; SSE3-NEXT: movaps %xmm0, -32(%rsp)
; SSE3-NEXT: movzbl -115(%rsp), %r13d
; SSE3-NEXT: movaps %xmm1, -48(%rsp)
; SSE3-NEXT: movaps %xmm0, -64(%rsp)
; SSE3-NEXT: movzbl -114(%rsp), %ebp
; SSE3-NEXT: movaps %xmm1, -80(%rsp)
; SSE3-NEXT: movaps %xmm0, -96(%rsp)
; SSE3-NEXT: movzbl -113(%rsp), %eax
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: andl $31, %ebp
; SSE3-NEXT: movzbl -64(%rsp,%rbp), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: andl $31, %r13d
; SSE3-NEXT: movzbl -32(%rsp,%r13), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: andl $31, %r12d
; SSE3-NEXT: movzbl (%rsp,%r12), %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: andl $31, %r15d
; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: andl $31, %r14d
; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax
; SSE3-NEXT: movd %eax, %xmm7
; SSE3-NEXT: andl $31, %ebx
; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: andl $31, %r11d
; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: andl $31, %r10d
; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax
; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: andl $31, %r9d
; SSE3-NEXT: movzbl 192(%rsp,%r9), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: andl $31, %r8d
; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: andl $31, %edi
; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax
; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: andl $31, %esi
; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: andl $31, %edx
; SSE3-NEXT: movzbl 320(%rsp,%rdx), %eax
; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: andl $31, %ecx
; SSE3-NEXT: movzbl 352(%rsp,%rcx), %eax
; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
; SSE3-NEXT: popq %r14
; SSE3-NEXT: popq %r15
; SSE3-NEXT: popq %rbp
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT: movaps %xmm2, -128(%rsp)
; SSSE3-NEXT: movaps %xmm1, 400(%rsp)
; SSSE3-NEXT: movaps %xmm0, 384(%rsp)
; SSSE3-NEXT: movzbl -128(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, 368(%rsp)
; SSSE3-NEXT: movaps %xmm0, 352(%rsp)
; SSSE3-NEXT: movzbl -127(%rsp), %ecx
; SSSE3-NEXT: movaps %xmm1, 336(%rsp)
; SSSE3-NEXT: movaps %xmm0, 320(%rsp)
; SSSE3-NEXT: movzbl -126(%rsp), %edx
; SSSE3-NEXT: movaps %xmm1, 304(%rsp)
; SSSE3-NEXT: movaps %xmm0, 288(%rsp)
; SSSE3-NEXT: movzbl -125(%rsp), %esi
; SSSE3-NEXT: movaps %xmm1, 272(%rsp)
; SSSE3-NEXT: movaps %xmm0, 256(%rsp)
; SSSE3-NEXT: movzbl -124(%rsp), %edi
; SSSE3-NEXT: movaps %xmm1, 240(%rsp)
; SSSE3-NEXT: movaps %xmm0, 224(%rsp)
; SSSE3-NEXT: movzbl -123(%rsp), %r8d
; SSSE3-NEXT: movaps %xmm1, 208(%rsp)
; SSSE3-NEXT: movaps %xmm0, 192(%rsp)
; SSSE3-NEXT: movzbl -122(%rsp), %r9d
; SSSE3-NEXT: movaps %xmm1, 176(%rsp)
; SSSE3-NEXT: movaps %xmm0, 160(%rsp)
; SSSE3-NEXT: movzbl -121(%rsp), %r10d
; SSSE3-NEXT: movaps %xmm1, 144(%rsp)
; SSSE3-NEXT: movaps %xmm0, 128(%rsp)
; SSSE3-NEXT: movzbl -120(%rsp), %r11d
; SSSE3-NEXT: movaps %xmm1, 112(%rsp)
; SSSE3-NEXT: movaps %xmm0, 96(%rsp)
; SSSE3-NEXT: movzbl -119(%rsp), %ebx
; SSSE3-NEXT: movaps %xmm1, 80(%rsp)
; SSSE3-NEXT: movaps %xmm0, 64(%rsp)
; SSSE3-NEXT: movzbl -118(%rsp), %r14d
; SSSE3-NEXT: movaps %xmm1, 48(%rsp)
; SSSE3-NEXT: movaps %xmm0, 32(%rsp)
; SSSE3-NEXT: movzbl -117(%rsp), %r15d
; SSSE3-NEXT: movaps %xmm1, 16(%rsp)
; SSSE3-NEXT: movaps %xmm0, (%rsp)
; SSSE3-NEXT: movzbl -116(%rsp), %r12d
; SSSE3-NEXT: movaps %xmm1, -16(%rsp)
; SSSE3-NEXT: movaps %xmm0, -32(%rsp)
; SSSE3-NEXT: movzbl -115(%rsp), %r13d
; SSSE3-NEXT: movaps %xmm1, -48(%rsp)
; SSSE3-NEXT: movaps %xmm0, -64(%rsp)
; SSSE3-NEXT: movzbl -114(%rsp), %ebp
; SSSE3-NEXT: movaps %xmm1, -80(%rsp)
; SSSE3-NEXT: movaps %xmm0, -96(%rsp)
; SSSE3-NEXT: movzbl -113(%rsp), %eax
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: andl $31, %ebp
; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: andl $31, %r13d
; SSSE3-NEXT: movzbl -32(%rsp,%r13), %eax
; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: andl $31, %r12d
; SSSE3-NEXT: movzbl (%rsp,%r12), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: andl $31, %r15d
; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax
; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: andl $31, %r14d
; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax
; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: andl $31, %ebx
; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSSE3-NEXT: movd %eax, %xmm8
; SSSE3-NEXT: andl $31, %r11d
; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm5
; SSSE3-NEXT: andl $31, %r10d
; SSSE3-NEXT: movzbl 160(%rsp,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm9
; SSSE3-NEXT: andl $31, %r9d
; SSSE3-NEXT: movzbl 192(%rsp,%r9), %eax
; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: andl $31, %r8d
; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax
; SSSE3-NEXT: movd %eax, %xmm12
; SSSE3-NEXT: andl $31, %edi
; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm11
; SSSE3-NEXT: andl $31, %esi
; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm13
; SSSE3-NEXT: andl $31, %edx
; SSSE3-NEXT: movzbl 320(%rsp,%rdx), %eax
; SSSE3-NEXT: movd %eax, %xmm14
; SSSE3-NEXT: andl $31, %ecx
; SSSE3-NEXT: movzbl 352(%rsp,%rcx), %eax
; SSSE3-NEXT: movd %eax, %xmm15
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
; SSSE3-NEXT: popq %r14
; SSSE3-NEXT: popq %r15
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: subq $392, %rsp # imm = 0x188
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: movaps %xmm1, 368(%rsp)
; SSE41-NEXT: movaps %xmm0, 352(%rsp)
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: movaps %xmm1, 336(%rsp)
; SSE41-NEXT: movaps %xmm0, 320(%rsp)
; SSE41-NEXT: movaps %xmm1, 304(%rsp)
; SSE41-NEXT: movaps %xmm0, 288(%rsp)
; SSE41-NEXT: movaps %xmm1, 272(%rsp)
; SSE41-NEXT: movaps %xmm0, 256(%rsp)
; SSE41-NEXT: movaps %xmm1, 240(%rsp)
; SSE41-NEXT: movaps %xmm0, 224(%rsp)
; SSE41-NEXT: movaps %xmm1, 208(%rsp)
; SSE41-NEXT: movaps %xmm0, 192(%rsp)
; SSE41-NEXT: movaps %xmm1, 176(%rsp)
; SSE41-NEXT: movaps %xmm0, 160(%rsp)
; SSE41-NEXT: movaps %xmm1, 144(%rsp)
; SSE41-NEXT: movaps %xmm0, 128(%rsp)
; SSE41-NEXT: movaps %xmm1, 112(%rsp)
; SSE41-NEXT: movaps %xmm0, 96(%rsp)
; SSE41-NEXT: movaps %xmm1, 80(%rsp)
; SSE41-NEXT: movaps %xmm0, 64(%rsp)
; SSE41-NEXT: movaps %xmm1, 48(%rsp)
; SSE41-NEXT: movaps %xmm0, 32(%rsp)
; SSE41-NEXT: movaps %xmm1, 16(%rsp)
; SSE41-NEXT: movaps %xmm0, (%rsp)
; SSE41-NEXT: movaps %xmm1, -16(%rsp)
; SSE41-NEXT: movaps %xmm0, -32(%rsp)
; SSE41-NEXT: movaps %xmm1, -48(%rsp)
; SSE41-NEXT: movaps %xmm0, -64(%rsp)
; SSE41-NEXT: movaps %xmm1, -80(%rsp)
; SSE41-NEXT: movaps %xmm0, -96(%rsp)
; SSE41-NEXT: movaps %xmm1, -112(%rsp)
; SSE41-NEXT: movaps %xmm0, -128(%rsp)
; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pextrb $1, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $2, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $2, 288(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $3, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $3, 256(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $4, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $4, 224(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $5, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $5, 192(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $6, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $6, 160(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $7, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $7, 128(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $8, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $8, 96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $9, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $9, 64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $10, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $10, 32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $11, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $11, (%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $12, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $12, -32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $13, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $13, -64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $14, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $14, -96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $15, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
; SSE41-NEXT: pinsrb $15, -128(%rsp,%rax), %xmm0
; SSE41-NEXT: addq $392, %rsp # imm = 0x188
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; VLVBMI-NEXT: vzeroupper
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}
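
; The shuffle indices here are derived from the shuffled data itself:
; the <4 x i64> source is bitcast to <8 x i32> and two elements are
; pulled out as indices, so the backend must mask them and zero-extend
; back to 64-bit index elements (vpmovzxdq) before permuting.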
define void @indices_convert() {
; SSE3-LABEL: indices_convert:
; SSE3: # %bb.0: # %bb
; SSE3-NEXT: movaps (%rax), %xmm0
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: movaps %xmm0, -40(%rsp)
; SSE3-NEXT: movl (%rax), %eax
; SSE3-NEXT: movaps %xmm0, -56(%rsp)
; SSE3-NEXT: movaps %xmm0, -72(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: shll $3, %eax
; SSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: movups %xmm1, (%rax)
; SSE3-NEXT: retq
;
; SSSE3-LABEL: indices_convert:
; SSSE3: # %bb.0: # %bb
; SSSE3-NEXT: movaps (%rax), %xmm0
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSSE3-NEXT: movaps %xmm0, -40(%rsp)
; SSSE3-NEXT: movl (%rax), %eax
; SSSE3-NEXT: movaps %xmm0, -56(%rsp)
; SSSE3-NEXT: movaps %xmm0, -72(%rsp)
; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: shll $3, %eax
; SSSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movups %xmm1, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: indices_convert:
; SSE41: # %bb.0: # %bb
; SSE41-NEXT: movaps (%rax), %xmm0
; SSE41-NEXT: extractps $2, %xmm0, %eax
; SSE41-NEXT: movaps %xmm0, -24(%rsp)
; SSE41-NEXT: movaps %xmm0, -40(%rsp)
; SSE41-NEXT: andl $3, %eax
; SSE41-NEXT: extractps $3, %xmm0, %ecx
; SSE41-NEXT: movaps %xmm0, -56(%rsp)
; SSE41-NEXT: movaps %xmm0, -72(%rsp)
; SSE41-NEXT: andl $3, %ecx
; SSE41-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
; SSE41-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE41-NEXT: movups %xmm1, (%rax)
; SSE41-NEXT: retq
;
; XOP-LABEL: indices_convert:
; XOP: # %bb.0: # %bb
; XOP-NEXT: vmovdqa (%rax), %xmm0
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovupd %xmm0, (%rax)
; XOP-NEXT: retq
;
; AVX1-LABEL: indices_convert:
; AVX1: # %bb.0: # %bb
; AVX1-NEXT: vmovdqa (%rax), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovupd %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: indices_convert:
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vpbroadcastq (%rax), %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovapd (%rax), %xmm1
; AVX2-NEXT: vpermilpd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovupd %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512-LABEL: indices_convert:
; AVX512: # %bb.0: # %bb
; AVX512-NEXT: vmovdqa (%rax), %ymm0
; AVX512-NEXT: vpbroadcastq (%rax), %xmm1
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: indices_convert:
; AVX512VL: # %bb.0: # %bb
; AVX512VL-NEXT: vpbroadcastq (%rax), %xmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: vpermq (%rax), %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
bb:
  %0 = load <4 x i64>, ptr undef, align 32
  %1 = bitcast <4 x i64> %0 to <8 x i32>
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <2 x i32> <i32 2, i32 12>
  %3 = and <2 x i32> %2, <i32 7, i32 7>
  %4 = extractelement <2 x i32> %3, i32 0
  %vecext.i8.1 = extractelement <4 x i64> %0, i32 %4
  %5 = extractelement <2 x i32> %3, i32 1
  %vecext.i8.2 = extractelement <4 x i64> %0, i32 %5
  %6 = insertelement <2 x i64> poison, i64 %vecext.i8.1, i32 0
  %7 = insertelement <2 x i64> %6, i64 %vecext.i8.2, i32 1
  %8 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> %7
  store <2 x i64> %8, ptr undef, align 8
  ret void
}