1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLDQ
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLBW
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512VL,VLVBMI
14 define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
15 ; XOP-LABEL: var_shuffle_v4i64:
17 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
18 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
19 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
20 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
21 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
22 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
23 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
26 ; AVX1-LABEL: var_shuffle_v4i64:
28 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
29 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
30 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
31 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
32 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
33 ; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
34 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
35 ; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
36 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
37 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
38 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
39 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
42 ; AVX2-LABEL: var_shuffle_v4i64:
44 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
45 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
46 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
47 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
48 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
49 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
50 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
51 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
54 ; AVX512-LABEL: var_shuffle_v4i64:
56 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
57 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
58 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
59 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
62 ; AVX512VL-LABEL: var_shuffle_v4i64:
64 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
66 %index0 = extractelement <4 x i64> %indices, i32 0
67 %index1 = extractelement <4 x i64> %indices, i32 1
68 %index2 = extractelement <4 x i64> %indices, i32 2
69 %index3 = extractelement <4 x i64> %indices, i32 3
70 %v0 = extractelement <4 x i64> %v, i64 %index0
71 %v1 = extractelement <4 x i64> %v, i64 %index1
72 %v2 = extractelement <4 x i64> %v, i64 %index2
73 %v3 = extractelement <4 x i64> %v, i64 %index3
74 %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
75 %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
76 %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
77 %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
78 ret <4 x i64> %ret3
79 }

; The _zero_ variants below force out-of-range indices to -1 and zero the affected result elements with the trailing select.
81 define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
82 ; XOP-LABEL: var_shuffle_zero_v4i64:
84 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
85 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,3]
86 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm2, %xmm2
87 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm3
88 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
89 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
90 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
91 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
92 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
93 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
94 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
95 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
96 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
97 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
100 ; AVX1-LABEL: var_shuffle_zero_v4i64:
102 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
103 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
104 ; AVX1-NEXT: # xmm3 = mem[0,0]
105 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
106 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
107 ; AVX1-NEXT: # xmm4 = mem[0,0]
108 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
109 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3
110 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
111 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
112 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
113 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
114 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
115 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
116 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
117 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
118 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
119 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
120 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
121 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
122 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
123 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
124 ; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
125 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
128 ; AVX2-LABEL: var_shuffle_zero_v4i64:
130 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
131 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
132 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
133 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
134 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
135 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
136 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
137 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm3
138 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
139 ; AVX2-NEXT: vpermilpd %ymm1, %ymm4, %ymm4
140 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
141 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
142 ; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
143 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
146 ; AVX512-LABEL: var_shuffle_zero_v4i64:
148 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
149 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
150 ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
151 ; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
152 ; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
153 ; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
154 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
155 ; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
156 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
159 ; AVX512VL-LABEL: var_shuffle_zero_v4i64:
161 ; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
162 ; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
163 ; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
164 ; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
165 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
166 ; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
167 ; AVX512VL-NEXT: retq
168 %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
169 %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
170 %idx0 = extractelement <4 x i64> %or, i64 0
171 %idx1 = extractelement <4 x i64> %or, i64 1
172 %idx2 = extractelement <4 x i64> %or, i64 2
173 %idx3 = extractelement <4 x i64> %or, i64 3
174 %elt0 = extractelement <4 x i64> %v, i64 %idx0
175 %elt1 = extractelement <4 x i64> %v, i64 %idx1
176 %elt2 = extractelement <4 x i64> %v, i64 %idx2
177 %elt3 = extractelement <4 x i64> %v, i64 %idx3
178 %vec0 = insertelement <4 x i64> poison, i64 %elt0, i64 0
179 %vec1 = insertelement <4 x i64> %vec0, i64 %elt1, i64 1
180 %vec2 = insertelement <4 x i64> %vec1, i64 %elt2, i64 2
181 %vec3 = insertelement <4 x i64> %vec2, i64 %elt3, i64 3
182 %res = select <4 x i1> %cmp, <4 x i64> zeroinitializer, <4 x i64> %vec3
183 ret <4 x i64> %res
184 }
186 define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
187 ; XOP-LABEL: var_shuffle_v8i32:
189 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
190 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
191 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
194 ; AVX1-LABEL: var_shuffle_v8i32:
196 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
197 ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
198 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
199 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
200 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
201 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
202 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
203 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
204 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
207 ; INT256-LABEL: var_shuffle_v8i32:
209 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
211 %index0 = extractelement <8 x i32> %indices, i32 0
212 %index1 = extractelement <8 x i32> %indices, i32 1
213 %index2 = extractelement <8 x i32> %indices, i32 2
214 %index3 = extractelement <8 x i32> %indices, i32 3
215 %index4 = extractelement <8 x i32> %indices, i32 4
216 %index5 = extractelement <8 x i32> %indices, i32 5
217 %index6 = extractelement <8 x i32> %indices, i32 6
218 %index7 = extractelement <8 x i32> %indices, i32 7
219 %v0 = extractelement <8 x i32> %v, i32 %index0
220 %v1 = extractelement <8 x i32> %v, i32 %index1
221 %v2 = extractelement <8 x i32> %v, i32 %index2
222 %v3 = extractelement <8 x i32> %v, i32 %index3
223 %v4 = extractelement <8 x i32> %v, i32 %index4
224 %v5 = extractelement <8 x i32> %v, i32 %index5
225 %v6 = extractelement <8 x i32> %v, i32 %index6
226 %v7 = extractelement <8 x i32> %v, i32 %index7
227 %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
228 %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
229 %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
230 %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
231 %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
232 %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
233 %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
234 %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
235 ret <8 x i32> %ret7
236 }
238 define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
239 ; XOP-LABEL: var_shuffle_zero_v8i32:
241 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
242 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
243 ; XOP-NEXT: vpcomgtud %xmm3, %xmm2, %xmm2
244 ; XOP-NEXT: vpcomgtud %xmm3, %xmm1, %xmm3
245 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
246 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
247 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
248 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
249 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
250 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
253 ; AVX1-LABEL: var_shuffle_zero_v8i32:
255 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
256 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
257 ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
258 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
259 ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
260 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
261 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
262 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
263 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
264 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
265 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
266 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
267 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
268 ; AVX1-NEXT: vpermilps %ymm1, %ymm4, %ymm4
269 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
270 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
271 ; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm0, %ymm0
272 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
275 ; AVX2-LABEL: var_shuffle_zero_v8i32:
277 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
278 ; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2
279 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
280 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
281 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
282 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
285 ; AVX512-LABEL: var_shuffle_zero_v8i32:
287 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
288 ; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
289 ; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
290 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
291 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
292 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
293 ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
294 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
297 ; AVX512VL-LABEL: var_shuffle_zero_v8i32:
299 ; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
300 ; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
301 ; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
302 ; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
303 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
304 ; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
305 ; AVX512VL-NEXT: retq
306 %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
307 %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
308 %idx0 = extractelement <8 x i32> %or, i64 0
309 %idx1 = extractelement <8 x i32> %or, i64 1
310 %idx2 = extractelement <8 x i32> %or, i64 2
311 %idx3 = extractelement <8 x i32> %or, i64 3
312 %idx4 = extractelement <8 x i32> %or, i64 4
313 %idx5 = extractelement <8 x i32> %or, i64 5
314 %idx6 = extractelement <8 x i32> %or, i64 6
315 %idx7 = extractelement <8 x i32> %or, i64 7
316 %elt0 = extractelement <8 x i32> %v, i32 %idx0
317 %elt1 = extractelement <8 x i32> %v, i32 %idx1
318 %elt2 = extractelement <8 x i32> %v, i32 %idx2
319 %elt3 = extractelement <8 x i32> %v, i32 %idx3
320 %elt4 = extractelement <8 x i32> %v, i32 %idx4
321 %elt5 = extractelement <8 x i32> %v, i32 %idx5
322 %elt6 = extractelement <8 x i32> %v, i32 %idx6
323 %elt7 = extractelement <8 x i32> %v, i32 %idx7
324 %vec0 = insertelement <8 x i32> poison, i32 %elt0, i64 0
325 %vec1 = insertelement <8 x i32> %vec0, i32 %elt1, i64 1
326 %vec2 = insertelement <8 x i32> %vec1, i32 %elt2, i64 2
327 %vec3 = insertelement <8 x i32> %vec2, i32 %elt3, i64 3
328 %vec4 = insertelement <8 x i32> %vec3, i32 %elt4, i64 4
329 %vec5 = insertelement <8 x i32> %vec4, i32 %elt5, i64 5
330 %vec6 = insertelement <8 x i32> %vec5, i32 %elt6, i64 6
331 %vec7 = insertelement <8 x i32> %vec6, i32 %elt7, i64 7
332 %res = select <8 x i1> %cmp, <8 x i32> zeroinitializer, <8 x i32> %vec7
333 ret <8 x i32> %res
334 }
336 define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
337 ; XOP-LABEL: var_shuffle_v16i16:
339 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
340 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
341 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
342 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
343 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
344 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
345 ; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm1
346 ; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm0, %xmm0
347 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
350 ; AVX1-LABEL: var_shuffle_v16i16:
352 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
353 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
354 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
355 ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
356 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
357 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
358 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
359 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
360 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
361 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
362 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
363 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
364 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
365 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
366 ; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4
367 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
368 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
369 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
372 ; AVX2-LABEL: var_shuffle_v16i16:
374 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
375 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
376 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
377 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
378 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
379 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
380 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
381 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
384 ; AVX512-LABEL: var_shuffle_v16i16:
386 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
387 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
388 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
389 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
390 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
391 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
392 ; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
393 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
396 ; AVX512VLDQ-LABEL: var_shuffle_v16i16:
397 ; AVX512VLDQ: # %bb.0:
398 ; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
399 ; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
400 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
401 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
402 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
403 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
404 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
405 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0
406 ; AVX512VLDQ-NEXT: retq
408 ; AVX512VLBW-LABEL: var_shuffle_v16i16:
409 ; AVX512VLBW: # %bb.0:
410 ; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
411 ; AVX512VLBW-NEXT: retq
413 ; VLVBMI-LABEL: var_shuffle_v16i16:
415 ; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
417 %index0 = extractelement <16 x i16> %indices, i32 0
418 %index1 = extractelement <16 x i16> %indices, i32 1
419 %index2 = extractelement <16 x i16> %indices, i32 2
420 %index3 = extractelement <16 x i16> %indices, i32 3
421 %index4 = extractelement <16 x i16> %indices, i32 4
422 %index5 = extractelement <16 x i16> %indices, i32 5
423 %index6 = extractelement <16 x i16> %indices, i32 6
424 %index7 = extractelement <16 x i16> %indices, i32 7
425 %index8 = extractelement <16 x i16> %indices, i32 8
426 %index9 = extractelement <16 x i16> %indices, i32 9
427 %index10 = extractelement <16 x i16> %indices, i32 10
428 %index11 = extractelement <16 x i16> %indices, i32 11
429 %index12 = extractelement <16 x i16> %indices, i32 12
430 %index13 = extractelement <16 x i16> %indices, i32 13
431 %index14 = extractelement <16 x i16> %indices, i32 14
432 %index15 = extractelement <16 x i16> %indices, i32 15
433 %v0 = extractelement <16 x i16> %v, i16 %index0
434 %v1 = extractelement <16 x i16> %v, i16 %index1
435 %v2 = extractelement <16 x i16> %v, i16 %index2
436 %v3 = extractelement <16 x i16> %v, i16 %index3
437 %v4 = extractelement <16 x i16> %v, i16 %index4
438 %v5 = extractelement <16 x i16> %v, i16 %index5
439 %v6 = extractelement <16 x i16> %v, i16 %index6
440 %v7 = extractelement <16 x i16> %v, i16 %index7
441 %v8 = extractelement <16 x i16> %v, i16 %index8
442 %v9 = extractelement <16 x i16> %v, i16 %index9
443 %v10 = extractelement <16 x i16> %v, i16 %index10
444 %v11 = extractelement <16 x i16> %v, i16 %index11
445 %v12 = extractelement <16 x i16> %v, i16 %index12
446 %v13 = extractelement <16 x i16> %v, i16 %index13
447 %v14 = extractelement <16 x i16> %v, i16 %index14
448 %v15 = extractelement <16 x i16> %v, i16 %index15
449 %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
450 %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
451 %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
452 %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
453 %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
454 %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
455 %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
456 %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
457 %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
458 %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
459 %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
460 %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
461 %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
462 %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
463 %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
464 %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
465 ret <16 x i16> %ret15
466 }
468 define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
469 ; XOP-LABEL: var_shuffle_zero_v16i16:
471 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
472 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
473 ; XOP-NEXT: vpcomgtuw %xmm3, %xmm2, %xmm2
474 ; XOP-NEXT: vpcomgtuw %xmm3, %xmm1, %xmm3
475 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
476 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
477 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [256,256,256,256,256,256,256,256]
478 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm4 = [514,514,514,514,514,514,514,514]
479 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm5
480 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
481 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
482 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
483 ; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm1
484 ; XOP-NEXT: vpperm %xmm5, %xmm3, %xmm0, %xmm0
485 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
486 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
489 ; AVX1-LABEL: var_shuffle_zero_v16i16:
491 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
492 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
493 ; AVX1-NEXT: vpmaxuw %xmm3, %xmm2, %xmm4
494 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
495 ; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm3
496 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3
497 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
498 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
499 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
500 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4
501 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [256,256,256,256,256,256,256,256]
502 ; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
503 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
504 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
505 ; AVX1-NEXT: vpaddw %xmm5, %xmm1, %xmm1
506 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
507 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5
508 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
509 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm7
510 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
511 ; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm1, %xmm1
512 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm3
513 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm5
514 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
515 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
516 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
517 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
520 ; AVX2-LABEL: var_shuffle_zero_v16i16:
522 ; AVX2-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
523 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm2
524 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
525 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
526 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
527 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
528 ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
529 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
530 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
531 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
532 ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
533 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
536 ; AVX512VLDQ-LABEL: var_shuffle_zero_v16i16:
537 ; AVX512VLDQ: # %bb.0:
538 ; AVX512VLDQ-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
539 ; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm2
540 ; AVX512VLDQ-NEXT: vpor %ymm1, %ymm2, %ymm1
541 ; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
542 ; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
543 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
544 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
545 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
546 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
547 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
548 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm0, %ymm1
549 ; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0
550 ; AVX512VLDQ-NEXT: retq
552 ; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
553 ; AVX512VLBW: # %bb.0:
554 ; AVX512VLBW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
555 ; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
556 ; AVX512VLBW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
557 ; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
558 ; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
559 ; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
560 ; AVX512VLBW-NEXT: retq
562 ; VLVBMI-LABEL: var_shuffle_zero_v16i16:
564 ; VLVBMI-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
565 ; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
566 ; VLVBMI-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
567 ; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
568 ; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
569 ; VLVBMI-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
571 %cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
572 %or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
573 %idx0 = extractelement <16 x i16> %or, i64 0
574 %idx1 = extractelement <16 x i16> %or, i64 1
575 %idx2 = extractelement <16 x i16> %or, i64 2
576 %idx3 = extractelement <16 x i16> %or, i64 3
577 %idx4 = extractelement <16 x i16> %or, i64 4
578 %idx5 = extractelement <16 x i16> %or, i64 5
579 %idx6 = extractelement <16 x i16> %or, i64 6
580 %idx7 = extractelement <16 x i16> %or, i64 7
581 %idx8 = extractelement <16 x i16> %or, i64 8
582 %idx9 = extractelement <16 x i16> %or, i64 9
583 %idxA = extractelement <16 x i16> %or, i64 10
584 %idxB = extractelement <16 x i16> %or, i64 11
585 %idxC = extractelement <16 x i16> %or, i64 12
586 %idxD = extractelement <16 x i16> %or, i64 13
587 %idxE = extractelement <16 x i16> %or, i64 14
588 %idxF = extractelement <16 x i16> %or, i64 15
589 %elt0 = extractelement <16 x i16> %v, i16 %idx0
590 %elt1 = extractelement <16 x i16> %v, i16 %idx1
591 %elt2 = extractelement <16 x i16> %v, i16 %idx2
592 %elt3 = extractelement <16 x i16> %v, i16 %idx3
593 %elt4 = extractelement <16 x i16> %v, i16 %idx4
594 %elt5 = extractelement <16 x i16> %v, i16 %idx5
595 %elt6 = extractelement <16 x i16> %v, i16 %idx6
596 %elt7 = extractelement <16 x i16> %v, i16 %idx7
597 %elt8 = extractelement <16 x i16> %v, i16 %idx8
598 %elt9 = extractelement <16 x i16> %v, i16 %idx9
599 %eltA = extractelement <16 x i16> %v, i16 %idxA
600 %eltB = extractelement <16 x i16> %v, i16 %idxB
601 %eltC = extractelement <16 x i16> %v, i16 %idxC
602 %eltD = extractelement <16 x i16> %v, i16 %idxD
603 %eltE = extractelement <16 x i16> %v, i16 %idxE
604 %eltF = extractelement <16 x i16> %v, i16 %idxF
605 %vec0 = insertelement <16 x i16> poison, i16 %elt0, i64 0
606 %vec1 = insertelement <16 x i16> %vec0, i16 %elt1, i64 1
607 %vec2 = insertelement <16 x i16> %vec1, i16 %elt2, i64 2
608 %vec3 = insertelement <16 x i16> %vec2, i16 %elt3, i64 3
609 %vec4 = insertelement <16 x i16> %vec3, i16 %elt4, i64 4
610 %vec5 = insertelement <16 x i16> %vec4, i16 %elt5, i64 5
611 %vec6 = insertelement <16 x i16> %vec5, i16 %elt6, i64 6
612 %vec7 = insertelement <16 x i16> %vec6, i16 %elt7, i64 7
613 %vec8 = insertelement <16 x i16> %vec7, i16 %elt8, i64 8
614 %vec9 = insertelement <16 x i16> %vec8, i16 %elt9, i64 9
615 %vecA = insertelement <16 x i16> %vec9, i16 %eltA, i64 10
616 %vecB = insertelement <16 x i16> %vecA, i16 %eltB, i64 11
617 %vecC = insertelement <16 x i16> %vecB, i16 %eltC, i64 12
618 %vecD = insertelement <16 x i16> %vecC, i16 %eltD, i64 13
619 %vecE = insertelement <16 x i16> %vecD, i16 %eltE, i64 14
620 %vecF = insertelement <16 x i16> %vecE, i16 %eltF, i64 15
621 %res = select <16 x i1> %cmp, <16 x i16> zeroinitializer, <16 x i16> %vecF
622 ret <16 x i16> %res
623 }
625 define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
626 ; XOP-LABEL: var_shuffle_v32i8:
628 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
629 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
630 ; XOP-NEXT: vpperm %xmm2, %xmm3, %xmm0, %xmm2
631 ; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm0
632 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
635 ; AVX1-LABEL: var_shuffle_v32i8:
637 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
638 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
639 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
640 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
641 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
642 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
643 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
644 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
645 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm4
646 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
647 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
648 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
651 ; AVX2-LABEL: var_shuffle_v32i8:
653 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
654 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
655 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
656 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
657 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
658 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
661 ; AVX512-LABEL: var_shuffle_v32i8:
663 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
664 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
665 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
666 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
667 ; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
668 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
671 ; AVX512VLDQ-LABEL: var_shuffle_v32i8:
672 ; AVX512VLDQ: # %bb.0:
673 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
674 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
675 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
676 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
677 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
678 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0
679 ; AVX512VLDQ-NEXT: retq
681 ; AVX512VLBW-LABEL: var_shuffle_v32i8:
682 ; AVX512VLBW: # %bb.0:
683 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
684 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
685 ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
686 ; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
687 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1}
688 ; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
689 ; AVX512VLBW-NEXT: retq
691 ; VLVBMI-LABEL: var_shuffle_v32i8:
693 ; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
695 %index0 = extractelement <32 x i8> %indices, i32 0
696 %index1 = extractelement <32 x i8> %indices, i32 1
697 %index2 = extractelement <32 x i8> %indices, i32 2
698 %index3 = extractelement <32 x i8> %indices, i32 3
699 %index4 = extractelement <32 x i8> %indices, i32 4
700 %index5 = extractelement <32 x i8> %indices, i32 5
701 %index6 = extractelement <32 x i8> %indices, i32 6
702 %index7 = extractelement <32 x i8> %indices, i32 7
703 %index8 = extractelement <32 x i8> %indices, i32 8
704 %index9 = extractelement <32 x i8> %indices, i32 9
705 %index10 = extractelement <32 x i8> %indices, i32 10
706 %index11 = extractelement <32 x i8> %indices, i32 11
707 %index12 = extractelement <32 x i8> %indices, i32 12
708 %index13 = extractelement <32 x i8> %indices, i32 13
709 %index14 = extractelement <32 x i8> %indices, i32 14
710 %index15 = extractelement <32 x i8> %indices, i32 15
711 %index16 = extractelement <32 x i8> %indices, i32 16
712 %index17 = extractelement <32 x i8> %indices, i32 17
713 %index18 = extractelement <32 x i8> %indices, i32 18
714 %index19 = extractelement <32 x i8> %indices, i32 19
715 %index20 = extractelement <32 x i8> %indices, i32 20
716 %index21 = extractelement <32 x i8> %indices, i32 21
717 %index22 = extractelement <32 x i8> %indices, i32 22
718 %index23 = extractelement <32 x i8> %indices, i32 23
719 %index24 = extractelement <32 x i8> %indices, i32 24
720 %index25 = extractelement <32 x i8> %indices, i32 25
721 %index26 = extractelement <32 x i8> %indices, i32 26
722 %index27 = extractelement <32 x i8> %indices, i32 27
723 %index28 = extractelement <32 x i8> %indices, i32 28
724 %index29 = extractelement <32 x i8> %indices, i32 29
725 %index30 = extractelement <32 x i8> %indices, i32 30
726 %index31 = extractelement <32 x i8> %indices, i32 31
727 %v0 = extractelement <32 x i8> %v, i8 %index0
728 %v1 = extractelement <32 x i8> %v, i8 %index1
729 %v2 = extractelement <32 x i8> %v, i8 %index2
730 %v3 = extractelement <32 x i8> %v, i8 %index3
731 %v4 = extractelement <32 x i8> %v, i8 %index4
732 %v5 = extractelement <32 x i8> %v, i8 %index5
733 %v6 = extractelement <32 x i8> %v, i8 %index6
734 %v7 = extractelement <32 x i8> %v, i8 %index7
735 %v8 = extractelement <32 x i8> %v, i8 %index8
736 %v9 = extractelement <32 x i8> %v, i8 %index9
737 %v10 = extractelement <32 x i8> %v, i8 %index10
738 %v11 = extractelement <32 x i8> %v, i8 %index11
739 %v12 = extractelement <32 x i8> %v, i8 %index12
740 %v13 = extractelement <32 x i8> %v, i8 %index13
741 %v14 = extractelement <32 x i8> %v, i8 %index14
742 %v15 = extractelement <32 x i8> %v, i8 %index15
743 %v16 = extractelement <32 x i8> %v, i8 %index16
744 %v17 = extractelement <32 x i8> %v, i8 %index17
745 %v18 = extractelement <32 x i8> %v, i8 %index18
746 %v19 = extractelement <32 x i8> %v, i8 %index19
747 %v20 = extractelement <32 x i8> %v, i8 %index20
748 %v21 = extractelement <32 x i8> %v, i8 %index21
749 %v22 = extractelement <32 x i8> %v, i8 %index22
750 %v23 = extractelement <32 x i8> %v, i8 %index23
751 %v24 = extractelement <32 x i8> %v, i8 %index24
752 %v25 = extractelement <32 x i8> %v, i8 %index25
753 %v26 = extractelement <32 x i8> %v, i8 %index26
754 %v27 = extractelement <32 x i8> %v, i8 %index27
755 %v28 = extractelement <32 x i8> %v, i8 %index28
756 %v29 = extractelement <32 x i8> %v, i8 %index29
757 %v30 = extractelement <32 x i8> %v, i8 %index30
758 %v31 = extractelement <32 x i8> %v, i8 %index31
759 %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
760 %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
761 %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
762 %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
763 %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
764 %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
765 %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
766 %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
767 %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
768 %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
769 %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
770 %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
771 %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
772 %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
773 %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
774 %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
775 %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
776 %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
777 %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
778 %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
779 %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
780 %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
781 %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
782 %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
783 %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
784 %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
785 %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
786 %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
787 %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
788 %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
789 %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
790 %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
791 ret <32 x i8> %ret31
792 }
794 define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
795 ; XOP-LABEL: var_shuffle_zero_v32i8:
797 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
798 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
799 ; XOP-NEXT: vpcomgtub %xmm3, %xmm2, %xmm2
800 ; XOP-NEXT: vpcomgtub %xmm3, %xmm1, %xmm3
801 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
802 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
803 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm3
804 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm4
805 ; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm0, %xmm3
806 ; XOP-NEXT: vpperm %xmm1, %xmm4, %xmm0, %xmm0
807 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
808 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
811 ; AVX1-LABEL: var_shuffle_zero_v32i8:
813 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
814 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
815 ; AVX1-NEXT: vpmaxub %xmm3, %xmm2, %xmm4
816 ; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
817 ; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm3
818 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3
819 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
820 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
821 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
822 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
823 ; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm5
824 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
825 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm7
826 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3
827 ; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm3, %xmm3
828 ; AVX1-NEXT: vpcmpgtb %xmm4, %xmm1, %xmm4
829 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm5
830 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
831 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm0, %xmm0
832 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
833 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
836 ; AVX2-LABEL: var_shuffle_zero_v32i8:
838 ; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
839 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm2
840 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
841 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
842 ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
843 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
844 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
845 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
846 ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
847 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
850 ; AVX512VLDQ-LABEL: var_shuffle_zero_v32i8:
851 ; AVX512VLDQ: # %bb.0:
852 ; AVX512VLDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
853 ; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm2
854 ; AVX512VLDQ-NEXT: vpor %ymm1, %ymm2, %ymm1
855 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
856 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
857 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
858 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
859 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
860 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm0, %ymm1
861 ; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0
862 ; AVX512VLDQ-NEXT: retq
864 ; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
865 ; AVX512VLBW: # %bb.0:
866 ; AVX512VLBW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
867 ; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
868 ; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
869 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
870 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
871 ; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
872 ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
873 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k2}
874 ; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
875 ; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
876 ; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
877 ; AVX512VLBW-NEXT: retq
879 ; VLVBMI-LABEL: var_shuffle_zero_v32i8:
881 ; VLVBMI-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
882 ; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
883 ; VLVBMI-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
884 ; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
885 ; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
886 ; VLVBMI-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
888 %cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
889 %or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
890 %idx00 = extractelement <32 x i8> %or, i64 0
891 %idx01 = extractelement <32 x i8> %or, i64 1
892 %idx02 = extractelement <32 x i8> %or, i64 2
893 %idx03 = extractelement <32 x i8> %or, i64 3
894 %idx04 = extractelement <32 x i8> %or, i64 4
895 %idx05 = extractelement <32 x i8> %or, i64 5
896 %idx06 = extractelement <32 x i8> %or, i64 6
897 %idx07 = extractelement <32 x i8> %or, i64 7
898 %idx08 = extractelement <32 x i8> %or, i64 8
899 %idx09 = extractelement <32 x i8> %or, i64 9
900 %idx0A = extractelement <32 x i8> %or, i64 10
901 %idx0B = extractelement <32 x i8> %or, i64 11
902 %idx0C = extractelement <32 x i8> %or, i64 12
903 %idx0D = extractelement <32 x i8> %or, i64 13
904 %idx0E = extractelement <32 x i8> %or, i64 14
905 %idx0F = extractelement <32 x i8> %or, i64 15
906 %idx10 = extractelement <32 x i8> %or, i64 16
907 %idx11 = extractelement <32 x i8> %or, i64 17
908 %idx12 = extractelement <32 x i8> %or, i64 18
909 %idx13 = extractelement <32 x i8> %or, i64 19
910 %idx14 = extractelement <32 x i8> %or, i64 20
911 %idx15 = extractelement <32 x i8> %or, i64 21
912 %idx16 = extractelement <32 x i8> %or, i64 22
913 %idx17 = extractelement <32 x i8> %or, i64 23
914 %idx18 = extractelement <32 x i8> %or, i64 24
915 %idx19 = extractelement <32 x i8> %or, i64 25
916 %idx1A = extractelement <32 x i8> %or, i64 26
917 %idx1B = extractelement <32 x i8> %or, i64 27
918 %idx1C = extractelement <32 x i8> %or, i64 28
919 %idx1D = extractelement <32 x i8> %or, i64 29
920 %idx1E = extractelement <32 x i8> %or, i64 30
921 %idx1F = extractelement <32 x i8> %or, i64 31
922 %elt00 = extractelement <32 x i8> %v, i8 %idx00
923 %elt01 = extractelement <32 x i8> %v, i8 %idx01
924 %elt02 = extractelement <32 x i8> %v, i8 %idx02
925 %elt03 = extractelement <32 x i8> %v, i8 %idx03
926 %elt04 = extractelement <32 x i8> %v, i8 %idx04
927 %elt05 = extractelement <32 x i8> %v, i8 %idx05
928 %elt06 = extractelement <32 x i8> %v, i8 %idx06
929 %elt07 = extractelement <32 x i8> %v, i8 %idx07
930 %elt08 = extractelement <32 x i8> %v, i8 %idx08
931 %elt09 = extractelement <32 x i8> %v, i8 %idx09
932 %elt0A = extractelement <32 x i8> %v, i8 %idx0A
933 %elt0B = extractelement <32 x i8> %v, i8 %idx0B
934 %elt0C = extractelement <32 x i8> %v, i8 %idx0C
935 %elt0D = extractelement <32 x i8> %v, i8 %idx0D
936 %elt0E = extractelement <32 x i8> %v, i8 %idx0E
937 %elt0F = extractelement <32 x i8> %v, i8 %idx0F
938 %elt10 = extractelement <32 x i8> %v, i8 %idx10
939 %elt11 = extractelement <32 x i8> %v, i8 %idx11
940 %elt12 = extractelement <32 x i8> %v, i8 %idx12
941 %elt13 = extractelement <32 x i8> %v, i8 %idx13
942 %elt14 = extractelement <32 x i8> %v, i8 %idx14
943 %elt15 = extractelement <32 x i8> %v, i8 %idx15
944 %elt16 = extractelement <32 x i8> %v, i8 %idx16
945 %elt17 = extractelement <32 x i8> %v, i8 %idx17
946 %elt18 = extractelement <32 x i8> %v, i8 %idx18
947 %elt19 = extractelement <32 x i8> %v, i8 %idx19
948 %elt1A = extractelement <32 x i8> %v, i8 %idx1A
949 %elt1B = extractelement <32 x i8> %v, i8 %idx1B
950 %elt1C = extractelement <32 x i8> %v, i8 %idx1C
951 %elt1D = extractelement <32 x i8> %v, i8 %idx1D
952 %elt1E = extractelement <32 x i8> %v, i8 %idx1E
953 %elt1F = extractelement <32 x i8> %v, i8 %idx1F
954 %vec00 = insertelement <32 x i8> poison, i8 %elt00, i64 0
955 %vec01 = insertelement <32 x i8> %vec00, i8 %elt01, i64 1
956 %vec02 = insertelement <32 x i8> %vec01, i8 %elt02, i64 2
957 %vec03 = insertelement <32 x i8> %vec02, i8 %elt03, i64 3
958 %vec04 = insertelement <32 x i8> %vec03, i8 %elt04, i64 4
959 %vec05 = insertelement <32 x i8> %vec04, i8 %elt05, i64 5
960 %vec06 = insertelement <32 x i8> %vec05, i8 %elt06, i64 6
961 %vec07 = insertelement <32 x i8> %vec06, i8 %elt07, i64 7
962 %vec08 = insertelement <32 x i8> %vec07, i8 %elt08, i64 8
963 %vec09 = insertelement <32 x i8> %vec08, i8 %elt09, i64 9
964 %vec0A = insertelement <32 x i8> %vec09, i8 %elt0A, i64 10
965 %vec0B = insertelement <32 x i8> %vec0A, i8 %elt0B, i64 11
966 %vec0C = insertelement <32 x i8> %vec0B, i8 %elt0C, i64 12
967 %vec0D = insertelement <32 x i8> %vec0C, i8 %elt0D, i64 13
968 %vec0E = insertelement <32 x i8> %vec0D, i8 %elt0E, i64 14
969 %vec0F = insertelement <32 x i8> %vec0E, i8 %elt0F, i64 15
970 %vec10 = insertelement <32 x i8> %vec0F, i8 %elt10, i64 16
971 %vec11 = insertelement <32 x i8> %vec10, i8 %elt11, i64 17
972 %vec12 = insertelement <32 x i8> %vec11, i8 %elt12, i64 18
973 %vec13 = insertelement <32 x i8> %vec12, i8 %elt13, i64 19
974 %vec14 = insertelement <32 x i8> %vec13, i8 %elt14, i64 20
975 %vec15 = insertelement <32 x i8> %vec14, i8 %elt15, i64 21
976 %vec16 = insertelement <32 x i8> %vec15, i8 %elt16, i64 22
977 %vec17 = insertelement <32 x i8> %vec16, i8 %elt17, i64 23
978 %vec18 = insertelement <32 x i8> %vec17, i8 %elt18, i64 24
979 %vec19 = insertelement <32 x i8> %vec18, i8 %elt19, i64 25
980 %vec1A = insertelement <32 x i8> %vec19, i8 %elt1A, i64 26
981 %vec1B = insertelement <32 x i8> %vec1A, i8 %elt1B, i64 27
982 %vec1C = insertelement <32 x i8> %vec1B, i8 %elt1C, i64 28
983 %vec1D = insertelement <32 x i8> %vec1C, i8 %elt1D, i64 29
984 %vec1E = insertelement <32 x i8> %vec1D, i8 %elt1E, i64 30
985 %vec1F = insertelement <32 x i8> %vec1E, i8 %elt1F, i64 31
986 %res = select <32 x i1> %cmp, <32 x i8> zeroinitializer, <32 x i8> %vec1F
987 ret <32 x i8> %res
988 }
990 define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
991 ; XOP-LABEL: var_shuffle_v4f64:
993 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
994 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
995 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
996 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
997 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
998 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
999 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
1002 ; AVX1-LABEL: var_shuffle_v4f64:
1004 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1005 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
1006 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1007 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1008 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
1009 ; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
1010 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1011 ; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
1012 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
1013 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1014 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1015 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
1018 ; AVX2-LABEL: var_shuffle_v4f64:
1020 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
1021 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
1022 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1023 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
1024 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1025 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1026 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1027 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
1030 ; AVX512-LABEL: var_shuffle_v4f64:
1032 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1033 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1034 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1035 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1038 ; AVX512VL-LABEL: var_shuffle_v4f64:
1039 ; AVX512VL: # %bb.0:
1040 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
1041 ; AVX512VL-NEXT: retq
1042 %index0 = extractelement <4 x i64> %indices, i32 0
1043 %index1 = extractelement <4 x i64> %indices, i32 1
1044 %index2 = extractelement <4 x i64> %indices, i32 2
1045 %index3 = extractelement <4 x i64> %indices, i32 3
1046 %v0 = extractelement <4 x double> %v, i64 %index0
1047 %v1 = extractelement <4 x double> %v, i64 %index1
1048 %v2 = extractelement <4 x double> %v, i64 %index2
1049 %v3 = extractelement <4 x double> %v, i64 %index3
1050 %ret0 = insertelement <4 x double> undef, double %v0, i32 0
1051 %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
1052 %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
1053 %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
1054 ret <4 x double> %ret3
1055 }
1057 define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
1058 ; XOP-LABEL: var_shuffle_zero_v4f64:
1060 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1061 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,3]
1062 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm2, %xmm2
1063 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm3
1064 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1065 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
1066 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
1067 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
1068 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1069 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1070 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1071 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1072 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
1073 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
1076 ; AVX1-LABEL: var_shuffle_zero_v4f64:
1078 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1079 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
1080 ; AVX1-NEXT: # xmm3 = mem[0,0]
1081 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
1082 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
1083 ; AVX1-NEXT: # xmm4 = mem[0,0]
1084 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
1085 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3
1086 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
1087 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1088 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
1089 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
1090 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
1091 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1092 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1093 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
1094 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
1095 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1096 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1097 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1098 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1099 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1100 ; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
1101 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
1104 ; AVX2-LABEL: var_shuffle_zero_v4f64:
1106 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1107 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
1108 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
1109 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
1110 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
1111 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
1112 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
1113 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm3
1114 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
1115 ; AVX2-NEXT: vpermilpd %ymm1, %ymm4, %ymm4
1116 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1117 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1118 ; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
1119 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
1122 ; AVX512-LABEL: var_shuffle_zero_v4f64:
1124 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1125 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1126 ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
1127 ; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
1128 ; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
1129 ; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1130 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
1131 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
1132 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1135 ; AVX512VL-LABEL: var_shuffle_zero_v4f64:
1136 ; AVX512VL: # %bb.0:
1137 ; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
1138 ; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1139 ; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
1140 ; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
1141 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
1142 ; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
1143 ; AVX512VL-NEXT: retq
1144 %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
1145 %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
1146 %idx0 = extractelement <4 x i64> %or, i64 0
1147 %idx1 = extractelement <4 x i64> %or, i64 1
1148 %idx2 = extractelement <4 x i64> %or, i64 2
1149 %idx3 = extractelement <4 x i64> %or, i64 3
1150 %elt0 = extractelement <4 x double> %v, i64 %idx0
1151 %elt1 = extractelement <4 x double> %v, i64 %idx1
1152 %elt2 = extractelement <4 x double> %v, i64 %idx2
1153 %elt3 = extractelement <4 x double> %v, i64 %idx3
1154 %vec0 = insertelement <4 x double> poison, double %elt0, i64 0
1155 %vec1 = insertelement <4 x double> %vec0, double %elt1, i64 1
1156 %vec2 = insertelement <4 x double> %vec1, double %elt2, i64 2
1157 %vec3 = insertelement <4 x double> %vec2, double %elt3, i64 3
1158 %res = select <4 x i1> %cmp, <4 x double> zeroinitializer, <4 x double> %vec3
1159 ret <4 x double> %res
1160 }
1162 define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
1163 ; XOP-LABEL: var_shuffle_v8f32:
1165 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1166 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1167 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
1170 ; AVX1-LABEL: var_shuffle_v8f32:
1172 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1173 ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
1174 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1175 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1176 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
1177 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1178 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1179 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1180 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
1183 ; INT256-LABEL: var_shuffle_v8f32:
1185 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
1187 %index0 = extractelement <8 x i32> %indices, i32 0
1188 %index1 = extractelement <8 x i32> %indices, i32 1
1189 %index2 = extractelement <8 x i32> %indices, i32 2
1190 %index3 = extractelement <8 x i32> %indices, i32 3
1191 %index4 = extractelement <8 x i32> %indices, i32 4
1192 %index5 = extractelement <8 x i32> %indices, i32 5
1193 %index6 = extractelement <8 x i32> %indices, i32 6
1194 %index7 = extractelement <8 x i32> %indices, i32 7
1195 %v0 = extractelement <8 x float> %v, i32 %index0
1196 %v1 = extractelement <8 x float> %v, i32 %index1
1197 %v2 = extractelement <8 x float> %v, i32 %index2
1198 %v3 = extractelement <8 x float> %v, i32 %index3
1199 %v4 = extractelement <8 x float> %v, i32 %index4
1200 %v5 = extractelement <8 x float> %v, i32 %index5
1201 %v6 = extractelement <8 x float> %v, i32 %index6
1202 %v7 = extractelement <8 x float> %v, i32 %index7
1203 %ret0 = insertelement <8 x float> undef, float %v0, i32 0
1204 %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
1205 %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
1206 %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
1207 %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
1208 %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
1209 %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
1210 %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
1211 ret <8 x float> %ret7
1212 }
1214 define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
1215 ; XOP-LABEL: var_shuffle_zero_v8f32:
1217 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1218 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
1219 ; XOP-NEXT: vpcomgtud %xmm3, %xmm2, %xmm2
1220 ; XOP-NEXT: vpcomgtud %xmm3, %xmm1, %xmm3
1221 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1222 ; XOP-NEXT: vorps %ymm1, %ymm2, %ymm1
1223 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1224 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1225 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
1226 ; XOP-NEXT: vandnps %ymm0, %ymm2, %ymm0
1229 ; AVX1-LABEL: var_shuffle_zero_v8f32:
1231 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1232 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
1233 ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
1234 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
1235 ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
1236 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
1237 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1238 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
1239 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
1240 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1241 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
1242 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1243 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
1244 ; AVX1-NEXT: vpermilps %ymm1, %ymm4, %ymm4
1245 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1246 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1247 ; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm0, %ymm0
1248 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
1251 ; AVX2-LABEL: var_shuffle_zero_v8f32:
1253 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
1254 ; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2
1255 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
1256 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
1257 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
1258 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
1261 ; AVX512-LABEL: var_shuffle_zero_v8f32:
1263 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1264 ; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
1265 ; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1266 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
1267 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
1268 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1269 ; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
1270 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1273 ; AVX512VL-LABEL: var_shuffle_zero_v8f32:
1274 ; AVX512VL: # %bb.0:
1275 ; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
1276 ; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1277 ; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
1278 ; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
1279 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
1280 ; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
1281 ; AVX512VL-NEXT: retq
1282 %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
1283 %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
1284 %idx0 = extractelement <8 x i32> %or, i64 0
1285 %idx1 = extractelement <8 x i32> %or, i64 1
1286 %idx2 = extractelement <8 x i32> %or, i64 2
1287 %idx3 = extractelement <8 x i32> %or, i64 3
1288 %idx4 = extractelement <8 x i32> %or, i64 4
1289 %idx5 = extractelement <8 x i32> %or, i64 5
1290 %idx6 = extractelement <8 x i32> %or, i64 6
1291 %idx7 = extractelement <8 x i32> %or, i64 7
1292 %elt0 = extractelement <8 x float> %v, i32 %idx0
1293 %elt1 = extractelement <8 x float> %v, i32 %idx1
1294 %elt2 = extractelement <8 x float> %v, i32 %idx2
1295 %elt3 = extractelement <8 x float> %v, i32 %idx3
1296 %elt4 = extractelement <8 x float> %v, i32 %idx4
1297 %elt5 = extractelement <8 x float> %v, i32 %idx5
1298 %elt6 = extractelement <8 x float> %v, i32 %idx6
1299 %elt7 = extractelement <8 x float> %v, i32 %idx7
1300 %vec0 = insertelement <8 x float> poison, float %elt0, i64 0
1301 %vec1 = insertelement <8 x float> %vec0, float %elt1, i64 1
1302 %vec2 = insertelement <8 x float> %vec1, float %elt2, i64 2
1303 %vec3 = insertelement <8 x float> %vec2, float %elt3, i64 3
1304 %vec4 = insertelement <8 x float> %vec3, float %elt4, i64 4
1305 %vec5 = insertelement <8 x float> %vec4, float %elt5, i64 5
1306 %vec6 = insertelement <8 x float> %vec5, float %elt6, i64 6
1307 %vec7 = insertelement <8 x float> %vec6, float %elt7, i64 7
1308 %res = select <8 x i1> %cmp, <8 x float> zeroinitializer, <8 x float> %vec7
1309 ret <8 x float> %res
1310 }
1313 ; PR35820 - Unequal source/destination vector sizes
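; Note: since the source vector here is only 128 bits wide, most of the lowerings below first splat it into both halves of a 256-bit register (e.g. vinsertf128) and then reuse the lane-local variable-permute patterns.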
1316 define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
1317 ; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
1319 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1320 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1321 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
1322 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
1323 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1324 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1325 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
1328 ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
1330 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1331 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1332 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
1333 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1334 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1335 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
1336 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
1337 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1338 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1339 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1340 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
1341 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
1344 ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
1346 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1347 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
1348 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
1349 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1350 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
1351 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1352 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1353 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
1356 ; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
1358 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1359 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1360 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1361 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1364 ; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
1365 ; AVX512VL: # %bb.0:
1366 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1367 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
1368 ; AVX512VL-NEXT: retq
1369 %index0 = extractelement <4 x i64> %indices, i32 0
1370 %index1 = extractelement <4 x i64> %indices, i32 1
1371 %index2 = extractelement <4 x i64> %indices, i32 2
1372 %index3 = extractelement <4 x i64> %indices, i32 3
1373 %v0 = extractelement <2 x i64> %v, i64 %index0
1374 %v1 = extractelement <2 x i64> %v, i64 %index1
1375 %v2 = extractelement <2 x i64> %v, i64 %index2
1376 %v3 = extractelement <2 x i64> %v, i64 %index3
1377 %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
1378 %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
1379 %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
1380 %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
1381 ret <4 x i64> %ret3
1382 }
1384 define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
1385 ; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
1386 ; XOP: # %bb.0: # %entry
1387 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1388 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1389 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
1392 ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
1393 ; AVX1: # %bb.0: # %entry
1394 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1395 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
1396 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1397 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1398 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
1399 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1400 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1401 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1402 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
1405 ; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
1406 ; INT256: # %bb.0: # %entry
1407 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1408 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
1410 entry:
1411 %tmp1 = extractelement <8 x i32> %indices, i32 0
1412 %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
1413 %tmp2 = extractelement <8 x i32> %indices, i32 1
1414 %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
1415 %tmp3 = extractelement <8 x i32> %indices, i32 2
1416 %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
1417 %tmp4 = extractelement <8 x i32> %indices, i32 3
1418 %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
1419 %tmp5 = extractelement <8 x i32> %indices, i32 4
1420 %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
1421 %tmp6 = extractelement <8 x i32> %indices, i32 5
1422 %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
1423 %tmp7 = extractelement <8 x i32> %indices, i32 6
1424 %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
1425 %tmp8 = extractelement <8 x i32> %indices, i32 7
1426 %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
1427 %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
1428 %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
1429 %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
1430 %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
1431 %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
1432 %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
1433 %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
1434 %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
1435 ret <8 x i32> %tmp16
1436 }
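; Without vpermw, each i16 index is rewritten into a pair of byte indices (index*514 + 256, i.e. 2*i and 2*i+1) so the shuffle can be performed with vpshufb/vpperm.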
1438 define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
1439 ; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
1441 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
1442 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
1443 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
1444 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
1445 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
1446 ; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm1
1447 ; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm0, %xmm0
1448 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1451 ; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
1453 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
1454 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
1455 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
1456 ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
1457 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1458 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1459 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
1460 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1461 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
1462 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
1463 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
1464 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
1465 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
1466 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4
1467 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1468 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
1469 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1472 ; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
1474 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1475 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
1476 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1477 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1478 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1479 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1480 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1481 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
1484 ; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
1486 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1487 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
1488 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1489 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1490 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1491 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1492 ; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1493 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
1496 ; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
1497 ; AVX512VLDQ: # %bb.0:
1498 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1499 ; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
1500 ; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1501 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1502 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1503 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
1504 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1505 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0
1506 ; AVX512VLDQ-NEXT: retq
1508 ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
1509 ; AVX512VLBW: # %bb.0:
1510 ; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1511 ; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
1512 ; AVX512VLBW-NEXT: retq
1514 ; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
1516 ; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1517 ; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
1519 %index0 = extractelement <16 x i16> %indices, i32 0
1520 %index1 = extractelement <16 x i16> %indices, i32 1
1521 %index2 = extractelement <16 x i16> %indices, i32 2
1522 %index3 = extractelement <16 x i16> %indices, i32 3
1523 %index4 = extractelement <16 x i16> %indices, i32 4
1524 %index5 = extractelement <16 x i16> %indices, i32 5
1525 %index6 = extractelement <16 x i16> %indices, i32 6
1526 %index7 = extractelement <16 x i16> %indices, i32 7
1527 %index8 = extractelement <16 x i16> %indices, i32 8
1528 %index9 = extractelement <16 x i16> %indices, i32 9
1529 %index10 = extractelement <16 x i16> %indices, i32 10
1530 %index11 = extractelement <16 x i16> %indices, i32 11
1531 %index12 = extractelement <16 x i16> %indices, i32 12
1532 %index13 = extractelement <16 x i16> %indices, i32 13
1533 %index14 = extractelement <16 x i16> %indices, i32 14
1534 %index15 = extractelement <16 x i16> %indices, i32 15
1535 %v0 = extractelement <8 x i16> %v, i16 %index0
1536 %v1 = extractelement <8 x i16> %v, i16 %index1
1537 %v2 = extractelement <8 x i16> %v, i16 %index2
1538 %v3 = extractelement <8 x i16> %v, i16 %index3
1539 %v4 = extractelement <8 x i16> %v, i16 %index4
1540 %v5 = extractelement <8 x i16> %v, i16 %index5
1541 %v6 = extractelement <8 x i16> %v, i16 %index6
1542 %v7 = extractelement <8 x i16> %v, i16 %index7
1543 %v8 = extractelement <8 x i16> %v, i16 %index8
1544 %v9 = extractelement <8 x i16> %v, i16 %index9
1545 %v10 = extractelement <8 x i16> %v, i16 %index10
1546 %v11 = extractelement <8 x i16> %v, i16 %index11
1547 %v12 = extractelement <8 x i16> %v, i16 %index12
1548 %v13 = extractelement <8 x i16> %v, i16 %index13
1549 %v14 = extractelement <8 x i16> %v, i16 %index14
1550 %v15 = extractelement <8 x i16> %v, i16 %index15
1551 %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
1552 %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
1553 %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
1554 %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
1555 %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
1556 %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
1557 %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
1558 %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
1559 %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
1560 %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
1561 %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
1562 %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
1563 %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
1564 %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
1565 %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
1566 %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
1567 ret <16 x i16> %ret15
1568 }
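; Byte indices can drive vpshufb/vpperm directly once the 128-bit source is splatted into both lanes; AVX512VBMI targets instead use a single cross-lane vpermb.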
1570 define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
1571 ; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
1573 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1574 ; XOP-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm2
1575 ; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm0
1576 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1579 ; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
1581 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1582 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1583 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
1584 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5
1585 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
1586 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
1587 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
1588 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm4
1589 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1590 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
1591 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1594 ; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
1596 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1597 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1598 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1599 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1600 ; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1601 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
1604 ; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
1606 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1607 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1608 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1609 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1610 ; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1611 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
1614 ; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
1615 ; AVX512VLDQ: # %bb.0:
1616 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1617 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
1618 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1619 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
1620 ; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1621 ; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0
1622 ; AVX512VLDQ-NEXT: retq
1624 ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
1625 ; AVX512VLBW: # %bb.0:
1626 ; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1627 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1628 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1629 ; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1630 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 {%k1}
1631 ; AVX512VLBW-NEXT: retq
1633 ; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
1635 ; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1636 ; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
1638 %index0 = extractelement <32 x i8> %indices, i32 0
1639 %index1 = extractelement <32 x i8> %indices, i32 1
1640 %index2 = extractelement <32 x i8> %indices, i32 2
1641 %index3 = extractelement <32 x i8> %indices, i32 3
1642 %index4 = extractelement <32 x i8> %indices, i32 4
1643 %index5 = extractelement <32 x i8> %indices, i32 5
1644 %index6 = extractelement <32 x i8> %indices, i32 6
1645 %index7 = extractelement <32 x i8> %indices, i32 7
1646 %index8 = extractelement <32 x i8> %indices, i32 8
1647 %index9 = extractelement <32 x i8> %indices, i32 9
1648 %index10 = extractelement <32 x i8> %indices, i32 10
1649 %index11 = extractelement <32 x i8> %indices, i32 11
1650 %index12 = extractelement <32 x i8> %indices, i32 12
1651 %index13 = extractelement <32 x i8> %indices, i32 13
1652 %index14 = extractelement <32 x i8> %indices, i32 14
1653 %index15 = extractelement <32 x i8> %indices, i32 15
1654 %index16 = extractelement <32 x i8> %indices, i32 16
1655 %index17 = extractelement <32 x i8> %indices, i32 17
1656 %index18 = extractelement <32 x i8> %indices, i32 18
1657 %index19 = extractelement <32 x i8> %indices, i32 19
1658 %index20 = extractelement <32 x i8> %indices, i32 20
1659 %index21 = extractelement <32 x i8> %indices, i32 21
1660 %index22 = extractelement <32 x i8> %indices, i32 22
1661 %index23 = extractelement <32 x i8> %indices, i32 23
1662 %index24 = extractelement <32 x i8> %indices, i32 24
1663 %index25 = extractelement <32 x i8> %indices, i32 25
1664 %index26 = extractelement <32 x i8> %indices, i32 26
1665 %index27 = extractelement <32 x i8> %indices, i32 27
1666 %index28 = extractelement <32 x i8> %indices, i32 28
1667 %index29 = extractelement <32 x i8> %indices, i32 29
1668 %index30 = extractelement <32 x i8> %indices, i32 30
1669 %index31 = extractelement <32 x i8> %indices, i32 31
1670 %v0 = extractelement <16 x i8> %v, i8 %index0
1671 %v1 = extractelement <16 x i8> %v, i8 %index1
1672 %v2 = extractelement <16 x i8> %v, i8 %index2
1673 %v3 = extractelement <16 x i8> %v, i8 %index3
1674 %v4 = extractelement <16 x i8> %v, i8 %index4
1675 %v5 = extractelement <16 x i8> %v, i8 %index5
1676 %v6 = extractelement <16 x i8> %v, i8 %index6
1677 %v7 = extractelement <16 x i8> %v, i8 %index7
1678 %v8 = extractelement <16 x i8> %v, i8 %index8
1679 %v9 = extractelement <16 x i8> %v, i8 %index9
1680 %v10 = extractelement <16 x i8> %v, i8 %index10
1681 %v11 = extractelement <16 x i8> %v, i8 %index11
1682 %v12 = extractelement <16 x i8> %v, i8 %index12
1683 %v13 = extractelement <16 x i8> %v, i8 %index13
1684 %v14 = extractelement <16 x i8> %v, i8 %index14
1685 %v15 = extractelement <16 x i8> %v, i8 %index15
1686 %v16 = extractelement <16 x i8> %v, i8 %index16
1687 %v17 = extractelement <16 x i8> %v, i8 %index17
1688 %v18 = extractelement <16 x i8> %v, i8 %index18
1689 %v19 = extractelement <16 x i8> %v, i8 %index19
1690 %v20 = extractelement <16 x i8> %v, i8 %index20
1691 %v21 = extractelement <16 x i8> %v, i8 %index21
1692 %v22 = extractelement <16 x i8> %v, i8 %index22
1693 %v23 = extractelement <16 x i8> %v, i8 %index23
1694 %v24 = extractelement <16 x i8> %v, i8 %index24
1695 %v25 = extractelement <16 x i8> %v, i8 %index25
1696 %v26 = extractelement <16 x i8> %v, i8 %index26
1697 %v27 = extractelement <16 x i8> %v, i8 %index27
1698 %v28 = extractelement <16 x i8> %v, i8 %index28
1699 %v29 = extractelement <16 x i8> %v, i8 %index29
1700 %v30 = extractelement <16 x i8> %v, i8 %index30
1701 %v31 = extractelement <16 x i8> %v, i8 %index31
1702 %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
1703 %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
1704 %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
1705 %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
1706 %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
1707 %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
1708 %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
1709 %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
1710 %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
1711 %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
1712 %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
1713 %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
1714 %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
1715 %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
1716 %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
1717 %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
1718 %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
1719 %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
1720 %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
1721 %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
1722 %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
1723 %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
1724 %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
1725 %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
1726 %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
1727 %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
1728 %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
1729 %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
1730 %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
1731 %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
1732 %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
1733 %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
1734 ret <32 x i8> %ret31
1735 }
1737 define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
1738 ; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
1740 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1741 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1742 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
1743 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
1744 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1745 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1746 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
1749 ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
1751 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1752 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1753 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
1754 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1755 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1756 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
1757 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
1758 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1759 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1760 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1761 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
1762 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
1765 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
1767 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1768 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
1769 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
1770 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1771 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
1772 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1773 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1774 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
1777 ; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
1779 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1780 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1781 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1782 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1785 ; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
1786 ; AVX512VL: # %bb.0:
1787 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1788 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
1789 ; AVX512VL-NEXT: retq
1790 %index0 = extractelement <4 x i64> %indices, i32 0
1791 %index1 = extractelement <4 x i64> %indices, i32 1
1792 %index2 = extractelement <4 x i64> %indices, i32 2
1793 %index3 = extractelement <4 x i64> %indices, i32 3
1794 %v0 = extractelement <2 x double> %v, i64 %index0
1795 %v1 = extractelement <2 x double> %v, i64 %index1
1796 %v2 = extractelement <2 x double> %v, i64 %index2
1797 %v3 = extractelement <2 x double> %v, i64 %index3
1798 %ret0 = insertelement <4 x double> undef, double %v0, i32 0
1799 %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
1800 %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
1801 %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
1802 ret <4 x double> %ret3
1803 }
1805 define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
1806 ; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
1807 ; XOP: # %bb.0: # %entry
1808 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1809 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1810 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
1813 ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
1814 ; AVX1: # %bb.0: # %entry
1815 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1816 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
1817 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1818 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1819 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
1820 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1821 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
1822 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
1823 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
1826 ; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
1827 ; INT256: # %bb.0: # %entry
1828 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1829 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
1831 entry:
1832 %tmp1 = extractelement <8 x i32> %indices, i32 0
1833 %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
1834 %tmp2 = extractelement <8 x i32> %indices, i32 1
1835 %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
1836 %tmp3 = extractelement <8 x i32> %indices, i32 2
1837 %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
1838 %tmp4 = extractelement <8 x i32> %indices, i32 3
1839 %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
1840 %tmp5 = extractelement <8 x i32> %indices, i32 4
1841 %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
1842 %tmp6 = extractelement <8 x i32> %indices, i32 5
1843 %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
1844 %tmp7 = extractelement <8 x i32> %indices, i32 6
1845 %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
1846 %tmp8 = extractelement <8 x i32> %indices, i32 7
1847 %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
1848 %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
1849 %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
1850 %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
1851 %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
1852 %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
1853 %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
1854 %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
1855 %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
1856 ret <8 x float> %tmp16
1857 }
1859 define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
1860 ; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
1861 ; XOP: # %bb.0: # %entry
1862 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1863 ; XOP-NEXT: vpermil2ps $0, %xmm1, %xmm2, %xmm0, %xmm0
1864 ; XOP-NEXT: vzeroupper
1867 ; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
1868 ; AVX1: # %bb.0: # %entry
1869 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1870 ; AVX1-NEXT: vpermilps %xmm1, %xmm2, %xmm2
1871 ; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0
1872 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1873 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1874 ; AVX1-NEXT: vzeroupper
1877 ; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
1878 ; INT256: # %bb.0: # %entry
1879 ; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1880 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
1881 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1882 ; INT256-NEXT: vzeroupper
1884 entry:
1885 %tmp1 = extractelement <4 x i32> %indices, i32 0
1886 %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
1887 %tmp2 = extractelement <4 x i32> %indices, i32 1
1888 %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
1889 %tmp3 = extractelement <4 x i32> %indices, i32 2
1890 %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
1891 %tmp4 = extractelement <4 x i32> %indices, i32 3
1892 %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
1893 %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
1894 %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
1895 %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
1896 %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
1897 ret <4 x i32> %tmp12
1898 }
1901 ; PR50356 - correctly adjust the indices vector to match the source/destination size.
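; Only the first two indices are actually used here; AVX2/AVX512 lower this through a stack spill and two scalar loads, while XOP/AVX1 stay in vector registers via vpermil2pd/vpermilpd.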
1904 define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr nounwind {
1905 ; XOP-LABEL: PR50356:
1907 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1908 ; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1909 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1910 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1911 ; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm3, %xmm0, %xmm0
1912 ; XOP-NEXT: vpcomltq %xmm2, %xmm0, %xmm0
1913 ; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1
1914 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
1915 ; XOP-NEXT: vpcomltq %xmm1, %xmm2, %xmm1
1916 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1917 ; XOP-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
1918 ; XOP-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1921 ; AVX1-LABEL: PR50356:
1923 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1924 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1925 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1926 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1927 ; AVX1-NEXT: vpermilpd %xmm1, %xmm3, %xmm3
1928 ; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
1929 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1930 ; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
1931 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
1932 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
1933 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1934 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
1935 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1936 ; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
1937 ; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1940 ; AVX2-LABEL: PR50356:
1942 ; AVX2-NEXT: pushq %rbp
1943 ; AVX2-NEXT: movq %rsp, %rbp
1944 ; AVX2-NEXT: andq $-32, %rsp
1945 ; AVX2-NEXT: subq $64, %rsp
1946 ; AVX2-NEXT: vmovd %xmm1, %eax
1947 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
1948 ; AVX2-NEXT: andl $3, %eax
1949 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
1950 ; AVX2-NEXT: andl $3, %ecx
1951 ; AVX2-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
1952 ; AVX2-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
1953 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1954 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
1955 ; AVX2-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
1956 ; AVX2-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1957 ; AVX2-NEXT: movq %rbp, %rsp
1958 ; AVX2-NEXT: popq %rbp
1961 ; AVX512-LABEL: PR50356:
1963 ; AVX512-NEXT: pushq %rbp
1964 ; AVX512-NEXT: movq %rsp, %rbp
1965 ; AVX512-NEXT: andq $-32, %rsp
1966 ; AVX512-NEXT: subq $64, %rsp
1967 ; AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1968 ; AVX512-NEXT: vmovd %xmm1, %eax
1969 ; AVX512-NEXT: vmovaps %ymm0, (%rsp)
1970 ; AVX512-NEXT: andl $3, %eax
1971 ; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
1972 ; AVX512-NEXT: andl $3, %ecx
1973 ; AVX512-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
1974 ; AVX512-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
1975 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1976 ; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
1977 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,51,85,119]
1978 ; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = [34,68,102,136]
1979 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1980 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1981 ; AVX512-NEXT: movq %rbp, %rsp
1982 ; AVX512-NEXT: popq %rbp
1985 ; AVX512VL-LABEL: PR50356:
1986 ; AVX512VL: # %bb.0:
1987 ; AVX512VL-NEXT: pushq %rbp
1988 ; AVX512VL-NEXT: movq %rsp, %rbp
1989 ; AVX512VL-NEXT: andq $-32, %rsp
1990 ; AVX512VL-NEXT: subq $64, %rsp
1991 ; AVX512VL-NEXT: vmovd %xmm1, %eax
1992 ; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
1993 ; AVX512VL-NEXT: andl $3, %eax
1994 ; AVX512VL-NEXT: vpextrd $1, %xmm1, %ecx
1995 ; AVX512VL-NEXT: andl $3, %ecx
1996 ; AVX512VL-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
1997 ; AVX512VL-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
1998 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1999 ; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1
2000 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = [34,68,102,136]
2001 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} ymm0 {%k1} = [17,51,85,119]
2002 ; AVX512VL-NEXT: movq %rbp, %rsp
2003 ; AVX512VL-NEXT: popq %rbp
2004 ; AVX512VL-NEXT: retq
2005 %v9 = and <4 x i32> %1, <i32 7, i32 7, i32 7, i32 7>
2006 %v10 = extractelement <4 x i32> %v9, i32 0
2007 %v11 = extractelement <4 x i64> %0, i32 %v10
2008 %v14 = extractelement <4 x i32> %v9, i32 1
2009 %v15 = extractelement <4 x i64> %0, i32 %v14
2010 %v27 = insertelement <4 x i64> zeroinitializer, i64 %v11, i32 0
2011 %v28 = insertelement <4 x i64> %v27, i64 %v15, i32 1
2012 %v36 = icmp slt <4 x i64> %v28, %2
2013 %v37 = select <4 x i1> %v36, <4 x i64> <i64 17, i64 51, i64 85, i64 119>, <4 x i64> <i64 34, i64 68, i64 102, i64 136> ; 17 68 102 136
2014 ret <4 x i64> %v37
2015 }
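; The narrow <16 x i8> index operand is first zero-extended to 64-bit element indices (vpmovzxbq) before the usual v4i64 variable-shuffle lowering.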
2017 define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> %indices) unnamed_addr nounwind {
2018 ; XOP-LABEL: var_shuffle_v4i64_with_v16i8_indices:
2020 ; XOP-NEXT: vpsrld $16, %xmm1, %xmm2
2021 ; XOP-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
2022 ; XOP-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
2023 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
2024 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2025 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
2026 ; XOP-NEXT: vpaddq %xmm2, %xmm2, %xmm2
2027 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2028 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
2031 ; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices:
2033 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
2034 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
2035 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
2036 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
2037 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
2038 ; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
2039 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
2040 ; AVX1-NEXT: vpermilpd %ymm4, %ymm3, %ymm3
2041 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2042 ; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
2043 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2044 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
2045 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2046 ; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0
2049 ; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices:
2051 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
2052 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
2053 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
2054 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
2055 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
2056 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
2057 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2058 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
2059 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
2062 ; AVX512-LABEL: var_shuffle_v4i64_with_v16i8_indices:
2064 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2065 ; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
2066 ; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
2067 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2070 ; AVX512VL-LABEL: var_shuffle_v4i64_with_v16i8_indices:
2071 ; AVX512VL: # %bb.0:
2072 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
2073 ; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
2074 ; AVX512VL-NEXT: retq
2075 %index0 = extractelement <16 x i8> %indices, i32 0
2076 %index1 = extractelement <16 x i8> %indices, i32 1
2077 %index2 = extractelement <16 x i8> %indices, i32 2
2078 %index3 = extractelement <16 x i8> %indices, i32 3
2079 %v0 = extractelement <4 x i64> %v, i8 %index0
2080 %v1 = extractelement <4 x i64> %v, i8 %index1
2081 %v2 = extractelement <4 x i64> %v, i8 %index2
2082 %v3 = extractelement <4 x i64> %v, i8 %index3
2083 %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
2084 %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
2085 %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
2086 %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3