; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512VL,VLVBMI
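; Each function below builds a variable, runtime-indexed shuffle out of scalar
; extractelement/insertelement chains, so the backend must lower a fully
; variable permute; the RUN lines cover the XOP, AVX1, AVX2 and AVX-512 lowerings.
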
define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; INT256-LABEL: var_shuffle_v8i32:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; INT256-LABEL: var_shuffle_v8f32:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

; PR35820 - Unequal source/destination vector sizes

define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLDQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %v8 = extractelement <8 x i16> %v, i16 %index8
  %v9 = extractelement <8 x i16> %v, i16 %index9
  %v10 = extractelement <8 x i16> %v, i16 %index10
  %v11 = extractelement <8 x i16> %v, i16 %index11
  %v12 = extractelement <8 x i16> %v, i16 %index12
  %v13 = extractelement <8 x i16> %v, i16 %index13
  %v14 = extractelement <8 x i16> %v, i16 %index14
  %v15 = extractelement <8 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VLDQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %v16 = extractelement <16 x i8> %v, i8 %index16
  %v17 = extractelement <16 x i8> %v, i8 %index17
  %v18 = extractelement <16 x i8> %v, i8 %index18
  %v19 = extractelement <16 x i8> %v, i8 %index19
  %v20 = extractelement <16 x i8> %v, i8 %index20
  %v21 = extractelement <16 x i8> %v, i8 %index21
  %v22 = extractelement <16 x i8> %v, i8 %index22
  %v23 = extractelement <16 x i8> %v, i8 %index23
  %v24 = extractelement <16 x i8> %v, i8 %index24
  %v25 = extractelement <16 x i8> %v, i8 %index25
  %v26 = extractelement <16 x i8> %v, i8 %index26
  %v27 = extractelement <16 x i8> %v, i8 %index27
  %v28 = extractelement <16 x i8> %v, i8 %index28
  %v29 = extractelement <16 x i8> %v, i8 %index29
  %v30 = extractelement <16 x i8> %v, i8 %index30
  %v31 = extractelement <16 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %v2 = extractelement <2 x double> %v, i64 %index2
  %v3 = extractelement <2 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
  ret <8 x float> %tmp16
}

define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpermil2ps $0, %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vzeroupper
; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpermilps %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; INT256-NEXT: vzeroupper
entry:
  %tmp1 = extractelement <4 x i32> %indices, i32 0
  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <4 x i32> %indices, i32 1
  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <4 x i32> %indices, i32 2
  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <4 x i32> %indices, i32 3
  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
  ret <4 x i32> %tmp12
}

; PR50356 - correctly adjust the indices vector to match the source/destination size.

define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr nounwind {
; XOP-LABEL: PR50356:
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT: vpcomltq %xmm2, %xmm0, %xmm0
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpcomltq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
; XOP-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX1-LABEL: PR50356:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpermilpd %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX2-LABEL: PR50356:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vmovd %xmm1, %eax
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: andl $3, %eax
; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
; AVX2-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX512-LABEL: PR50356:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-32, %rsp
; AVX512-NEXT: subq $64, %rsp
; AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: vmovaps %ymm0, (%rsp)
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
; AVX512-NEXT: andl $3, %ecx
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [17,51,85,119]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136]
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512VL-LABEL: PR50356:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbp
; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: andq $-32, %rsp
; AVX512VL-NEXT: subq $64, %rsp
; AVX512VL-NEXT: vmovd %xmm1, %eax
; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: andl $3, %eax
; AVX512VL-NEXT: vpextrd $1, %xmm1, %ecx
; AVX512VL-NEXT: andl $3, %ecx
; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136]
; AVX512VL-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1}
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: retq
  %v9 = and <4 x i32> %1, <i32 7, i32 7, i32 7, i32 7>
  %v10 = extractelement <4 x i32> %v9, i32 0
  %v11 = extractelement <4 x i64> %0, i32 %v10
  %v14 = extractelement <4 x i32> %v9, i32 1
  %v15 = extractelement <4 x i64> %0, i32 %v14
  %v27 = insertelement <4 x i64> zeroinitializer, i64 %v11, i32 0
  %v28 = insertelement <4 x i64> %v27, i64 %v15, i32 1
  %v36 = icmp slt <4 x i64> %v28, %2
  %v37 = select <4 x i1> %v36, <4 x i64> <i64 17, i64 51, i64 85, i64 119>, <4 x i64> <i64 34, i64 68, i64 102, i64 136> ; 17 68 102 136
  ret <4 x i64> %v37
}

define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; XOP-NEXT: vpsrld $16, %xmm1, %xmm2
; XOP-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; XOP-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpaddq %xmm2, %xmm2, %xmm2
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX512-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i8 %index0
  %v1 = extractelement <4 x i64> %v, i8 %index1
  %v2 = extractelement <4 x i64> %v, i8 %index2
  %v3 = extractelement <4 x i64> %v, i8 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}