; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,VLVBMI
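
; Each function below expands a fully variable shuffle: every result element
; is selected by a runtime index taken from a second vector operand, so the
; lowering cannot use an immediate-controlled shuffle.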

define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

;
; PR35820 - Unequal source/destination vector sizes
;

define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %v8 = extractelement <8 x i16> %v, i16 %index8
  %v9 = extractelement <8 x i16> %v, i16 %index9
  %v10 = extractelement <8 x i16> %v, i16 %index10
  %v11 = extractelement <8 x i16> %v, i16 %index11
  %v12 = extractelement <8 x i16> %v, i16 %index12
  %v13 = extractelement <8 x i16> %v, i16 %index13
  %v14 = extractelement <8 x i16> %v, i16 %index14
  %v15 = extractelement <8 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %v16 = extractelement <16 x i8> %v, i8 %index16
  %v17 = extractelement <16 x i8> %v, i8 %index17
  %v18 = extractelement <16 x i8> %v, i8 %index18
  %v19 = extractelement <16 x i8> %v, i8 %index19
  %v20 = extractelement <16 x i8> %v, i8 %index20
  %v21 = extractelement <16 x i8> %v, i8 %index21
  %v22 = extractelement <16 x i8> %v, i8 %index22
  %v23 = extractelement <16 x i8> %v, i8 %index23
  %v24 = extractelement <16 x i8> %v, i8 %index24
  %v25 = extractelement <16 x i8> %v, i8 %index25
  %v26 = extractelement <16 x i8> %v, i8 %index26
  %v27 = extractelement <16 x i8> %v, i8 %index27
  %v28 = extractelement <16 x i8> %v, i8 %index28
  %v29 = extractelement <16 x i8> %v, i8 %index29
  %v30 = extractelement <16 x i8> %v, i8 %index30
  %v31 = extractelement <16 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
; XOP: # %bb.0:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %v2 = extractelement <2 x double> %v, i64 %index2
  %v3 = extractelement <2 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
  ret <8 x float> %tmp16
}

define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; INT256-NEXT: vzeroupper
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <4 x i32> %indices, i32 0
  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <4 x i32> %indices, i32 1
  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <4 x i32> %indices, i32 2
  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <4 x i32> %indices, i32 3
  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
  ret <4 x i32> %tmp12
}