1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
10 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
11 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
; Single-source i32 shuffle. Every mask index is below 4, so only %a is read
; and %b is unused. Result lanes are [a0, a0, a0, a1].
13 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
14 ; SSE-LABEL: shuffle_v4i32_0001:
16 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
19 ; AVX-LABEL: shuffle_v4i32_0001:
21 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
23 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
24 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a0, a0, a2, a0].
26 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
27 ; SSE-LABEL: shuffle_v4i32_0020:
29 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
32 ; AVX-LABEL: shuffle_v4i32_0020:
34 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
36 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
37 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a0, a1, a1, a2].
39 define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
40 ; SSE-LABEL: shuffle_v4i32_0112:
42 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
45 ; AVX-LABEL: shuffle_v4i32_0112:
47 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
49 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
50 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a0, a3, a0, a0].
52 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
53 ; SSE-LABEL: shuffle_v4i32_0300:
55 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
58 ; AVX-LABEL: shuffle_v4i32_0300:
60 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
62 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
63 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a1, a0, a0, a0].
65 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
66 ; SSE-LABEL: shuffle_v4i32_1000:
68 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
71 ; AVX-LABEL: shuffle_v4i32_1000:
73 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
75 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
76 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a2, a2, a0, a0].
78 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
79 ; SSE-LABEL: shuffle_v4i32_2200:
81 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
84 ; AVX-LABEL: shuffle_v4i32_2200:
86 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
88 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
89 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a3, a3, a3, a0].
91 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
92 ; SSE-LABEL: shuffle_v4i32_3330:
94 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
97 ; AVX-LABEL: shuffle_v4i32_3330:
99 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
101 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
102 ret <4 x i32> %shuffle
; Single-source i32 shuffle that reverses %a. Result lanes are [a3, a2, a1, a0].
104 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
105 ; SSE-LABEL: shuffle_v4i32_3210:
107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
110 ; AVX-LABEL: shuffle_v4i32_3210:
112 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
114 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
115 ret <4 x i32> %shuffle
; Single-source i32 shuffle (only %a is read). Result lanes are [a2, a1, a2, a1].
118 define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
119 ; SSE-LABEL: shuffle_v4i32_2121:
121 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
124 ; AVX-LABEL: shuffle_v4i32_2121:
126 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
128 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
129 ret <4 x i32> %shuffle
; Single-source float shuffle. Every mask index is below 4, so only %a is read
; and %b is unused. Result lanes are [a0, a0, a0, a1].
132 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
133 ; SSE-LABEL: shuffle_v4f32_0001:
135 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
138 ; AVX-LABEL: shuffle_v4f32_0001:
140 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
142 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
143 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a0, a0, a2, a0].
145 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
146 ; SSE-LABEL: shuffle_v4f32_0020:
148 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
151 ; AVX-LABEL: shuffle_v4f32_0020:
153 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
155 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
156 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a0, a3, a0, a0].
158 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
159 ; SSE-LABEL: shuffle_v4f32_0300:
161 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
164 ; AVX-LABEL: shuffle_v4f32_0300:
166 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
168 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
169 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a1, a0, a0, a0].
171 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
172 ; SSE-LABEL: shuffle_v4f32_1000:
174 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
177 ; AVX-LABEL: shuffle_v4f32_1000:
179 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
181 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
182 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a2, a2, a0, a0].
184 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
185 ; SSE-LABEL: shuffle_v4f32_2200:
187 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
190 ; AVX-LABEL: shuffle_v4f32_2200:
192 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
194 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
195 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a3, a3, a3, a0].
197 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
198 ; SSE-LABEL: shuffle_v4f32_3330:
200 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
203 ; AVX-LABEL: shuffle_v4f32_3330:
205 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
207 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
208 ret <4 x float> %shuffle
; Single-source float shuffle that reverses %a. Result lanes are [a3, a2, a1, a0].
210 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
211 ; SSE-LABEL: shuffle_v4f32_3210:
213 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
216 ; AVX-LABEL: shuffle_v4f32_3210:
218 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
220 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
221 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a0, a0, a1, a1].
; The SSE-level checks expect this as unpcklps of xmm0 with itself rather than
; a shufps.
223 define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
224 ; SSE-LABEL: shuffle_v4f32_0011:
226 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
229 ; AVX-LABEL: shuffle_v4f32_0011:
231 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
233 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
234 ret <4 x float> %shuffle
; Single-source float shuffle (only %a is read). Result lanes are [a2, a2, a3, a3].
; The SSE-level checks expect this as unpckhps of xmm0 with itself rather than
; a shufps.
236 define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
237 ; SSE-LABEL: shuffle_v4f32_2233:
239 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
242 ; AVX-LABEL: shuffle_v4f32_2233:
244 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
246 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
247 ret <4 x float> %shuffle
; Single-source float shuffle duplicating the even lanes of %a. Result lanes are
; [a0, a0, a2, a2]. The expectations split by feature level. The baseline
; (no -mattr) run expects shufps, while the SSE3, SSSE3, SSE4.1 and AVX runs
; expect movsldup / vmovsldup.
249 define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
250 ; SSE2-LABEL: shuffle_v4f32_0022:
252 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
255 ; SSE3-LABEL: shuffle_v4f32_0022:
257 ; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
260 ; SSSE3-LABEL: shuffle_v4f32_0022:
262 ; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
265 ; SSE41-LABEL: shuffle_v4f32_0022:
267 ; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
270 ; AVX-LABEL: shuffle_v4f32_0022:
272 ; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
274 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
275 ret <4 x float> %shuffle
; Single-source float shuffle duplicating the odd lanes of %a. Result lanes are
; [a1, a1, a3, a3]. The expectations split by feature level. The baseline
; (no -mattr) run expects shufps, while the SSE3, SSSE3, SSE4.1 and AVX runs
; expect movshdup / vmovshdup.
277 define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
278 ; SSE2-LABEL: shuffle_v4f32_1133:
280 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
283 ; SSE3-LABEL: shuffle_v4f32_1133:
285 ; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
288 ; SSSE3-LABEL: shuffle_v4f32_1133:
290 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
293 ; SSE41-LABEL: shuffle_v4f32_1133:
295 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
298 ; AVX-LABEL: shuffle_v4f32_1133:
300 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
302 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
303 ret <4 x float> %shuffle
; Two-source float shuffle. Mask indices 4 and 5 select %b[0] and %b[1], so the
; result lanes are [a0, a1, b0, b1] (low halves of %a and %b concatenated).
306 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
307 ; SSE-LABEL: shuffle_v4f32_0145:
309 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
312 ; AVX-LABEL: shuffle_v4f32_0145:
314 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
316 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
317 ret <4 x float> %shuffle
; Two-source float shuffle. Mask indices 6 and 7 select %b[2] and %b[3], so the
; result lanes are [b2, b3, a2, a3] (high half of %b followed by high half of %a).
; The SSE-level checks expect movhlps, and the AVX-level checks expect vunpckhpd.
320 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
321 ; SSE-LABEL: shuffle_v4f32_6723:
323 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
326 ; AVX-LABEL: shuffle_v4f32_6723:
328 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
330 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
331 ret <4 x float> %shuffle
; Two-source i32 shuffle. Mask index 4 selects %b[0], so the result lanes are
; [a0, a1, a2, b0]. Expectations differ per feature level. Pre-SSE4.1 runs use
; two shufps, SSE4.1 and AVX1/AVX2 splat b0 and blend it into lane 3, and the
; AVX512VL runs expect a single vpermt2d driven by a constant index vector.
334 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
335 ; SSE2-LABEL: shuffle_v4i32_0124:
337 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
341 ; SSE3-LABEL: shuffle_v4i32_0124:
343 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
344 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
347 ; SSSE3-LABEL: shuffle_v4i32_0124:
349 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
350 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
353 ; SSE41-LABEL: shuffle_v4i32_0124:
355 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
356 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
359 ; AVX1-LABEL: shuffle_v4i32_0124:
361 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
362 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
365 ; AVX2-LABEL: shuffle_v4i32_0124:
367 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
368 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
371 ; AVX512VL-LABEL: shuffle_v4i32_0124:
373 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
374 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
375 ; AVX512VL-NEXT: retq
376 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
377 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask index 4 selects %b[0], so the result lanes are
; [a0, a1, b0, a2]. Pre-SSE4.1 runs expect two shufps, SSE4.1 and AVX1/AVX2
; shuffle each source and then blend b0 into lane 2, and the AVX512VL runs
; expect a single vpermt2d driven by a constant index vector.
379 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
380 ; SSE2-LABEL: shuffle_v4i32_0142:
382 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
383 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
386 ; SSE3-LABEL: shuffle_v4i32_0142:
388 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
389 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
392 ; SSSE3-LABEL: shuffle_v4i32_0142:
394 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
395 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
398 ; SSE41-LABEL: shuffle_v4i32_0142:
400 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
401 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
402 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
405 ; AVX1-LABEL: shuffle_v4i32_0142:
407 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
408 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
409 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
412 ; AVX2-LABEL: shuffle_v4i32_0142:
414 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
415 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
416 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
419 ; AVX512VL-LABEL: shuffle_v4i32_0142:
421 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
422 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
423 ; AVX512VL-NEXT: retq
424 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
425 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask index 4 selects %b[0], so the result lanes are
; [a0, b0, a1, a2]. Pre-SSE4.1 runs build the result in xmm1 (movlhps + shufps)
; and copy it back to xmm0, SSE4.1 and AVX1/AVX2 shuffle each source and then
; blend b0 into lane 1, and the AVX512VL runs expect a single vpermt2d.
427 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
428 ; SSE2-LABEL: shuffle_v4i32_0412:
430 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
431 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
432 ; SSE2-NEXT: movaps %xmm1, %xmm0
435 ; SSE3-LABEL: shuffle_v4i32_0412:
437 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
438 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
439 ; SSE3-NEXT: movaps %xmm1, %xmm0
442 ; SSSE3-LABEL: shuffle_v4i32_0412:
444 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
445 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
446 ; SSSE3-NEXT: movaps %xmm1, %xmm0
449 ; SSE41-LABEL: shuffle_v4i32_0412:
451 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
452 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
453 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
456 ; AVX1-LABEL: shuffle_v4i32_0412:
458 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
459 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
460 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
463 ; AVX2-LABEL: shuffle_v4i32_0412:
465 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
466 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
467 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
470 ; AVX512VL-LABEL: shuffle_v4i32_0412:
472 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
473 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
474 ; AVX512VL-NEXT: retq
475 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
476 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask index 4 selects %b[0], so the result lanes are
; [b0, a0, a1, a2]. Here b0 is already in lane 0 of xmm1, so the SSE4.1 and
; AVX1/AVX2 expectations only shuffle %a and blend, with no shuffle of %b.
; AVX1 and AVX2 share one expectation under the AVX1OR2 prefix.
478 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
479 ; SSE2-LABEL: shuffle_v4i32_4012:
481 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
482 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
483 ; SSE2-NEXT: movaps %xmm1, %xmm0
486 ; SSE3-LABEL: shuffle_v4i32_4012:
488 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
489 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
490 ; SSE3-NEXT: movaps %xmm1, %xmm0
493 ; SSSE3-LABEL: shuffle_v4i32_4012:
495 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
496 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
497 ; SSSE3-NEXT: movaps %xmm1, %xmm0
500 ; SSE41-LABEL: shuffle_v4i32_4012:
502 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
503 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
506 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
508 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
509 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
512 ; AVX512VL-LABEL: shuffle_v4i32_4012:
514 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
515 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
516 ; AVX512VL-NEXT: retq
517 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
518 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask indices 4 and 5 select %b[0] and %b[1], so the
; result lanes are [a0, a1, b0, b1]. The expectations use movlhps / vmovlhps
; even though the element type is i32.
520 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
521 ; SSE-LABEL: shuffle_v4i32_0145:
523 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
526 ; AVX-LABEL: shuffle_v4i32_0145:
528 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
530 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
531 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask indices 4 and 5 select %b[0] and %b[1], so the
; result lanes are [a0, b0, b1, a1]. Pre-SSE4.1 runs interleave the low halves
; with punpckldq and then swap the top two lanes with pshufd. SSE4.1 and
; AVX1/AVX2 shuffle each source and blend. The AVX512VL runs expect a single
; vpermt2d.
533 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
534 ; SSE2-LABEL: shuffle_v4i32_0451:
536 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
540 ; SSE3-LABEL: shuffle_v4i32_0451:
542 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
543 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
546 ; SSSE3-LABEL: shuffle_v4i32_0451:
548 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
549 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
552 ; SSE41-LABEL: shuffle_v4i32_0451:
554 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
555 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
556 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
559 ; AVX1-LABEL: shuffle_v4i32_0451:
561 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
562 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
563 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
566 ; AVX2-LABEL: shuffle_v4i32_0451:
568 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
569 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
570 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
573 ; AVX512VL-LABEL: shuffle_v4i32_0451:
575 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
576 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
577 ; AVX512VL-NEXT: retq
578 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
579 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask indices 4 and 5 select %b[0] and %b[1], so the
; result lanes are [b0, b1, a0, a1] (low half of %b followed by low half of %a).
; The SSE-level expectation builds the result in xmm1 and copies it to xmm0,
; while the AVX-level expectation writes xmm0 directly.
581 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
582 ; SSE-LABEL: shuffle_v4i32_4501:
584 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
585 ; SSE-NEXT: movaps %xmm1, %xmm0
588 ; AVX-LABEL: shuffle_v4i32_4501:
590 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
592 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
593 ret <4 x i32> %shuffle
; Two-source i32 shuffle. Mask indices 4 and 5 select %b[0] and %b[1], so the
; result lanes are [b0, a0, a1, b1]. Pre-SSE4.1 runs interleave the low halves
; with punpckldq and then swap the bottom two lanes with pshufd. SSE4.1 and
; AVX1/AVX2 shuffle each source and blend. The AVX512VL runs expect a single
; vpermt2d.
595 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
596 ; SSE2-LABEL: shuffle_v4i32_4015:
598 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
599 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
602 ; SSE3-LABEL: shuffle_v4i32_4015:
604 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
605 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
608 ; SSSE3-LABEL: shuffle_v4i32_4015:
610 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
611 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
614 ; SSE41-LABEL: shuffle_v4i32_4015:
616 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
617 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
618 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
621 ; AVX1-LABEL: shuffle_v4i32_4015:
623 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
624 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
625 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
628 ; AVX2-LABEL: shuffle_v4i32_4015:
630 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
631 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
632 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
635 ; AVX512VL-LABEL: shuffle_v4i32_4015:
637 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
638 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
639 ; AVX512VL-NEXT: retq
640 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
641 ret <4 x i32> %shuffle
; Shuffle with zero. The first shuffle operand is zeroinitializer and %a is the
; second, so mask index 4 selects a0 and indices 1-3 select zero lanes. Result
; lanes are [a0, 0, 0, 0]. Pre-SSE4.1 runs expect xorps + movss into a scratch
; register, while SSE4.1 and AVX expect xorps + blendps.
644 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
645 ; SSE2-LABEL: shuffle_v4f32_4zzz:
647 ; SSE2-NEXT: xorps %xmm1, %xmm1
648 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
649 ; SSE2-NEXT: movaps %xmm1, %xmm0
652 ; SSE3-LABEL: shuffle_v4f32_4zzz:
654 ; SSE3-NEXT: xorps %xmm1, %xmm1
655 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
656 ; SSE3-NEXT: movaps %xmm1, %xmm0
659 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
661 ; SSSE3-NEXT: xorps %xmm1, %xmm1
662 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
663 ; SSSE3-NEXT: movaps %xmm1, %xmm0
666 ; SSE41-LABEL: shuffle_v4f32_4zzz:
668 ; SSE41-NEXT: xorps %xmm1, %xmm1
669 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
672 ; AVX-LABEL: shuffle_v4f32_4zzz:
674 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
675 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
677 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
678 ret <4 x float> %shuffle
; Shuffle with zero. The first operand is zeroinitializer and %a is the second.
; Mask indices 2, 3 and 0 all read zero lanes and index 4 selects a0, so the
; result lanes are [0, a0, 0, 0]. SSE4.1 and AVX expect a single insertps with
; zeroing. Earlier levels expect movq + xorps + shufps.
681 define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
682 ; SSE2-LABEL: shuffle_v4f32_z4zz:
684 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
685 ; SSE2-NEXT: xorps %xmm1, %xmm1
686 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
689 ; SSE3-LABEL: shuffle_v4f32_z4zz:
691 ; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
692 ; SSE3-NEXT: xorps %xmm1, %xmm1
693 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
696 ; SSSE3-LABEL: shuffle_v4f32_z4zz:
698 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
699 ; SSSE3-NEXT: xorps %xmm1, %xmm1
700 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
703 ; SSE41-LABEL: shuffle_v4f32_z4zz:
705 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
708 ; AVX-LABEL: shuffle_v4f32_z4zz:
710 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
712 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
713 ret <4 x float> %shuffle
; Shuffle with zero. The first operand is zeroinitializer and %a is the second.
; Mask index 0 reads a zero lane and index 4 selects a0, so the result lanes
; are [0, 0, a0, 0]. SSE4.1 and AVX expect a single insertps with zeroing.
; Earlier levels expect movq + pxor + shufps.
716 define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
717 ; SSE2-LABEL: shuffle_v4f32_zz4z:
719 ; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
720 ; SSE2-NEXT: pxor %xmm0, %xmm0
721 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
724 ; SSE3-LABEL: shuffle_v4f32_zz4z:
726 ; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
727 ; SSE3-NEXT: pxor %xmm0, %xmm0
728 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
731 ; SSSE3-LABEL: shuffle_v4f32_zz4z:
733 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
734 ; SSSE3-NEXT: pxor %xmm0, %xmm0
735 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
738 ; SSE41-LABEL: shuffle_v4f32_zz4z:
740 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
743 ; AVX-LABEL: shuffle_v4f32_zz4z:
745 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
747 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
748 ret <4 x float> %shuffle
; Shuffle with zero and undef lanes. The first operand is zeroinitializer and
; %a is the second. The mask is <0, undef, undef, 4>, so lane 0 is zero, lanes
; 1-2 are unconstrained, and lane 3 is a0. The SSE4.1 and AVX expectations fill
; the undef lanes with zero via insertps. Earlier levels expect xorps + one
; shufps, which leaves a0 in lane 2 as well.
751 define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
752 ; SSE2-LABEL: shuffle_v4f32_zuu4:
754 ; SSE2-NEXT: xorps %xmm1, %xmm1
755 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
756 ; SSE2-NEXT: movaps %xmm1, %xmm0
759 ; SSE3-LABEL: shuffle_v4f32_zuu4:
761 ; SSE3-NEXT: xorps %xmm1, %xmm1
762 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
763 ; SSE3-NEXT: movaps %xmm1, %xmm0
766 ; SSSE3-LABEL: shuffle_v4f32_zuu4:
768 ; SSSE3-NEXT: xorps %xmm1, %xmm1
769 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
770 ; SSSE3-NEXT: movaps %xmm1, %xmm0
773 ; SSE41-LABEL: shuffle_v4f32_zuu4:
775 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
778 ; AVX-LABEL: shuffle_v4f32_zuu4:
780 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
782 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
783 ret <4 x float> %shuffle
; Shuffle with zero. The first operand is zeroinitializer and %a is the second.
; Mask indices 0-2 read zero lanes and index 7 selects a3, so the result lanes
; are [0, 0, 0, a3]. SSE4.1 and AVX expect xorps + blendps. Earlier levels
; expect xorps followed by two shufps and a register copy.
786 define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
787 ; SSE2-LABEL: shuffle_v4f32_zzz7:
789 ; SSE2-NEXT: xorps %xmm1, %xmm1
790 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
791 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
792 ; SSE2-NEXT: movaps %xmm1, %xmm0
795 ; SSE3-LABEL: shuffle_v4f32_zzz7:
797 ; SSE3-NEXT: xorps %xmm1, %xmm1
798 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
799 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
800 ; SSE3-NEXT: movaps %xmm1, %xmm0
803 ; SSSE3-LABEL: shuffle_v4f32_zzz7:
805 ; SSSE3-NEXT: xorps %xmm1, %xmm1
806 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
807 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
808 ; SSSE3-NEXT: movaps %xmm1, %xmm0
811 ; SSE41-LABEL: shuffle_v4f32_zzz7:
813 ; SSE41-NEXT: xorps %xmm1, %xmm1
814 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
817 ; AVX-LABEL: shuffle_v4f32_zzz7:
819 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
820 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
822 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
823 ret <4 x float> %shuffle
; Shuffle with zero. The first operand is zeroinitializer and %a is the second.
; Mask indices 0, 2 and 3 read zero lanes and index 6 selects a2, so the result
; lanes are [0, a2, 0, 0]. SSE4.1 and AVX expect a single insertps with zeroing.
; Earlier levels expect xorps + unpckhpd + shufps.
826 define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
827 ; SSE2-LABEL: shuffle_v4f32_z6zz:
829 ; SSE2-NEXT: xorps %xmm1, %xmm1
830 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
831 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
834 ; SSE3-LABEL: shuffle_v4f32_z6zz:
836 ; SSE3-NEXT: xorps %xmm1, %xmm1
837 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
838 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
841 ; SSSE3-LABEL: shuffle_v4f32_z6zz:
843 ; SSSE3-NEXT: xorps %xmm1, %xmm1
844 ; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
845 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
848 ; SSE41-LABEL: shuffle_v4f32_z6zz:
850 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
853 ; AVX-LABEL: shuffle_v4f32_z6zz:
855 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
857 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
858 ret <4 x float> %shuffle
; Shuffle with zero. Here %a is the first operand and zeroinitializer is the
; second, so mask index 4 selects a zero lane. Result lanes are [a0, 0, a2, a3].
; SSE4.1 and AVX expect xorps + blendps. Earlier levels expect xorps + movlhps
; + shufps and a register copy.
861 define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
862 ; SSE2-LABEL: shuffle_v4f32_0z23:
864 ; SSE2-NEXT: xorps %xmm1, %xmm1
865 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
866 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
867 ; SSE2-NEXT: movaps %xmm1, %xmm0
870 ; SSE3-LABEL: shuffle_v4f32_0z23:
872 ; SSE3-NEXT: xorps %xmm1, %xmm1
873 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
874 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
875 ; SSE3-NEXT: movaps %xmm1, %xmm0
878 ; SSSE3-LABEL: shuffle_v4f32_0z23:
880 ; SSSE3-NEXT: xorps %xmm1, %xmm1
881 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
882 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
883 ; SSSE3-NEXT: movaps %xmm1, %xmm0
886 ; SSE41-LABEL: shuffle_v4f32_0z23:
888 ; SSE41-NEXT: xorps %xmm1, %xmm1
889 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
892 ; AVX-LABEL: shuffle_v4f32_0z23:
894 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
895 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
897 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
898 ret <4 x float> %shuffle
; Shuffle with zero. %a is the first operand and zeroinitializer the second, so
; mask index 4 selects a zero lane. Result lanes are [a0, a1, 0, a3]. SSE4.1
; and AVX expect xorps + blendps. Earlier levels expect xorps + two shufps.
901 define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
902 ; SSE2-LABEL: shuffle_v4f32_01z3:
904 ; SSE2-NEXT: xorps %xmm1, %xmm1
905 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
906 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
909 ; SSE3-LABEL: shuffle_v4f32_01z3:
911 ; SSE3-NEXT: xorps %xmm1, %xmm1
912 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
913 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
916 ; SSSE3-LABEL: shuffle_v4f32_01z3:
918 ; SSSE3-NEXT: xorps %xmm1, %xmm1
919 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
920 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
923 ; SSE41-LABEL: shuffle_v4f32_01z3:
925 ; SSE41-NEXT: xorps %xmm1, %xmm1
926 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
929 ; AVX-LABEL: shuffle_v4f32_01z3:
931 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
932 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
934 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
935 ret <4 x float> %shuffle
; Shuffle with zero. %a is the first operand and zeroinitializer the second, so
; mask index 7 selects a zero lane. Result lanes are [a0, a1, a2, 0]. SSE4.1
; and AVX expect xorps + blendps. Earlier levels expect xorps + two shufps.
938 define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
939 ; SSE2-LABEL: shuffle_v4f32_012z:
941 ; SSE2-NEXT: xorps %xmm1, %xmm1
942 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
943 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
946 ; SSE3-LABEL: shuffle_v4f32_012z:
948 ; SSE3-NEXT: xorps %xmm1, %xmm1
949 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
950 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
953 ; SSSE3-LABEL: shuffle_v4f32_012z:
955 ; SSSE3-NEXT: xorps %xmm1, %xmm1
956 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
957 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
960 ; SSE41-LABEL: shuffle_v4f32_012z:
962 ; SSE41-NEXT: xorps %xmm1, %xmm1
963 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
966 ; AVX-LABEL: shuffle_v4f32_012z:
968 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
969 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
971 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
972 ret <4 x float> %shuffle
; Shuffle with zero. %a is the first operand and zeroinitializer the second.
; Both middle lanes use mask index 4 (a zero lane), so the result lanes are
; [a0, 0, 0, a3]. SSE4.1 and AVX expect xorps + blendps. Earlier levels expect
; xorps + two shufps.
975 define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
976 ; SSE2-LABEL: shuffle_v4f32_0zz3:
978 ; SSE2-NEXT: xorps %xmm1, %xmm1
979 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
980 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
983 ; SSE3-LABEL: shuffle_v4f32_0zz3:
985 ; SSE3-NEXT: xorps %xmm1, %xmm1
986 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
987 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
990 ; SSSE3-LABEL: shuffle_v4f32_0zz3:
992 ; SSSE3-NEXT: xorps %xmm1, %xmm1
993 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
994 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
997 ; SSE41-LABEL: shuffle_v4f32_0zz3:
999 ; SSE41-NEXT: xorps %xmm1, %xmm1
1000 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1003 ; AVX-LABEL: shuffle_v4f32_0zz3:
1005 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1006 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1008 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1009 ret <4 x float> %shuffle
; Shuffle with a partially-undef constant. The second operand is
; <0.0, undef, undef, undef> and the mask only ever reads its lane 0 (index 4),
; which is the defined zero. Result lanes are [v0, 0, v2, 0]. SSE4.1 and AVX
; expect xorps + blendps. Earlier levels expect xorps + two shufps.
1012 define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
1013 ; SSE2-LABEL: shuffle_v4f32_0z2z:
1015 ; SSE2-NEXT: xorps %xmm1, %xmm1
1016 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1017 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1020 ; SSE3-LABEL: shuffle_v4f32_0z2z:
1022 ; SSE3-NEXT: xorps %xmm1, %xmm1
1023 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1024 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1027 ; SSSE3-LABEL: shuffle_v4f32_0z2z:
1029 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1030 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1031 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1034 ; SSE41-LABEL: shuffle_v4f32_0z2z:
1036 ; SSE41-NEXT: xorps %xmm1, %xmm1
1037 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1040 ; AVX-LABEL: shuffle_v4f32_0z2z:
1042 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1043 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1045 %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
1046 ret <4 x float> %shuffle
; Two-source float shuffle with an undef lane. The mask is <undef, 0, 5, 1>, so
; lane 0 is unconstrained and the remaining lanes are [a0, b1, a1]. The
; expectations use a single unpcklps of %b with %a, which produces
; [b0, a0, b1, a1] and so fills the undef lane with b0.
1049 define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
1050 ; SSE-LABEL: shuffle_v4f32_u051:
1052 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1053 ; SSE-NEXT: movaps %xmm1, %xmm0
1056 ; AVX-LABEL: shuffle_v4f32_u051:
1058 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1060 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1061 ret <4 x float> %shuffle
; Two chained shuffles that together produce <a[0], 0, 0, b[0]>: the first builds
; <undef, 0, 0, b[0]> from %b and zero, the second replaces lane 0 with a[0].
; The SSE4.1 and AVX checks expect this to become a single (v)insertps.
1064 define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1065 ; SSE2-LABEL: shuffle_v4f32_0zz4:
1067 ; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1068 ; SSE2-NEXT: pxor %xmm1, %xmm1
1069 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1070 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1071 ; SSE2-NEXT: movaps %xmm1, %xmm0
1074 ; SSE3-LABEL: shuffle_v4f32_0zz4:
1076 ; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1077 ; SSE3-NEXT: pxor %xmm1, %xmm1
1078 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1079 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1080 ; SSE3-NEXT: movaps %xmm1, %xmm0
1083 ; SSSE3-LABEL: shuffle_v4f32_0zz4:
1085 ; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1086 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1087 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1088 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1089 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1092 ; SSE41-LABEL: shuffle_v4f32_0zz4:
1094 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1097 ; AVX-LABEL: shuffle_v4f32_0zz4:
1099 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1101 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1102 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1103 ret <4 x float> %shuffle1
; Two chained shuffles that together produce <a[0], 0, 0, b[2]>: the first picks
; <a[0], undef, undef, b[2]>, the second fills lanes 1 and 2 from zeroinitializer.
; The SSE4.1 and AVX checks expect this to become a single (v)insertps.
1106 define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1107 ; SSE2-LABEL: shuffle_v4f32_0zz6:
1109 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1110 ; SSE2-NEXT: xorps %xmm1, %xmm1
1111 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1112 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1113 ; SSE2-NEXT: movaps %xmm1, %xmm0
1116 ; SSE3-LABEL: shuffle_v4f32_0zz6:
1118 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1119 ; SSE3-NEXT: xorps %xmm1, %xmm1
1120 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1121 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1122 ; SSE3-NEXT: movaps %xmm1, %xmm0
1125 ; SSSE3-LABEL: shuffle_v4f32_0zz6:
1127 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1128 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1129 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1130 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1131 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1134 ; SSE41-LABEL: shuffle_v4f32_0zz6:
1136 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1139 ; AVX-LABEL: shuffle_v4f32_0zz6:
1141 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1143 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1144 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1145 ret <4 x float> %shuffle1
; Two chained shuffles that together produce <a[0], 0, a[2], b[0]>: the first picks
; <a[0], undef, a[2], b[0]>, the second fills lane 1 from zeroinitializer.
; The SSE4.1 and AVX checks expect this to become a single (v)insertps.
1148 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1149 ; SSE2-LABEL: shuffle_v4f32_0z24:
1151 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1152 ; SSE2-NEXT: xorps %xmm2, %xmm2
1153 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1154 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1155 ; SSE2-NEXT: movaps %xmm2, %xmm0
1158 ; SSE3-LABEL: shuffle_v4f32_0z24:
1160 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1161 ; SSE3-NEXT: xorps %xmm2, %xmm2
1162 ; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1163 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1164 ; SSE3-NEXT: movaps %xmm2, %xmm0
1167 ; SSSE3-LABEL: shuffle_v4f32_0z24:
1169 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1170 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1171 ; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1172 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1173 ; SSSE3-NEXT: movaps %xmm2, %xmm0
1176 ; SSE41-LABEL: shuffle_v4f32_0z24:
1178 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1181 ; AVX-LABEL: shuffle_v4f32_0z24:
1183 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1185 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1186 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1187 ret <4 x float> %shuffle1
; The first shuffle operand is zeroinitializer, so mask <4,1,2,3> keeps a[0] in
; lane 0 and zeroes lanes 1-3.
1190 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1191 ; SSE2-LABEL: shuffle_v4i32_4zzz:
1193 ; SSE2-NEXT: xorps %xmm1, %xmm1
1194 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1195 ; SSE2-NEXT: movaps %xmm1, %xmm0
1198 ; SSE3-LABEL: shuffle_v4i32_4zzz:
1200 ; SSE3-NEXT: xorps %xmm1, %xmm1
1201 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1202 ; SSE3-NEXT: movaps %xmm1, %xmm0
1205 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
1207 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1208 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1209 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1212 ; SSE41-LABEL: shuffle_v4i32_4zzz:
1214 ; SSE41-NEXT: xorps %xmm1, %xmm1
1215 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1218 ; AVX-LABEL: shuffle_v4i32_4zzz:
1220 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1221 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1223 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1224 ret <4 x i32> %shuffle
; Places a[0] in lane 1; lanes 0, 2 and 3 read from the zeroinitializer operand.
; The AVX2-FAST and AVX512VL checks expect a single vpshufb, while the other
; configurations expect a zero-blend followed by a permute.
1227 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1228 ; SSE2-LABEL: shuffle_v4i32_z4zz:
1230 ; SSE2-NEXT: xorps %xmm1, %xmm1
1231 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1232 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1235 ; SSE3-LABEL: shuffle_v4i32_z4zz:
1237 ; SSE3-NEXT: xorps %xmm1, %xmm1
1238 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1239 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1242 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
1244 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1245 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1246 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1249 ; SSE41-LABEL: shuffle_v4i32_z4zz:
1251 ; SSE41-NEXT: pxor %xmm1, %xmm1
1252 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1253 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1256 ; AVX1-LABEL: shuffle_v4i32_z4zz:
1258 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1259 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1260 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1263 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1264 ; AVX2-SLOW: # %bb.0:
1265 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1266 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1267 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1268 ; AVX2-SLOW-NEXT: retq
1270 ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1271 ; AVX2-FAST: # %bb.0:
1272 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1273 ; AVX2-FAST-NEXT: retq
1275 ; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1276 ; AVX512VL: # %bb.0:
1277 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1278 ; AVX512VL-NEXT: retq
1279 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1280 ret <4 x i32> %shuffle
; Places a[0] in lane 2; lanes 0, 1 and 3 read from the zeroinitializer operand.
; The AVX2-FAST and AVX512VL checks expect a single vpshufb, while the other
; configurations expect a zero-blend followed by a permute.
1283 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1284 ; SSE2-LABEL: shuffle_v4i32_zz4z:
1286 ; SSE2-NEXT: xorps %xmm1, %xmm1
1287 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1288 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1291 ; SSE3-LABEL: shuffle_v4i32_zz4z:
1293 ; SSE3-NEXT: xorps %xmm1, %xmm1
1294 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1295 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1298 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
1300 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1301 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1302 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1305 ; SSE41-LABEL: shuffle_v4i32_zz4z:
1307 ; SSE41-NEXT: pxor %xmm1, %xmm1
1308 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1309 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1312 ; AVX1-LABEL: shuffle_v4i32_zz4z:
1314 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1315 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1316 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1319 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1320 ; AVX2-SLOW: # %bb.0:
1321 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1322 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1323 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1324 ; AVX2-SLOW-NEXT: retq
1326 ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1327 ; AVX2-FAST: # %bb.0:
1328 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1329 ; AVX2-FAST-NEXT: retq
1331 ; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1332 ; AVX512VL: # %bb.0:
1333 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1334 ; AVX512VL-NEXT: retq
1335 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1336 ret <4 x i32> %shuffle
; Lane 0 is zero, lanes 1-2 are undef and lane 3 is a[0]; the checks expect a
; single byte-wise left shift ((v)pslldq) that shifts zeros into the low 12 bytes.
1339 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1340 ; SSE-LABEL: shuffle_v4i32_zuu4:
1342 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1345 ; AVX-LABEL: shuffle_v4i32_zuu4:
1347 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1349 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1350 ret <4 x i32> %shuffle
; Places a[2] in lane 1; lanes 0, 2 and 3 read from the zeroinitializer operand.
; The AVX2-FAST and AVX512VL checks expect a single vpshufb.
1353 define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1354 ; SSE2-LABEL: shuffle_v4i32_z6zz:
1356 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1357 ; SSE2-NEXT: xorps %xmm1, %xmm1
1358 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1361 ; SSE3-LABEL: shuffle_v4i32_z6zz:
1363 ; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1364 ; SSE3-NEXT: xorps %xmm1, %xmm1
1365 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1368 ; SSSE3-LABEL: shuffle_v4i32_z6zz:
1370 ; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1371 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1372 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1375 ; SSE41-LABEL: shuffle_v4i32_z6zz:
1377 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1378 ; SSE41-NEXT: pxor %xmm0, %xmm0
1379 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1382 ; AVX1-LABEL: shuffle_v4i32_z6zz:
1384 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1385 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1386 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1389 ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1390 ; AVX2-SLOW: # %bb.0:
1391 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1392 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1393 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1394 ; AVX2-SLOW-NEXT: retq
1396 ; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1397 ; AVX2-FAST: # %bb.0:
1398 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1399 ; AVX2-FAST-NEXT: retq
1401 ; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1402 ; AVX512VL: # %bb.0:
1403 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1404 ; AVX512VL-NEXT: retq
1405 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1406 ret <4 x i32> %shuffle
; Mask <7,0,1,2>: b[3] followed by a[0..2]. The SSSE3, SSE4.1 and AVX checks
; expect a single (v)palignr; SSE2/SSE3 expect a two-shufps sequence.
1409 define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1410 ; SSE2-LABEL: shuffle_v4i32_7012:
1412 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1413 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1414 ; SSE2-NEXT: movaps %xmm1, %xmm0
1417 ; SSE3-LABEL: shuffle_v4i32_7012:
1419 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1420 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1421 ; SSE3-NEXT: movaps %xmm1, %xmm0
1424 ; SSSE3-LABEL: shuffle_v4i32_7012:
1426 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1429 ; SSE41-LABEL: shuffle_v4i32_7012:
1431 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1434 ; AVX-LABEL: shuffle_v4i32_7012:
1436 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1438 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1439 ret <4 x i32> %shuffle
; Mask <6,7,0,1>: the high half of %b followed by the low half of %a. The SSSE3,
; SSE4.1 and AVX checks expect a single (v)palignr; SSE2/SSE3 expect one shufps.
1442 define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1443 ; SSE2-LABEL: shuffle_v4i32_6701:
1445 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1446 ; SSE2-NEXT: movaps %xmm1, %xmm0
1449 ; SSE3-LABEL: shuffle_v4i32_6701:
1451 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1452 ; SSE3-NEXT: movaps %xmm1, %xmm0
1455 ; SSSE3-LABEL: shuffle_v4i32_6701:
1457 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1460 ; SSE41-LABEL: shuffle_v4i32_6701:
1462 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1465 ; AVX-LABEL: shuffle_v4i32_6701:
1467 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1469 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1470 ret <4 x i32> %shuffle
; Mask <5,6,7,0>: b[1..3] followed by a[0]. The SSSE3, SSE4.1 and AVX checks
; expect a single (v)palignr; SSE2/SSE3 expect a two-shufps sequence.
1473 define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1474 ; SSE2-LABEL: shuffle_v4i32_5670:
1476 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1477 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1478 ; SSE2-NEXT: movaps %xmm1, %xmm0
1481 ; SSE3-LABEL: shuffle_v4i32_5670:
1483 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1484 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1485 ; SSE3-NEXT: movaps %xmm1, %xmm0
1488 ; SSSE3-LABEL: shuffle_v4i32_5670:
1490 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1493 ; SSE41-LABEL: shuffle_v4i32_5670:
1495 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1498 ; AVX-LABEL: shuffle_v4i32_5670:
1500 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1502 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1503 ret <4 x i32> %shuffle
; Mask <1,2,3,4>: a[1..3] followed by b[0]. The SSSE3, SSE4.1 and AVX checks
; expect a (v)palignr; SSE2/SSE3 expect a two-shufps sequence.
1506 define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1507 ; SSE2-LABEL: shuffle_v4i32_1234:
1509 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1510 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1513 ; SSE3-LABEL: shuffle_v4i32_1234:
1515 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1516 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1519 ; SSSE3-LABEL: shuffle_v4i32_1234:
1521 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1522 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1525 ; SSE41-LABEL: shuffle_v4i32_1234:
1527 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1528 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1531 ; AVX-LABEL: shuffle_v4i32_1234:
1533 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1535 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1536 ret <4 x i32> %shuffle
; Mask <2,3,4,5>: the high half of %a followed by the low half of %b. The SSSE3,
; SSE4.1 and AVX checks expect a (v)palignr; SSE2/SSE3 expect one shufps.
1539 define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1540 ; SSE2-LABEL: shuffle_v4i32_2345:
1542 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1545 ; SSE3-LABEL: shuffle_v4i32_2345:
1547 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1550 ; SSSE3-LABEL: shuffle_v4i32_2345:
1552 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1553 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1556 ; SSE41-LABEL: shuffle_v4i32_2345:
1558 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1559 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1562 ; AVX-LABEL: shuffle_v4i32_2345:
1564 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1566 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1567 ret <4 x i32> %shuffle
; %s1 duplicates a[2] into lane 3 so that %s2 (mask <3,4,5,6>) evaluates to
; <a[2], b[0], b[1], b[2]>. The AVX512VL checks expect a vpermi2d with a constant
; index vector; the SSSE3/SSE4.1/AVX1OR2 checks expect a splat followed by palignr.
1571 define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1572 ; SSE2-LABEL: shuffle_v4i32_2456:
1574 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1575 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1578 ; SSE3-LABEL: shuffle_v4i32_2456:
1580 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1581 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1584 ; SSSE3-LABEL: shuffle_v4i32_2456:
1586 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1587 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1588 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1591 ; SSE41-LABEL: shuffle_v4i32_2456:
1593 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1594 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1595 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1598 ; AVX1OR2-LABEL: shuffle_v4i32_2456:
1600 ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1601 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1602 ; AVX1OR2-NEXT: retq
1604 ; AVX512VL-LABEL: shuffle_v4i32_2456:
1605 ; AVX512VL: # %bb.0:
1606 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,1,2]
1607 ; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2
1608 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
1609 ; AVX512VL-NEXT: retq
1610 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1611 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Mask <4,0,undef,1>: lane 2 is unconstrained, so the checks expect a single
; (v)unpcklps that interleaves the low halves of %b and %a.
1615 define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1616 ; SSE-LABEL: shuffle_v4i32_40u1:
1618 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1619 ; SSE-NEXT: movaps %xmm1, %xmm0
1622 ; AVX-LABEL: shuffle_v4i32_40u1:
1624 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1626 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1627 ret <4 x i32> %shuffle
; Mask <3,4,5,6>: a[3] followed by b[0..2]. The SSSE3, SSE4.1 and AVX checks
; expect a (v)palignr; SSE2/SSE3 expect a two-shufps sequence.
1630 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1631 ; SSE2-LABEL: shuffle_v4i32_3456:
1633 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1634 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1637 ; SSE3-LABEL: shuffle_v4i32_3456:
1639 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1640 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1643 ; SSSE3-LABEL: shuffle_v4i32_3456:
1645 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1646 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1649 ; SSE41-LABEL: shuffle_v4i32_3456:
1651 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1652 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1655 ; AVX-LABEL: shuffle_v4i32_3456:
1657 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1659 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1660 ret <4 x i32> %shuffle
; Mask <0,undef,1,undef>: only a[0] and a[1] are constrained and %b is never
; selected. The SSE4.1 and AVX checks expect (v)pmovzxdq; older SSE expects pshufd.
1663 define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1664 ; SSE2-LABEL: shuffle_v4i32_0u1u:
1666 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1669 ; SSE3-LABEL: shuffle_v4i32_0u1u:
1671 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1674 ; SSSE3-LABEL: shuffle_v4i32_0u1u:
1676 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1679 ; SSE41-LABEL: shuffle_v4i32_0u1u:
1681 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1684 ; AVX-LABEL: shuffle_v4i32_0u1u:
1686 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1688 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1689 ret <4 x i32> %shuffle
; Interleaves a[0] and a[1] with zeros: <a[0], 0, a[1], 0>. The SSE4.1 and AVX
; checks expect (v)pmovzxdq; older SSE expects unpcklps with a zeroed register.
1692 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1693 ; SSE2-LABEL: shuffle_v4i32_0z1z:
1695 ; SSE2-NEXT: xorps %xmm1, %xmm1
1696 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1699 ; SSE3-LABEL: shuffle_v4i32_0z1z:
1701 ; SSE3-NEXT: xorps %xmm1, %xmm1
1702 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1705 ; SSSE3-LABEL: shuffle_v4i32_0z1z:
1707 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1708 ; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1711 ; SSE41-LABEL: shuffle_v4i32_0z1z:
1713 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1716 ; AVX-LABEL: shuffle_v4i32_0z1z:
1718 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1720 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1721 ret <4 x i32> %shuffle
; Keeps a[0..1], zeroes lane 2 and leaves lane 3 undef; the checks expect a single
; (v)movq that zeroes the upper 64 bits.
1724 define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1725 ; SSE-LABEL: shuffle_v4i32_01zu:
1727 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1730 ; AVX-LABEL: shuffle_v4i32_01zu:
1732 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1734 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1735 ret <4 x i32> %shuffle
; Zeroes lane 1 and keeps lanes 0, 2 and 3 of %a. Pre-SSE4.1 checks expect an andps
; with a constant-pool mask; SSE4.1 and AVX expect a blend with a zeroed register.
1738 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1739 ; SSE2-LABEL: shuffle_v4i32_0z23:
1741 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1744 ; SSE3-LABEL: shuffle_v4i32_0z23:
1746 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1749 ; SSSE3-LABEL: shuffle_v4i32_0z23:
1751 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1754 ; SSE41-LABEL: shuffle_v4i32_0z23:
1756 ; SSE41-NEXT: xorps %xmm1, %xmm1
1757 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1760 ; AVX-LABEL: shuffle_v4i32_0z23:
1762 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1763 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1765 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1766 ret <4 x i32> %shuffle
; Zeroes lane 2 and keeps lanes 0, 1 and 3 of %a. Pre-SSE4.1 checks expect an andps
; with a constant-pool mask; SSE4.1 and AVX expect a blend with a zeroed register.
1769 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1770 ; SSE2-LABEL: shuffle_v4i32_01z3:
1772 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1775 ; SSE3-LABEL: shuffle_v4i32_01z3:
1777 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1780 ; SSSE3-LABEL: shuffle_v4i32_01z3:
1782 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1785 ; SSE41-LABEL: shuffle_v4i32_01z3:
1787 ; SSE41-NEXT: xorps %xmm1, %xmm1
1788 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1791 ; AVX-LABEL: shuffle_v4i32_01z3:
1793 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1794 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1796 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1797 ret <4 x i32> %shuffle
; Zeroes lane 3 and keeps lanes 0-2 of %a. Pre-SSE4.1 checks expect an andps with
; a constant-pool mask; SSE4.1 and AVX expect a blend with a zeroed register.
1800 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1801 ; SSE2-LABEL: shuffle_v4i32_012z:
1803 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1806 ; SSE3-LABEL: shuffle_v4i32_012z:
1808 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1811 ; SSSE3-LABEL: shuffle_v4i32_012z:
1813 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1816 ; SSE41-LABEL: shuffle_v4i32_012z:
1818 ; SSE41-NEXT: xorps %xmm1, %xmm1
1819 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1822 ; AVX-LABEL: shuffle_v4i32_012z:
1824 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1825 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1827 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1828 ret <4 x i32> %shuffle
; Zeroes lanes 1 and 2 and keeps lanes 0 and 3 of %a. Pre-SSE4.1 checks expect an
; andps with a constant-pool mask; SSE4.1 and AVX expect a blend with a zeroed register.
1831 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1832 ; SSE2-LABEL: shuffle_v4i32_0zz3:
1834 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1837 ; SSE3-LABEL: shuffle_v4i32_0zz3:
1839 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1842 ; SSSE3-LABEL: shuffle_v4i32_0zz3:
1844 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1847 ; SSE41-LABEL: shuffle_v4i32_0zz3:
1849 ; SSE41-NEXT: xorps %xmm1, %xmm1
1850 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1853 ; AVX-LABEL: shuffle_v4i32_0zz3:
1855 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1856 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1858 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1859 ret <4 x i32> %shuffle
; A <4 x i32> shuffle (mask <1,5,0,4>) followed by a swap of the two 64-bit halves
; done through a <2 x double> bitcast. The net result is <a[0], b[0], a[1], b[1]>,
; and the checks expect the pair to fold into a single (v)unpcklps.
1862 define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1863 ; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1865 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1868 ; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1870 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1872 %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1873 %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1874 %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1875 %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1876 ret <4 x i32> %bitcast32
; Splats b[0] into the low 64 bits, then combines that with the low 64 bits of %a
; through a <2 x double> shuffle, giving <b[0], b[0], a[0], a[1]>. The checks expect
; the whole chain to fold into a single (v)shufps.
1879 define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1880 ; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1882 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1883 ; SSE-NEXT: movaps %xmm1, %xmm0
1886 ; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1888 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
1890 %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1891 %2 = bitcast <4 x i32> %1 to <2 x double>
1892 %3 = bitcast <4 x float> %a to <2 x double>
1893 %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1894 %5 = bitcast <2 x double> %4 to <4 x float>
; Splats a[0] into lanes 0-1 and takes lanes 2-3 from b[0..1] (with %b bitcast to
; float), giving <a[0], a[0], b[0], b[1]>. The checks expect a single (v)shufps.
1898 define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1899 ; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1901 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1904 ; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1906 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1908 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1909 %2 = bitcast <4 x i32> %b to <4 x float>
1910 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
; Lane select written as and/or with constant <4 x i32> masks instead of a
; shufflevector: lanes 1-2 are kept from %a and lanes 0 and 3 from %b. The SSE4.1
; and AVX checks expect this to be recognised as a single blend.
1914 define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1915 ; SSE2-LABEL: mask_v4f32_4127:
1917 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1918 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1919 ; SSE2-NEXT: movaps %xmm1, %xmm0
1922 ; SSE3-LABEL: mask_v4f32_4127:
1924 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1925 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1926 ; SSE3-NEXT: movaps %xmm1, %xmm0
1929 ; SSSE3-LABEL: mask_v4f32_4127:
1931 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1932 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1933 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1936 ; SSE41-LABEL: mask_v4f32_4127:
1938 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1941 ; AVX-LABEL: mask_v4f32_4127:
1943 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1945 %1 = bitcast <4 x float> %a to <4 x i32>
1946 %2 = bitcast <4 x float> %b to <4 x i32>
1947 %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
1948 %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
1949 %5 = or <4 x i32> %4, %3
1950 %6 = bitcast <4 x i32> %5 to <4 x float>
; Lane select written as and/or on <2 x i64> with masks that split the second
; 64-bit element at the 32-bit boundary. Per the SSE4.1/AVX checks this is
; expected to lower to one blend taking lanes 0-2 from %b and lane 3 from %a.
1954 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1955 ; SSE2-LABEL: mask_v4f32_0127:
1957 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1958 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1959 ; SSE2-NEXT: movaps %xmm1, %xmm0
1962 ; SSE3-LABEL: mask_v4f32_0127:
1964 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1965 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1966 ; SSE3-NEXT: movaps %xmm1, %xmm0
1969 ; SSSE3-LABEL: mask_v4f32_0127:
1971 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1972 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1973 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1976 ; SSE41-LABEL: mask_v4f32_0127:
1978 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1981 ; AVX-LABEL: mask_v4f32_0127:
1983 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1985 %1 = bitcast <4 x float> %a to <2 x i64>
1986 %2 = bitcast <4 x float> %b to <2 x i64>
1987 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1988 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1989 %5 = or <2 x i64> %4, %3
1990 %6 = bitcast <2 x i64> %5 to <4 x float>
; Integer-typed variant of the <2 x i64> and/or lane select: the masks split the
; second 64-bit element at the 32-bit boundary. Per the SSE4.1/AVX checks this is
; expected to lower to one blend taking lanes 0-2 from %b and lane 3 from %a.
1994 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1995 ; SSE2-LABEL: mask_v4i32_0127:
1997 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1998 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1999 ; SSE2-NEXT: movaps %xmm1, %xmm0
2002 ; SSE3-LABEL: mask_v4i32_0127:
2004 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2005 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2006 ; SSE3-NEXT: movaps %xmm1, %xmm0
2009 ; SSSE3-LABEL: mask_v4i32_0127:
2011 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2012 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2013 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2016 ; SSE41-LABEL: mask_v4i32_0127:
2018 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2021 ; AVX-LABEL: mask_v4i32_0127:
2023 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2025 %1 = bitcast <4 x i32> %a to <2 x i64>
2026 %2 = bitcast <4 x i32> %b to <2 x i64>
2027 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
2028 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
2029 %5 = or <2 x i64> %4, %3
2030 %6 = bitcast <2 x i64> %5 to <4 x i32>
; Loads a <2 x float> with align 1 and repeats it into both halves of a <4 x float>
; (mask <0,1,0,1>). The SSE3-and-later checks expect a single (v)movddup from
; memory; SSE2 expects a movsd load followed by movlhps.
2034 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
2035 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
2037 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2038 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2041 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2043 ; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2046 ; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2048 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2051 ; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
2053 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2056 ; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2058 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2060 %1 = load <2 x float>, <2 x float>* %x, align 1
2061 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Extracts lane 3 of %a1 and inserts it into lane 0 of %a0. The SSE4.1 and AVX
; checks expect an extractps/pinsrd pair through %eax; older SSE expects a
; pshufd splat, a round trip through %eax and a movss.
2065 define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
2066 ; SSE2-LABEL: extract3_insert0_v4i32_7123:
2068 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2069 ; SSE2-NEXT: movd %xmm1, %eax
2070 ; SSE2-NEXT: movd %eax, %xmm1
2071 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2074 ; SSE3-LABEL: extract3_insert0_v4i32_7123:
2076 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2077 ; SSE3-NEXT: movd %xmm1, %eax
2078 ; SSE3-NEXT: movd %eax, %xmm1
2079 ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2082 ; SSSE3-LABEL: extract3_insert0_v4i32_7123:
2084 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2085 ; SSSE3-NEXT: movd %xmm1, %eax
2086 ; SSSE3-NEXT: movd %eax, %xmm1
2087 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2090 ; SSE41-LABEL: extract3_insert0_v4i32_7123:
2092 ; SSE41-NEXT: extractps $3, %xmm1, %eax
2093 ; SSE41-NEXT: pinsrd $0, %eax, %xmm0
2096 ; AVX-LABEL: extract3_insert0_v4i32_7123:
2098 ; AVX-NEXT: vextractps $3, %xmm1, %eax
2099 ; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
2101 %1 = extractelement <4 x i32> %a1, i32 3
2102 %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
; Extracts lane 3 of %a1 and inserts it into lane 3 of %a0 (same lane in and out).
; The SSE4.1 and AVX checks expect a single blend; older SSE expects two shufps.
2106 define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
2107 ; SSE2-LABEL: extract3_insert3_v4i32_0127:
2109 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2110 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2113 ; SSE3-LABEL: extract3_insert3_v4i32_0127:
2115 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2116 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2119 ; SSSE3-LABEL: extract3_insert3_v4i32_0127:
2121 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2122 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2125 ; SSE41-LABEL: extract3_insert3_v4i32_0127:
2127 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2130 ; AVX-LABEL: extract3_insert3_v4i32_0127:
2132 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2134 %1 = extractelement <4 x i32> %a1, i32 3
2135 %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
; Places the scalar i32 argument in element 0 and takes elements 1..3 from a
; zero vector (mask <0,5,6,7>), i.e. the scalar followed by three zeros.
2139 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2140 ; SSE-LABEL: insert_reg_and_zero_v4i32:
2142 ; SSE-NEXT: movd %edi, %xmm0
2145 ; AVX-LABEL: insert_reg_and_zero_v4i32:
2147 ; AVX-NEXT: vmovd %edi, %xmm0
2149 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2150 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2151 ret <4 x i32> %shuffle
; Same shape as the register variant, but the i32 placed in element 0 is
; loaded from %ptr; elements 1..3 come from a zero vector (mask <0,5,6,7>).
2154 define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
2155 ; SSE-LABEL: insert_mem_and_zero_v4i32:
2157 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2160 ; AVX-LABEL: insert_mem_and_zero_v4i32:
2162 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2164 %a = load i32, i32* %ptr
2165 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2166 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2167 ret <4 x i32> %shuffle
; Places the scalar float argument in element 0 and takes elements 1..3 from
; a zero vector (mask <0,5,6,7>).
2170 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2171 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
2173 ; SSE2-NEXT: xorps %xmm1, %xmm1
2174 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2175 ; SSE2-NEXT: movaps %xmm1, %xmm0
2178 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
2180 ; SSE3-NEXT: xorps %xmm1, %xmm1
2181 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2182 ; SSE3-NEXT: movaps %xmm1, %xmm0
2185 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2187 ; SSSE3-NEXT: xorps %xmm1, %xmm1
2188 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2189 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2192 ; SSE41-LABEL: insert_reg_and_zero_v4f32:
2194 ; SSE41-NEXT: xorps %xmm1, %xmm1
2195 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2198 ; AVX-LABEL: insert_reg_and_zero_v4f32:
2200 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
2201 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2203 %v = insertelement <4 x float> undef, float %a, i32 0
2204 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2205 ret <4 x float> %shuffle
; Loads a float from %ptr into element 0 and takes elements 1..3 from a zero
; vector (mask <0,5,6,7>).
2208 define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
2209 ; SSE-LABEL: insert_mem_and_zero_v4f32:
2211 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2214 ; AVX-LABEL: insert_mem_and_zero_v4f32:
2216 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2218 %a = load float, float* %ptr
2219 %v = insertelement <4 x float> undef, float %a, i32 0
2220 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2221 ret <4 x float> %shuffle
; Bitcasts the i64 argument to <2 x i32>, widens it to four lanes, and uses it
; as the low two elements of the result; the high two elements come from %b
; (mask <0,1,6,7>).
2224 define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2225 ; SSE2-LABEL: insert_reg_lo_v4i32:
2227 ; SSE2-NEXT: movq %rdi, %xmm1
2228 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2231 ; SSE3-LABEL: insert_reg_lo_v4i32:
2233 ; SSE3-NEXT: movq %rdi, %xmm1
2234 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2237 ; SSSE3-LABEL: insert_reg_lo_v4i32:
2239 ; SSSE3-NEXT: movq %rdi, %xmm1
2240 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2243 ; SSE41-LABEL: insert_reg_lo_v4i32:
2245 ; SSE41-NEXT: movq %rdi, %xmm1
2246 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2249 ; AVX1-LABEL: insert_reg_lo_v4i32:
2251 ; AVX1-NEXT: vmovq %rdi, %xmm1
2252 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2255 ; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2256 ; AVX2OR512VL: # %bb.0:
2257 ; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
2258 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2259 ; AVX2OR512VL-NEXT: retq
2260 %a.cast = bitcast i64 %a to <2 x i32>
2261 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2262 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2263 ret <4 x i32> %shuffle
; Loads a <2 x i32> from %ptr, widens it to four lanes, and uses it as the low
; two elements of the result; the high two elements come from %b
; (mask <0,1,6,7>).
2266 define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2267 ; SSE2-LABEL: insert_mem_lo_v4i32:
2269 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2272 ; SSE3-LABEL: insert_mem_lo_v4i32:
2274 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2277 ; SSSE3-LABEL: insert_mem_lo_v4i32:
2279 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2282 ; SSE41-LABEL: insert_mem_lo_v4i32:
2284 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2285 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2288 ; AVX-LABEL: insert_mem_lo_v4i32:
2290 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2291 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2293 %a = load <2 x i32>, <2 x i32>* %ptr
2294 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2295 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2296 ret <4 x i32> %shuffle
; Bitcasts the i64 argument to <2 x i32> and places it in the high two
; elements of the result; the low two elements are elements 0 and 1 of %b
; (mask <4,5,0,1>).
2299 define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2300 ; SSE-LABEL: insert_reg_hi_v4i32:
2302 ; SSE-NEXT: movq %rdi, %xmm1
2303 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2306 ; AVX-LABEL: insert_reg_hi_v4i32:
2308 ; AVX-NEXT: vmovq %rdi, %xmm1
2309 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2311 %a.cast = bitcast i64 %a to <2 x i32>
2312 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2313 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2314 ret <4 x i32> %shuffle
; Loads a <2 x i32> from %ptr and places it in the high two elements of the
; result; the low two elements are elements 0 and 1 of %b (mask <4,5,0,1>).
2317 define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2318 ; SSE-LABEL: insert_mem_hi_v4i32:
2320 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2321 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2324 ; AVX-LABEL: insert_mem_hi_v4i32:
2326 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2327 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2329 %a = load <2 x i32>, <2 x i32>* %ptr
2330 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2331 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2332 ret <4 x i32> %shuffle
; Bitcasts the double argument to <2 x float>, widens it to four lanes, and
; uses it as the low two elements of the result; the high two elements come
; from %b (mask <0,1,6,7>).
2335 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2336 ; SSE2-LABEL: insert_reg_lo_v4f32:
2338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2341 ; SSE3-LABEL: insert_reg_lo_v4f32:
2343 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2346 ; SSSE3-LABEL: insert_reg_lo_v4f32:
2348 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2351 ; SSE41-LABEL: insert_reg_lo_v4f32:
2353 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2356 ; AVX-LABEL: insert_reg_lo_v4f32:
2358 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2360 %a.cast = bitcast double %a to <2 x float>
2361 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2362 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2363 ret <4 x float> %shuffle
; Loads a <2 x float> from %ptr, widens it to four lanes, and uses it as the
; low two elements of the result; the high two elements come from %b
; (mask <0,1,6,7>).
2366 define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2367 ; SSE-LABEL: insert_mem_lo_v4f32:
2369 ; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2372 ; AVX-LABEL: insert_mem_lo_v4f32:
2374 ; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2376 %a = load <2 x float>, <2 x float>* %ptr
2377 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2378 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2379 ret <4 x float> %shuffle
; Bitcasts the double argument to <2 x float> and places it in the high two
; elements of the result; the low two elements are elements 0 and 1 of %b
; (mask <4,5,0,1>).
2382 define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2383 ; SSE-LABEL: insert_reg_hi_v4f32:
2385 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2386 ; SSE-NEXT: movaps %xmm1, %xmm0
2389 ; AVX-LABEL: insert_reg_hi_v4f32:
2391 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2393 %a.cast = bitcast double %a to <2 x float>
2394 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2395 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2396 ret <4 x float> %shuffle
; Loads a <2 x float> from %ptr and places it in the high two elements of the
; result; the low two elements are elements 0 and 1 of %b (mask <4,5,0,1>).
2399 define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2400 ; SSE-LABEL: insert_mem_hi_v4f32:
2402 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2405 ; AVX-LABEL: insert_mem_hi_v4f32:
2407 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2409 %a = load <2 x float>, <2 x float>* %ptr
2410 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2411 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2412 ret <4 x float> %shuffle
; Loads a <4 x float> from %ptr and reverses its element order
; (mask <3,2,1,0>).
2416 define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
2417 ; SSE-LABEL: shuffle_mem_v4f32_3210:
2419 ; SSE-NEXT: movaps (%rdi), %xmm0
2420 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2423 ; AVX-LABEL: shuffle_mem_v4f32_3210:
2425 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2427 %a = load <4 x float>, <4 x float>* %ptr
2428 %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2429 ret <4 x float> %shuffle
; Loads an i32 from %ptr, inserts it into element 0 of a zero vector, then
; splats element 0 across all four lanes (all-zero shuffle mask).
2432 define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
2433 ; SSE-LABEL: insert_dup_mem_v4i32:
2435 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2436 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2439 ; AVX-LABEL: insert_dup_mem_v4i32:
2441 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
2443 %tmp = load i32, i32* %ptr, align 4
2444 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2445 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Loads a <2 x float> from %p0 and derives two vectors from it:
;   %3 interleaves the two loaded floats with 0.0 (x0, 0, x1, 0) and is stored
;      to %p1;
;   %4 splats loaded element 0 across four lanes.
2450 define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) {
2451 ; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2453 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2454 ; SSE-NEXT: xorps %xmm1, %xmm1
2455 ; SSE-NEXT: movaps %xmm0, %xmm2
2456 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2457 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2458 ; SSE-NEXT: movaps %xmm2, (%rsi)
2461 ; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2463 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2464 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
2465 ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2466 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2467 ; AVX1-NEXT: vmovaps %xmm1, (%rsi)
2470 ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2471 ; AVX2OR512VL: # %bb.0:
2472 ; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2473 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
2474 ; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2475 ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0
2476 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi)
2477 ; AVX2OR512VL-NEXT: retq
2478 %1 = load <2 x float>, <2 x float>* %p0
2479 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2480 %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2481 %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2482 store <4 x float> %3, <4 x float>* %p1
2487 ; Shuffle to logical bit shifts
; Mask <4,0,4,undef> against a zero vector yields (zero, a[0], zero, undef);
; the checks expect this as a 32-bit left shift of each 64-bit lane.
2490 define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2491 ; SSE-LABEL: shuffle_v4i32_z0zX:
2493 ; SSE-NEXT: psllq $32, %xmm0
2496 ; AVX-LABEL: shuffle_v4i32_z0zX:
2498 ; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
2500 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2501 ret <4 x i32> %shuffle
; Mask <1,4,3,4> against a zero vector yields (a[1], zero, a[3], zero); the
; checks expect this as a 32-bit right shift of each 64-bit lane.
2504 define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2505 ; SSE-LABEL: shuffle_v4i32_1z3z:
2507 ; SSE-NEXT: psrlq $32, %xmm0
2510 ; AVX-LABEL: shuffle_v4i32_1z3z:
2512 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
2514 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2515 ret <4 x i32> %shuffle
; Loads a <4 x float> from %pb with align 1 and combines elements 0 and 1 of
; %a (low half) with elements 0 and 1 of the loaded vector (high half),
; mask <0,1,4,5>.
2518 define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
2519 ; SSE-LABEL: shuffle_mem_v4f32_0145:
2521 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2524 ; AVX-LABEL: shuffle_mem_v4f32_0145:
2526 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2528 %b = load <4 x float>, <4 x float>* %pb, align 1
2529 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2530 ret <4 x float> %shuffle
; Loads a <4 x float> from %pb with align 1 and combines elements 0 and 1 of
; the loaded vector (low half) with elements 2 and 3 of %a (high half),
; mask <4,5,2,3>.
2533 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
2534 ; SSE2-LABEL: shuffle_mem_v4f32_4523:
2536 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2539 ; SSE3-LABEL: shuffle_mem_v4f32_4523:
2541 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2544 ; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2546 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2549 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
2551 ; SSE41-NEXT: movups (%rdi), %xmm1
2552 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2555 ; AVX-LABEL: shuffle_mem_v4f32_4523:
2557 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2559 %b = load <4 x float>, <4 x float>* %pb, align 1
2560 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2561 ret <4 x float> %shuffle
; Loads a <4 x float> from %a1 and shuffles it as the FIRST operand with %a0
; as the second. Mask <0,6,2,4> selects (loaded[0], a0[2], loaded[2], a0[0]).
2564 define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
2565 ; SSE-LABEL: shuffle_mem_v4f32_0624:
2567 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2568 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2571 ; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2573 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2574 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2575 ; AVX1OR2-NEXT: retq
2577 ; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
2578 ; AVX512VL: # %bb.0:
2579 ; AVX512VL-NEXT: vmovaps (%rdi), %xmm2
2580 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4]
2581 ; AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1
2582 ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
2583 ; AVX512VL-NEXT: retq
2584 %1 = load <4 x float>, <4 x float>* %a1
2585 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
2589 define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
2590 ; SSE-LABEL: shuffle_mem_v4f32_4760:
2592 ; SSE-NEXT: movaps %xmm0, %xmm1
2593 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
2594 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2597 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
2599 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
2600 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2601 ; AVX1OR2-NEXT: retq
2603 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
2604 ; AVX512VL: # %bb.0:
2605 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4]
2606 ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0
2607 ; AVX512VL-NEXT: retq
2608 %1 = load <4 x float>, <4 x float>* %a1
2609 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>