1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
10 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
11 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
; Single-source v4i32 shuffle: result lanes are %a[0,0,0,1]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
13 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
14 ; SSE-LABEL: shuffle_v4i32_0001:
16 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
19 ; AVX-LABEL: shuffle_v4i32_0001:
21 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
23 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
24 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[0,0,2,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
26 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
27 ; SSE-LABEL: shuffle_v4i32_0020:
29 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
32 ; AVX-LABEL: shuffle_v4i32_0020:
34 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
36 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
37 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[0,1,1,2]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
39 define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
40 ; SSE-LABEL: shuffle_v4i32_0112:
42 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
45 ; AVX-LABEL: shuffle_v4i32_0112:
47 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
49 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
50 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[0,3,0,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
52 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
53 ; SSE-LABEL: shuffle_v4i32_0300:
55 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
58 ; AVX-LABEL: shuffle_v4i32_0300:
60 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
62 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
63 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[1,0,0,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
65 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
66 ; SSE-LABEL: shuffle_v4i32_1000:
68 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
71 ; AVX-LABEL: shuffle_v4i32_1000:
73 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
75 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
76 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[2,2,0,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
78 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
79 ; SSE-LABEL: shuffle_v4i32_2200:
81 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
84 ; AVX-LABEL: shuffle_v4i32_2200:
86 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
88 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
89 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[3,3,3,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
91 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
92 ; SSE-LABEL: shuffle_v4i32_3330:
94 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
97 ; AVX-LABEL: shuffle_v4i32_3330:
99 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
101 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
102 ret <4 x i32> %shuffle
; Single-source v4i32 lane reversal: result lanes are %a[3,2,1,0]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
104 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
105 ; SSE-LABEL: shuffle_v4i32_3210:
107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
110 ; AVX-LABEL: shuffle_v4i32_3210:
112 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
114 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
115 ret <4 x i32> %shuffle
; Single-source v4i32 shuffle: result lanes are %a[2,1,2,1]; %b is passed but never selected.
; The checks expect pshufd under the SSE prefix and vshufps under the AVX prefix.
118 define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
119 ; SSE-LABEL: shuffle_v4i32_2121:
121 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
124 ; AVX-LABEL: shuffle_v4i32_2121:
126 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,1]
128 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
129 ret <4 x i32> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[0,0,0,1]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
132 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
133 ; SSE-LABEL: shuffle_v4f32_0001:
135 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
138 ; AVX-LABEL: shuffle_v4f32_0001:
140 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
142 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
143 ret <4 x float> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[0,0,2,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
145 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
146 ; SSE-LABEL: shuffle_v4f32_0020:
148 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
151 ; AVX-LABEL: shuffle_v4f32_0020:
153 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
155 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
156 ret <4 x float> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[0,3,0,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
158 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
159 ; SSE-LABEL: shuffle_v4f32_0300:
161 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
164 ; AVX-LABEL: shuffle_v4f32_0300:
166 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
168 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
169 ret <4 x float> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[1,0,0,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
171 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
172 ; SSE-LABEL: shuffle_v4f32_1000:
174 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
177 ; AVX-LABEL: shuffle_v4f32_1000:
179 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
181 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
182 ret <4 x float> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[2,2,0,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
184 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
185 ; SSE-LABEL: shuffle_v4f32_2200:
187 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
190 ; AVX-LABEL: shuffle_v4f32_2200:
192 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
194 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
195 ret <4 x float> %shuffle
; Single-source v4f32 shuffle: result lanes are %a[3,3,3,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
197 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
198 ; SSE-LABEL: shuffle_v4f32_3330:
200 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
203 ; AVX-LABEL: shuffle_v4f32_3330:
205 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
207 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
208 ret <4 x float> %shuffle
; Single-source v4f32 lane reversal: result lanes are %a[3,2,1,0]; %b is passed but never selected.
; The checks expect shufps under the SSE prefix and vshufps under the AVX prefix.
210 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
211 ; SSE-LABEL: shuffle_v4f32_3210:
213 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
216 ; AVX-LABEL: shuffle_v4f32_3210:
218 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
220 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
221 ret <4 x float> %shuffle
; Single-source v4f32 shuffle duplicating the low two lanes: result is %a[0,0,1,1]; %b is never selected.
; The checks expect unpcklps under the SSE prefix and vshufps under the AVX prefix.
223 define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
224 ; SSE-LABEL: shuffle_v4f32_0011:
226 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
229 ; AVX-LABEL: shuffle_v4f32_0011:
231 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
233 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
234 ret <4 x float> %shuffle
; Single-source v4f32 shuffle duplicating the high two lanes: result is %a[2,2,3,3]; %b is never selected.
; The checks expect unpckhps under the SSE prefix and vshufps under the AVX prefix.
236 define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
237 ; SSE-LABEL: shuffle_v4f32_2233:
239 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
242 ; AVX-LABEL: shuffle_v4f32_2233:
244 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
246 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
247 ret <4 x float> %shuffle
; Single-source v4f32 shuffle duplicating the even lanes: result is %a[0,0,2,2]; %b is never selected.
; The checks expect shufps for the SSE2 run, movsldup for the SSE3/SSSE3/SSE41 runs,
; and vmovsldup under the AVX prefix.
249 define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
250 ; SSE2-LABEL: shuffle_v4f32_0022:
252 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
255 ; SSE3-LABEL: shuffle_v4f32_0022:
257 ; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
260 ; SSSE3-LABEL: shuffle_v4f32_0022:
262 ; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
265 ; SSE41-LABEL: shuffle_v4f32_0022:
267 ; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
270 ; AVX-LABEL: shuffle_v4f32_0022:
272 ; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
274 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
275 ret <4 x float> %shuffle
; Single-source v4f32 shuffle duplicating the odd lanes: result is %a[1,1,3,3]; %b is never selected.
; The checks expect shufps for the SSE2 run, movshdup for the SSE3/SSSE3/SSE41 runs,
; and vmovshdup under the AVX prefix.
277 define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
278 ; SSE2-LABEL: shuffle_v4f32_1133:
280 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
283 ; SSE3-LABEL: shuffle_v4f32_1133:
285 ; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
288 ; SSSE3-LABEL: shuffle_v4f32_1133:
290 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
293 ; SSE41-LABEL: shuffle_v4f32_1133:
295 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
298 ; AVX-LABEL: shuffle_v4f32_1133:
300 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
302 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
303 ret <4 x float> %shuffle
; Two-source v4f32 shuffle: mask <0,1,4,5> concatenates the low halves, giving %a[0,1] then %b[0,1].
; The checks expect movlhps under the SSE prefix and vmovlhps under the AVX prefix.
306 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
307 ; SSE-LABEL: shuffle_v4f32_0145:
309 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
312 ; AVX-LABEL: shuffle_v4f32_0145:
314 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
316 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
317 ret <4 x float> %shuffle
; Two-source v4f32 shuffle: mask <6,7,2,3> concatenates the high halves, giving %b[2,3] then %a[2,3].
; The checks expect movhlps under the SSE prefix and vunpckhpd under the AVX prefix.
320 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
321 ; SSE-LABEL: shuffle_v4f32_6723:
323 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
326 ; AVX-LABEL: shuffle_v4f32_6723:
328 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
330 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
331 ret <4 x float> %shuffle
; Two-source v4i32 shuffle: mask <0,1,2,4> keeps %a[0,1,2] and places %b[0] in lane 3.
; Expected lowering per run: two shufps (SSE2/SSE3/SSSE3), pshufd + pblendw (SSE41),
; vshufps + vblendps (AVX1), vbroadcastss + vblendps (AVX2), and a constant index
; vector fed to vpermt2d (AVX512VL).
334 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
335 ; SSE2-LABEL: shuffle_v4i32_0124:
337 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
341 ; SSE3-LABEL: shuffle_v4i32_0124:
343 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
344 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
347 ; SSSE3-LABEL: shuffle_v4i32_0124:
349 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
350 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
353 ; SSE41-LABEL: shuffle_v4i32_0124:
355 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
356 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
359 ; AVX1-LABEL: shuffle_v4i32_0124:
361 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
362 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
365 ; AVX2-LABEL: shuffle_v4i32_0124:
367 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
368 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
371 ; AVX512VL-LABEL: shuffle_v4i32_0124:
373 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
374 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
375 ; AVX512VL-NEXT: retq
376 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
377 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <0,1,4,2> yields %a[0], %a[1], %b[0], %a[2].
; Expected lowering per run: two shufps (SSE2/SSE3/SSSE3), two pshufd + pblendw (SSE41),
; two vshufps + vblendps (AVX1), vbroadcastss + vshufps + vblendps (AVX2), and a
; constant index vector fed to vpermt2d (AVX512VL).
379 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
380 ; SSE2-LABEL: shuffle_v4i32_0142:
382 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
383 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
386 ; SSE3-LABEL: shuffle_v4i32_0142:
388 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
389 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
392 ; SSSE3-LABEL: shuffle_v4i32_0142:
394 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
395 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
398 ; SSE41-LABEL: shuffle_v4i32_0142:
400 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
401 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
402 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
405 ; AVX1-LABEL: shuffle_v4i32_0142:
407 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
408 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
409 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
412 ; AVX2-LABEL: shuffle_v4i32_0142:
414 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
415 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
416 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
419 ; AVX512VL-LABEL: shuffle_v4i32_0142:
421 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
422 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
423 ; AVX512VL-NEXT: retq
424 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
425 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <0,4,1,2> yields %a[0], %b[0], %a[1], %a[2].
; Expected lowering per run: movlhps + shufps + movaps (SSE2/SSE3/SSSE3), two pshufd +
; pblendw (SSE41), two vshufps + vblendps (AVX1), vbroadcastss + vshufps + vblendps
; (AVX2), and a constant index vector fed to vpermt2d (AVX512VL).
427 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
428 ; SSE2-LABEL: shuffle_v4i32_0412:
430 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
431 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
432 ; SSE2-NEXT: movaps %xmm1, %xmm0
435 ; SSE3-LABEL: shuffle_v4i32_0412:
437 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
438 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
439 ; SSE3-NEXT: movaps %xmm1, %xmm0
442 ; SSSE3-LABEL: shuffle_v4i32_0412:
444 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
445 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
446 ; SSSE3-NEXT: movaps %xmm1, %xmm0
449 ; SSE41-LABEL: shuffle_v4i32_0412:
451 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
452 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
453 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
456 ; AVX1-LABEL: shuffle_v4i32_0412:
458 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
459 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
460 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
463 ; AVX2-LABEL: shuffle_v4i32_0412:
465 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
466 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
467 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
470 ; AVX512VL-LABEL: shuffle_v4i32_0412:
472 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
473 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
474 ; AVX512VL-NEXT: retq
475 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
476 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <4,0,1,2> yields %b[0], %a[0], %a[1], %a[2].
; Expected lowering per run: movlhps + shufps + movaps (SSE2/SSE3/SSSE3), pshufd +
; pblendw (SSE41), vshufps + vblendps (shared AVX1OR2 prefix), and a constant index
; vector fed to vpermt2d (AVX512VL).
478 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
479 ; SSE2-LABEL: shuffle_v4i32_4012:
481 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
482 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
483 ; SSE2-NEXT: movaps %xmm1, %xmm0
486 ; SSE3-LABEL: shuffle_v4i32_4012:
488 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
489 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
490 ; SSE3-NEXT: movaps %xmm1, %xmm0
493 ; SSSE3-LABEL: shuffle_v4i32_4012:
495 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
496 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
497 ; SSSE3-NEXT: movaps %xmm1, %xmm0
500 ; SSE41-LABEL: shuffle_v4i32_4012:
502 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
503 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
506 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
508 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
509 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
512 ; AVX512VL-LABEL: shuffle_v4i32_4012:
514 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
515 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
516 ; AVX512VL-NEXT: retq
517 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
518 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <0,1,4,5> concatenates the low halves, giving %a[0,1] then %b[0,1].
; The checks expect movlhps under the SSE prefix and vmovlhps under the AVX prefix.
520 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
521 ; SSE-LABEL: shuffle_v4i32_0145:
523 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
526 ; AVX-LABEL: shuffle_v4i32_0145:
528 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
530 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
531 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <0,4,5,1> yields %a[0], %b[0], %b[1], %a[1].
; Expected lowering: punpckldq + pshufd (SSE prefix), vunpcklps + vshufps (AVX1OR2 prefix),
; and a constant index vector fed to vpermt2d (AVX512VL).
533 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
534 ; SSE-LABEL: shuffle_v4i32_0451:
536 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
540 ; AVX1OR2-LABEL: shuffle_v4i32_0451:
542 ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
543 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
546 ; AVX512VL-LABEL: shuffle_v4i32_0451:
548 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
549 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
550 ; AVX512VL-NEXT: retq
551 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
552 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <4,5,0,1> yields %b[0,1] followed by %a[0,1].
; The checks expect movlhps into xmm1 plus a movaps copy under the SSE prefix, and a
; single vmovlhps with swapped sources under the AVX prefix.
554 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
555 ; SSE-LABEL: shuffle_v4i32_4501:
557 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
558 ; SSE-NEXT: movaps %xmm1, %xmm0
561 ; AVX-LABEL: shuffle_v4i32_4501:
563 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
565 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
566 ret <4 x i32> %shuffle
; Two-source v4i32 shuffle: mask <4,0,1,5> yields %b[0], %a[0], %a[1], %b[1].
; Expected lowering: punpckldq + pshufd (SSE prefix), vunpcklps + vshufps (AVX1OR2 prefix),
; and a constant index vector fed to vpermt2d (AVX512VL).
568 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
569 ; SSE-LABEL: shuffle_v4i32_4015:
571 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
572 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
575 ; AVX1OR2-LABEL: shuffle_v4i32_4015:
577 ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
578 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
581 ; AVX512VL-LABEL: shuffle_v4i32_4015:
583 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
584 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
585 ; AVX512VL-NEXT: retq
586 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
587 ret <4 x i32> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <4,1,2,3> keeps
; %a[0] in lane 0 and takes zero for lanes 1-3.
; Expected lowering: xorps + movss + movaps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
590 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
591 ; SSE2-LABEL: shuffle_v4f32_4zzz:
593 ; SSE2-NEXT: xorps %xmm1, %xmm1
594 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
595 ; SSE2-NEXT: movaps %xmm1, %xmm0
598 ; SSE3-LABEL: shuffle_v4f32_4zzz:
600 ; SSE3-NEXT: xorps %xmm1, %xmm1
601 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
602 ; SSE3-NEXT: movaps %xmm1, %xmm0
605 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
607 ; SSSE3-NEXT: xorps %xmm1, %xmm1
608 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
609 ; SSSE3-NEXT: movaps %xmm1, %xmm0
612 ; SSE41-LABEL: shuffle_v4f32_4zzz:
614 ; SSE41-NEXT: xorps %xmm1, %xmm1
615 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
618 ; AVX-LABEL: shuffle_v4f32_4zzz:
620 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
621 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
623 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
624 ret <4 x float> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <2,4,3,0> places
; %a[0] in lane 1; lanes 0, 2 and 3 come from the zero vector.
; Expected lowering: movq + xorps + shufps (SSE2/SSE3/SSSE3), a single insertps (SSE41),
; a single vinsertps (AVX prefix).
627 define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
628 ; SSE2-LABEL: shuffle_v4f32_z4zz:
630 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
631 ; SSE2-NEXT: xorps %xmm1, %xmm1
632 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
635 ; SSE3-LABEL: shuffle_v4f32_z4zz:
637 ; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
638 ; SSE3-NEXT: xorps %xmm1, %xmm1
639 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
642 ; SSSE3-LABEL: shuffle_v4f32_z4zz:
644 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
645 ; SSSE3-NEXT: xorps %xmm1, %xmm1
646 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
649 ; SSE41-LABEL: shuffle_v4f32_z4zz:
651 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
654 ; AVX-LABEL: shuffle_v4f32_z4zz:
656 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
658 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
659 ret <4 x float> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <0,0,4,0> places
; %a[0] in lane 2; lanes 0, 1 and 3 come from the zero vector.
; Expected lowering: movq + pxor + shufps (SSE2/SSE3/SSSE3), a single insertps (SSE41),
; a single vinsertps (AVX prefix).
662 define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
663 ; SSE2-LABEL: shuffle_v4f32_zz4z:
665 ; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
666 ; SSE2-NEXT: pxor %xmm0, %xmm0
667 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
670 ; SSE3-LABEL: shuffle_v4f32_zz4z:
672 ; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
673 ; SSE3-NEXT: pxor %xmm0, %xmm0
674 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
677 ; SSSE3-LABEL: shuffle_v4f32_zz4z:
679 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
680 ; SSSE3-NEXT: pxor %xmm0, %xmm0
681 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
684 ; SSE41-LABEL: shuffle_v4f32_zz4z:
686 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
689 ; AVX-LABEL: shuffle_v4f32_zz4z:
691 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
693 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
694 ret <4 x float> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <0,undef,undef,4>
; requires zero in lane 0 and %a[0] in lane 3; lanes 1 and 2 are undef.
; Expected lowering: xorps + shufps + movaps (SSE2/SSE3/SSSE3), a single insertps (SSE41),
; a single vinsertps (AVX prefix); the insertps forms zero the undef lanes.
697 define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
698 ; SSE2-LABEL: shuffle_v4f32_zuu4:
700 ; SSE2-NEXT: xorps %xmm1, %xmm1
701 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
702 ; SSE2-NEXT: movaps %xmm1, %xmm0
705 ; SSE3-LABEL: shuffle_v4f32_zuu4:
707 ; SSE3-NEXT: xorps %xmm1, %xmm1
708 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
709 ; SSE3-NEXT: movaps %xmm1, %xmm0
712 ; SSSE3-LABEL: shuffle_v4f32_zuu4:
714 ; SSSE3-NEXT: xorps %xmm1, %xmm1
715 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
716 ; SSSE3-NEXT: movaps %xmm1, %xmm0
719 ; SSE41-LABEL: shuffle_v4f32_zuu4:
721 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
724 ; AVX-LABEL: shuffle_v4f32_zuu4:
726 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
728 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
729 ret <4 x float> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <0,1,2,7> keeps
; zero in lanes 0-2 and places %a[3] in lane 3.
; Expected lowering: xorps + two shufps + movaps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
732 define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
733 ; SSE2-LABEL: shuffle_v4f32_zzz7:
735 ; SSE2-NEXT: xorps %xmm1, %xmm1
736 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
737 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
738 ; SSE2-NEXT: movaps %xmm1, %xmm0
741 ; SSE3-LABEL: shuffle_v4f32_zzz7:
743 ; SSE3-NEXT: xorps %xmm1, %xmm1
744 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
745 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
746 ; SSE3-NEXT: movaps %xmm1, %xmm0
749 ; SSSE3-LABEL: shuffle_v4f32_zzz7:
751 ; SSSE3-NEXT: xorps %xmm1, %xmm1
752 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
753 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
754 ; SSSE3-NEXT: movaps %xmm1, %xmm0
757 ; SSE41-LABEL: shuffle_v4f32_zzz7:
759 ; SSE41-NEXT: xorps %xmm1, %xmm1
760 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
763 ; AVX-LABEL: shuffle_v4f32_zzz7:
765 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
766 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
768 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
769 ret <4 x float> %shuffle
; Shuffle of zeroinitializer (indices 0-3) with %a (indices 4-7): mask <0,6,2,3> places
; %a[2] in lane 1; lanes 0, 2 and 3 come from the zero vector.
; Expected lowering: xorps + unpckhpd + shufps (SSE2/SSE3/SSSE3), a single insertps (SSE41),
; a single vinsertps (AVX prefix).
772 define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
773 ; SSE2-LABEL: shuffle_v4f32_z6zz:
775 ; SSE2-NEXT: xorps %xmm1, %xmm1
776 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
777 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
780 ; SSE3-LABEL: shuffle_v4f32_z6zz:
782 ; SSE3-NEXT: xorps %xmm1, %xmm1
783 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
784 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
787 ; SSSE3-LABEL: shuffle_v4f32_z6zz:
789 ; SSSE3-NEXT: xorps %xmm1, %xmm1
790 ; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
791 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
794 ; SSE41-LABEL: shuffle_v4f32_z6zz:
796 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
799 ; AVX-LABEL: shuffle_v4f32_z6zz:
801 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
803 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
804 ret <4 x float> %shuffle
; Shuffle of %a (indices 0-3) with zeroinitializer (indices 4-7): mask <0,4,2,3> keeps
; %a[0], %a[2], %a[3] in place and zeroes lane 1.
; Expected lowering: xorps + movlhps + shufps + movaps (SSE2/SSE3/SSSE3), xorps + blendps
; (SSE41), vxorps + vblendps (AVX prefix).
807 define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
808 ; SSE2-LABEL: shuffle_v4f32_0z23:
810 ; SSE2-NEXT: xorps %xmm1, %xmm1
811 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
812 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
813 ; SSE2-NEXT: movaps %xmm1, %xmm0
816 ; SSE3-LABEL: shuffle_v4f32_0z23:
818 ; SSE3-NEXT: xorps %xmm1, %xmm1
819 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
820 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
821 ; SSE3-NEXT: movaps %xmm1, %xmm0
824 ; SSSE3-LABEL: shuffle_v4f32_0z23:
826 ; SSSE3-NEXT: xorps %xmm1, %xmm1
827 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
828 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
829 ; SSSE3-NEXT: movaps %xmm1, %xmm0
832 ; SSE41-LABEL: shuffle_v4f32_0z23:
834 ; SSE41-NEXT: xorps %xmm1, %xmm1
835 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
838 ; AVX-LABEL: shuffle_v4f32_0z23:
840 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
841 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
843 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
844 ret <4 x float> %shuffle
; Shuffle of %a (indices 0-3) with zeroinitializer (indices 4-7): mask <0,1,4,3> keeps
; %a[0], %a[1], %a[3] in place and zeroes lane 2.
; Expected lowering: xorps + two shufps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
847 define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
848 ; SSE2-LABEL: shuffle_v4f32_01z3:
850 ; SSE2-NEXT: xorps %xmm1, %xmm1
851 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
852 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
855 ; SSE3-LABEL: shuffle_v4f32_01z3:
857 ; SSE3-NEXT: xorps %xmm1, %xmm1
858 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
859 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
862 ; SSSE3-LABEL: shuffle_v4f32_01z3:
864 ; SSSE3-NEXT: xorps %xmm1, %xmm1
865 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
866 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
869 ; SSE41-LABEL: shuffle_v4f32_01z3:
871 ; SSE41-NEXT: xorps %xmm1, %xmm1
872 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
875 ; AVX-LABEL: shuffle_v4f32_01z3:
877 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
878 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
880 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
881 ret <4 x float> %shuffle
; Shuffle of %a (indices 0-3) with zeroinitializer (indices 4-7): mask <0,1,2,7> keeps
; %a[0,1,2] in place and zeroes lane 3.
; Expected lowering: xorps + two shufps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
884 define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
885 ; SSE2-LABEL: shuffle_v4f32_012z:
887 ; SSE2-NEXT: xorps %xmm1, %xmm1
888 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
889 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
892 ; SSE3-LABEL: shuffle_v4f32_012z:
894 ; SSE3-NEXT: xorps %xmm1, %xmm1
895 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
896 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
899 ; SSSE3-LABEL: shuffle_v4f32_012z:
901 ; SSSE3-NEXT: xorps %xmm1, %xmm1
902 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
903 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
906 ; SSE41-LABEL: shuffle_v4f32_012z:
908 ; SSE41-NEXT: xorps %xmm1, %xmm1
909 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
912 ; AVX-LABEL: shuffle_v4f32_012z:
914 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
915 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
917 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
918 ret <4 x float> %shuffle
; Shuffle of %a (indices 0-3) with zeroinitializer (indices 4-7): mask <0,4,4,3> keeps
; %a[0] and %a[3] in place and zeroes lanes 1 and 2.
; Expected lowering: xorps + two shufps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
921 define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
922 ; SSE2-LABEL: shuffle_v4f32_0zz3:
924 ; SSE2-NEXT: xorps %xmm1, %xmm1
925 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
926 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
929 ; SSE3-LABEL: shuffle_v4f32_0zz3:
931 ; SSE3-NEXT: xorps %xmm1, %xmm1
932 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
933 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
936 ; SSSE3-LABEL: shuffle_v4f32_0zz3:
938 ; SSSE3-NEXT: xorps %xmm1, %xmm1
939 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
940 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
943 ; SSE41-LABEL: shuffle_v4f32_0zz3:
945 ; SSE41-NEXT: xorps %xmm1, %xmm1
946 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
949 ; AVX-LABEL: shuffle_v4f32_0zz3:
951 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
952 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
954 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
955 ret <4 x float> %shuffle
; Shuffle of %v with a constant vector whose only defined element is lane 0 (0.0, the rest
; undef): mask <0,4,2,4> keeps %v[0] and %v[2] and reads that zero for lanes 1 and 3.
; Expected lowering: xorps + two shufps (SSE2/SSE3/SSSE3), xorps + blendps (SSE41),
; vxorps + vblendps (AVX prefix).
958 define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
959 ; SSE2-LABEL: shuffle_v4f32_0z2z:
961 ; SSE2-NEXT: xorps %xmm1, %xmm1
962 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
963 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
966 ; SSE3-LABEL: shuffle_v4f32_0z2z:
968 ; SSE3-NEXT: xorps %xmm1, %xmm1
969 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
970 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
973 ; SSSE3-LABEL: shuffle_v4f32_0z2z:
975 ; SSSE3-NEXT: xorps %xmm1, %xmm1
976 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
977 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
980 ; SSE41-LABEL: shuffle_v4f32_0z2z:
982 ; SSE41-NEXT: xorps %xmm1, %xmm1
983 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
986 ; AVX-LABEL: shuffle_v4f32_0z2z:
988 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
989 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
991 %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
992 ret <4 x float> %shuffle
; Two-source v4f32 shuffle with an undef lane: mask <undef,0,5,1> requires %a[0], %b[1],
; %a[1] in lanes 1-3 and leaves lane 0 unconstrained.
; The checks expect an unpcklps of (%b, %a) plus a movaps copy under the SSE prefix, and
; a single vunpcklps under the AVX prefix.
995 define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
996 ; SSE-LABEL: shuffle_v4f32_u051:
998 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
999 ; SSE-NEXT: movaps %xmm1, %xmm0
1002 ; AVX-LABEL: shuffle_v4f32_u051:
1004 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1006 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1007 ret <4 x float> %shuffle
; Chain of two shuffles. The first builds <undef, 0, 0, %b[0]> from %b and zeroinitializer;
; the second takes lane 0 from %a and lanes 1-3 from that intermediate, so the final
; result is %a[0], 0, 0, %b[0].
; Expected lowering: movq + pxor + shufps + movss + movaps (SSE2/SSE3/SSSE3), a single
; insertps (SSE41), a single vinsertps (AVX prefix).
1010 define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1011 ; SSE2-LABEL: shuffle_v4f32_0zz4:
1013 ; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1014 ; SSE2-NEXT: pxor %xmm1, %xmm1
1015 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1016 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1017 ; SSE2-NEXT: movaps %xmm1, %xmm0
1020 ; SSE3-LABEL: shuffle_v4f32_0zz4:
1022 ; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1023 ; SSE3-NEXT: pxor %xmm1, %xmm1
1024 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1025 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1026 ; SSE3-NEXT: movaps %xmm1, %xmm0
1029 ; SSSE3-LABEL: shuffle_v4f32_0zz4:
1031 ; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1032 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1033 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1034 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1035 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1038 ; SSE41-LABEL: shuffle_v4f32_0zz4:
1040 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1043 ; AVX-LABEL: shuffle_v4f32_0zz4:
1045 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1047 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1048 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1049 ret <4 x float> %shuffle1
; Chain of two shuffles. The first builds <%a[0], undef, undef, %b[2]>; the second takes
; lanes 0 and 3 from that intermediate and lanes 1 and 2 from zeroinitializer, so the
; final result is %a[0], 0, 0, %b[2].
; Expected lowering: shufps + xorps + two shufps + movaps (SSE2/SSE3/SSSE3), a single
; insertps (SSE41), a single vinsertps (AVX prefix).
1052 define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1053 ; SSE2-LABEL: shuffle_v4f32_0zz6:
1055 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1056 ; SSE2-NEXT: xorps %xmm1, %xmm1
1057 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1058 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1059 ; SSE2-NEXT: movaps %xmm1, %xmm0
1062 ; SSE3-LABEL: shuffle_v4f32_0zz6:
1064 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1065 ; SSE3-NEXT: xorps %xmm1, %xmm1
1066 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1067 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1068 ; SSE3-NEXT: movaps %xmm1, %xmm0
1071 ; SSSE3-LABEL: shuffle_v4f32_0zz6:
1073 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1074 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1075 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1076 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1077 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1080 ; SSE41-LABEL: shuffle_v4f32_0zz6:
1082 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1085 ; AVX-LABEL: shuffle_v4f32_0zz6:
1087 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1089 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1090 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1091 ret <4 x float> %shuffle1
; Chained shuffles producing <a[0], 0.0, a[2], b[0]>. The first shuffle picks
; <a[0], undef, a[2], b[0]>; the second replaces lane 1 with zero. SSE4.1 and
; AVX runs expect one insertps; the earlier SSE levels expect a
; shufps/movlhps/shufps sequence.
1094 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1095 ; SSE2-LABEL: shuffle_v4f32_0z24:
1097 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1098 ; SSE2-NEXT: xorps %xmm2, %xmm2
1099 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1100 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1101 ; SSE2-NEXT: movaps %xmm2, %xmm0
1104 ; SSE3-LABEL: shuffle_v4f32_0z24:
1106 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1107 ; SSE3-NEXT: xorps %xmm2, %xmm2
1108 ; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1109 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1110 ; SSE3-NEXT: movaps %xmm2, %xmm0
1113 ; SSSE3-LABEL: shuffle_v4f32_0z24:
1115 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1116 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1117 ; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1118 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1119 ; SSSE3-NEXT: movaps %xmm2, %xmm0
1122 ; SSE41-LABEL: shuffle_v4f32_0z24:
1124 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1127 ; AVX-LABEL: shuffle_v4f32_0z24:
1129 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1131 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1132 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1133 ret <4 x float> %shuffle1
; Shuffle of (zero, %a) with mask <4, 1, 2, 3>: keeps a[0] in lane 0 and zeroes
; lanes 1-3. Pre-SSE4.1 runs expect xorps + movss; SSE4.1 and AVX runs expect
; xorps + blendps.
1136 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1137 ; SSE2-LABEL: shuffle_v4i32_4zzz:
1139 ; SSE2-NEXT: xorps %xmm1, %xmm1
1140 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1141 ; SSE2-NEXT: movaps %xmm1, %xmm0
1144 ; SSE3-LABEL: shuffle_v4i32_4zzz:
1146 ; SSE3-NEXT: xorps %xmm1, %xmm1
1147 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1148 ; SSE3-NEXT: movaps %xmm1, %xmm0
1151 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
1153 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1154 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1155 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1158 ; SSE41-LABEL: shuffle_v4i32_4zzz:
1160 ; SSE41-NEXT: xorps %xmm1, %xmm1
1161 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1164 ; AVX-LABEL: shuffle_v4i32_4zzz:
1166 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1167 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1169 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1170 ret <4 x i32> %shuffle
; Shuffle of (zero, %a) with mask <2, 4, 3, 0>: moves a[0] into lane 1 and
; zeroes every other lane. The fast-variable-shuffle AVX2 runs and the
; AVX512VL runs expect a single vpshufb; all other runs expect a zeroing
; blend/movss followed by a lane permute.
1173 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1174 ; SSE2-LABEL: shuffle_v4i32_z4zz:
1176 ; SSE2-NEXT: xorps %xmm1, %xmm1
1177 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1178 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1181 ; SSE3-LABEL: shuffle_v4i32_z4zz:
1183 ; SSE3-NEXT: xorps %xmm1, %xmm1
1184 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1185 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1188 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
1190 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1191 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1192 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1195 ; SSE41-LABEL: shuffle_v4i32_z4zz:
1197 ; SSE41-NEXT: pxor %xmm1, %xmm1
1198 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1199 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1202 ; AVX1-LABEL: shuffle_v4i32_z4zz:
1204 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1205 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1206 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1209 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1210 ; AVX2-SLOW: # %bb.0:
1211 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1212 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1213 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1214 ; AVX2-SLOW-NEXT: retq
1216 ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1217 ; AVX2-FAST: # %bb.0:
1218 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1219 ; AVX2-FAST-NEXT: retq
1221 ; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1222 ; AVX512VL: # %bb.0:
1223 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1224 ; AVX512VL-NEXT: retq
1225 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1226 ret <4 x i32> %shuffle
; Shuffle of (zero, %a) with mask <0, 0, 4, 0>: moves a[0] into lane 2 and
; zeroes every other lane. The fast-variable-shuffle AVX2 runs and the
; AVX512VL runs expect a single vpshufb; all other runs expect a zeroing
; blend/movss followed by a lane permute.
1229 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1230 ; SSE2-LABEL: shuffle_v4i32_zz4z:
1232 ; SSE2-NEXT: xorps %xmm1, %xmm1
1233 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1234 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1237 ; SSE3-LABEL: shuffle_v4i32_zz4z:
1239 ; SSE3-NEXT: xorps %xmm1, %xmm1
1240 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1241 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1244 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
1246 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1247 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1248 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1251 ; SSE41-LABEL: shuffle_v4i32_zz4z:
1253 ; SSE41-NEXT: pxor %xmm1, %xmm1
1254 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1255 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1258 ; AVX1-LABEL: shuffle_v4i32_zz4z:
1260 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1261 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1262 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1265 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1266 ; AVX2-SLOW: # %bb.0:
1267 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1268 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1269 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1270 ; AVX2-SLOW-NEXT: retq
1272 ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1273 ; AVX2-FAST: # %bb.0:
1274 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1275 ; AVX2-FAST-NEXT: retq
1277 ; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1278 ; AVX512VL: # %bb.0:
1279 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1280 ; AVX512VL-NEXT: retq
1281 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1282 ret <4 x i32> %shuffle
; Shuffle of (zero, %a) with mask <0, undef, undef, 4>: lane 0 is zero, lane 3
; is a[0], lanes 1-2 are don't-care. Every run expects a single byte shift
; left (pslldq/vpslldq) that zero-fills the low 12 bytes.
1285 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1286 ; SSE-LABEL: shuffle_v4i32_zuu4:
1288 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1291 ; AVX-LABEL: shuffle_v4i32_zuu4:
1293 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1295 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1296 ret <4 x i32> %shuffle
; Shuffle of (zero, %a) with mask <0, 6, 2, 3>: moves a[2] into lane 1 and
; zeroes every other lane. The fast-variable-shuffle AVX2 runs and the
; AVX512VL runs expect a single vpshufb; the remaining runs expect a
; shift or permute combined with a zeroing blend/shufps.
1299 define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1300 ; SSE2-LABEL: shuffle_v4i32_z6zz:
1302 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1303 ; SSE2-NEXT: xorps %xmm1, %xmm1
1304 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1307 ; SSE3-LABEL: shuffle_v4i32_z6zz:
1309 ; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1310 ; SSE3-NEXT: xorps %xmm1, %xmm1
1311 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1314 ; SSSE3-LABEL: shuffle_v4i32_z6zz:
1316 ; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1317 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1318 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1321 ; SSE41-LABEL: shuffle_v4i32_z6zz:
1323 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1324 ; SSE41-NEXT: pxor %xmm0, %xmm0
1325 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1328 ; AVX1-LABEL: shuffle_v4i32_z6zz:
1330 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1331 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1332 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1335 ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1336 ; AVX2-SLOW: # %bb.0:
1337 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1338 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1339 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1340 ; AVX2-SLOW-NEXT: retq
1342 ; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1343 ; AVX2-FAST: # %bb.0:
1344 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1345 ; AVX2-FAST-NEXT: retq
1347 ; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1348 ; AVX512VL: # %bb.0:
1349 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1350 ; AVX512VL-NEXT: retq
1351 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1352 ret <4 x i32> %shuffle
; Two-input shuffle with mask <7, 0, 1, 2>: result is <b[3], a[0], a[1], a[2]>,
; a rotation across the concatenated inputs. SSSE3 and later runs expect one
; palignr/vpalignr; the SSE2 and SSE3 runs expect two shufps plus a copy.
1355 define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1356 ; SSE2-LABEL: shuffle_v4i32_7012:
1358 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1359 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1360 ; SSE2-NEXT: movaps %xmm1, %xmm0
1363 ; SSE3-LABEL: shuffle_v4i32_7012:
1365 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1366 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1367 ; SSE3-NEXT: movaps %xmm1, %xmm0
1370 ; SSSE3-LABEL: shuffle_v4i32_7012:
1372 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1375 ; SSE41-LABEL: shuffle_v4i32_7012:
1377 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1380 ; AVX-LABEL: shuffle_v4i32_7012:
1382 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1384 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1385 ret <4 x i32> %shuffle
; Two-input shuffle with mask <6, 7, 0, 1>: result is <b[2], b[3], a[0], a[1]>.
; SSSE3 and later runs expect one palignr/vpalignr; the SSE2 and SSE3 runs
; expect one shufps plus a register copy.
1388 define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1389 ; SSE2-LABEL: shuffle_v4i32_6701:
1391 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1392 ; SSE2-NEXT: movaps %xmm1, %xmm0
1395 ; SSE3-LABEL: shuffle_v4i32_6701:
1397 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1398 ; SSE3-NEXT: movaps %xmm1, %xmm0
1401 ; SSSE3-LABEL: shuffle_v4i32_6701:
1403 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1406 ; SSE41-LABEL: shuffle_v4i32_6701:
1408 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1411 ; AVX-LABEL: shuffle_v4i32_6701:
1413 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1415 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1416 ret <4 x i32> %shuffle
; Two-input shuffle with mask <5, 6, 7, 0>: result is <b[1], b[2], b[3], a[0]>.
; SSSE3 and later runs expect one palignr/vpalignr; the SSE2 and SSE3 runs
; expect two shufps plus a register copy.
1419 define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1420 ; SSE2-LABEL: shuffle_v4i32_5670:
1422 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1423 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1424 ; SSE2-NEXT: movaps %xmm1, %xmm0
1427 ; SSE3-LABEL: shuffle_v4i32_5670:
1429 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1430 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1431 ; SSE3-NEXT: movaps %xmm1, %xmm0
1434 ; SSSE3-LABEL: shuffle_v4i32_5670:
1436 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1439 ; SSE41-LABEL: shuffle_v4i32_5670:
1441 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1444 ; AVX-LABEL: shuffle_v4i32_5670:
1446 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1448 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1449 ret <4 x i32> %shuffle
; Two-input shuffle with mask <1, 2, 3, 4>: result is <a[1], a[2], a[3], b[0]>.
; SSSE3 and later runs expect a palignr/vpalignr (plus a copy on the non-VEX
; runs); the SSE2 and SSE3 runs expect two shufps.
1452 define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1453 ; SSE2-LABEL: shuffle_v4i32_1234:
1455 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1456 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1459 ; SSE3-LABEL: shuffle_v4i32_1234:
1461 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1462 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1465 ; SSSE3-LABEL: shuffle_v4i32_1234:
1467 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1468 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1471 ; SSE41-LABEL: shuffle_v4i32_1234:
1473 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1474 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1477 ; AVX-LABEL: shuffle_v4i32_1234:
1479 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1481 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1482 ret <4 x i32> %shuffle
; Two-input shuffle with mask <2, 3, 4, 5>: result is <a[2], a[3], b[0], b[1]>.
; The SSE2 and SSE3 runs expect a single shufps; SSSE3 and later runs expect
; a palignr/vpalignr (plus a copy on the non-VEX runs).
1485 define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1486 ; SSE2-LABEL: shuffle_v4i32_2345:
1488 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1491 ; SSE3-LABEL: shuffle_v4i32_2345:
1493 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1496 ; SSSE3-LABEL: shuffle_v4i32_2345:
1498 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1499 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1502 ; SSE41-LABEL: shuffle_v4i32_2345:
1504 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1505 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1508 ; AVX-LABEL: shuffle_v4i32_2345:
1510 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1512 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1513 ret <4 x i32> %shuffle
; Chained shuffles: %s1 duplicates a[2] into lane 3 (<a0, a1, a2, a2>), then
; %s2 takes mask <3, 4, 5, 6> over (%s1, %b), i.e. <a[2], b[0], b[1], b[2]>.
; The AVX512VL runs expect a vpermi2d with index vector [6,0,1,2]; the other
; SSSE3-and-later runs expect pshufd + palignr; SSE2 and SSE3 expect two shufps.
1517 define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1518 ; SSE2-LABEL: shuffle_v4i32_2456:
1520 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1521 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1524 ; SSE3-LABEL: shuffle_v4i32_2456:
1526 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1527 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1530 ; SSSE3-LABEL: shuffle_v4i32_2456:
1532 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1533 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1534 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1537 ; SSE41-LABEL: shuffle_v4i32_2456:
1539 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1540 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1541 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1544 ; AVX1OR2-LABEL: shuffle_v4i32_2456:
1546 ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1547 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1548 ; AVX1OR2-NEXT: retq
1550 ; AVX512VL-LABEL: shuffle_v4i32_2456:
1551 ; AVX512VL: # %bb.0:
1552 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,1,2]
1553 ; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2
1554 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
1555 ; AVX512VL-NEXT: retq
1556 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1557 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Two-input shuffle with mask <4, 0, undef, 1>: result is
; <b[0], a[0], undef, a[1]>. Every run expects one unpcklps/vunpcklps of
; (%b, %a), which puts b[1] in the undef lane.
1561 define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1562 ; SSE-LABEL: shuffle_v4i32_40u1:
1564 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1565 ; SSE-NEXT: movaps %xmm1, %xmm0
1568 ; AVX-LABEL: shuffle_v4i32_40u1:
1570 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1572 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1573 ret <4 x i32> %shuffle
; Two-input shuffle with mask <3, 4, 5, 6>: result is <a[3], b[0], b[1], b[2]>.
; SSSE3 and later runs expect a palignr/vpalignr (plus a copy on the non-VEX
; runs); the SSE2 and SSE3 runs expect two shufps.
1576 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1577 ; SSE2-LABEL: shuffle_v4i32_3456:
1579 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1580 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1583 ; SSE3-LABEL: shuffle_v4i32_3456:
1585 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1586 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1589 ; SSSE3-LABEL: shuffle_v4i32_3456:
1591 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1592 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1595 ; SSE41-LABEL: shuffle_v4i32_3456:
1597 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1598 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1601 ; AVX-LABEL: shuffle_v4i32_3456:
1603 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1605 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1606 ret <4 x i32> %shuffle
; Shuffle with mask <0, undef, 1, undef>: result is <a[0], undef, a[1], undef>;
; no mask index selects from %b. Pre-SSE4.1 runs expect a pshufd; SSE4.1 and
; AVX runs expect pmovzxdq, which fills the undef lanes with zero.
1609 define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1610 ; SSE2-LABEL: shuffle_v4i32_0u1u:
1612 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1615 ; SSE3-LABEL: shuffle_v4i32_0u1u:
1617 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1620 ; SSSE3-LABEL: shuffle_v4i32_0u1u:
1622 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1625 ; SSE41-LABEL: shuffle_v4i32_0u1u:
1627 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1630 ; AVX-LABEL: shuffle_v4i32_0u1u:
1632 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1634 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1635 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 5, 1, 7>: result is <a[0], 0, a[1], 0>,
; i.e. a zero-extension of the low two i32 lanes to i64. Pre-SSE4.1 runs
; expect xorps + unpcklps; SSE4.1 and AVX runs expect pmovzxdq.
1638 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1639 ; SSE2-LABEL: shuffle_v4i32_0z1z:
1641 ; SSE2-NEXT: xorps %xmm1, %xmm1
1642 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1645 ; SSE3-LABEL: shuffle_v4i32_0z1z:
1647 ; SSE3-NEXT: xorps %xmm1, %xmm1
1648 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1651 ; SSSE3-LABEL: shuffle_v4i32_0z1z:
1653 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1654 ; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1657 ; SSE41-LABEL: shuffle_v4i32_0z1z:
1659 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1662 ; AVX-LABEL: shuffle_v4i32_0z1z:
1664 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1666 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1667 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 1, 7, undef>: result is
; <a[0], a[1], 0, undef>. Every run expects a single movq/vmovq, which keeps
; the low 64 bits and zeroes the rest.
1670 define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1671 ; SSE-LABEL: shuffle_v4i32_01zu:
1673 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1676 ; AVX-LABEL: shuffle_v4i32_01zu:
1678 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1680 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1681 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 4, 2, 3>: zeroes lane 1 and keeps the
; other lanes of %a in place. Pre-SSE4.1 runs expect an andps with a
; constant-pool mask; SSE4.1 and AVX runs expect xorps + blendps.
1684 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1685 ; SSE2-LABEL: shuffle_v4i32_0z23:
1687 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1690 ; SSE3-LABEL: shuffle_v4i32_0z23:
1692 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1695 ; SSSE3-LABEL: shuffle_v4i32_0z23:
1697 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1700 ; SSE41-LABEL: shuffle_v4i32_0z23:
1702 ; SSE41-NEXT: xorps %xmm1, %xmm1
1703 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1706 ; AVX-LABEL: shuffle_v4i32_0z23:
1708 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1709 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1711 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1712 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 1, 4, 3>: zeroes lane 2 and keeps the
; other lanes of %a in place. Pre-SSE4.1 runs expect an andps with a
; constant-pool mask; SSE4.1 and AVX runs expect xorps + blendps.
1715 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1716 ; SSE2-LABEL: shuffle_v4i32_01z3:
1718 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1721 ; SSE3-LABEL: shuffle_v4i32_01z3:
1723 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1726 ; SSSE3-LABEL: shuffle_v4i32_01z3:
1728 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1731 ; SSE41-LABEL: shuffle_v4i32_01z3:
1733 ; SSE41-NEXT: xorps %xmm1, %xmm1
1734 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1737 ; AVX-LABEL: shuffle_v4i32_01z3:
1739 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1740 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1742 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1743 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 1, 2, 7>: zeroes lane 3 and keeps the
; other lanes of %a in place. Pre-SSE4.1 runs expect an andps with a
; constant-pool mask; SSE4.1 and AVX runs expect xorps + blendps.
1746 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1747 ; SSE2-LABEL: shuffle_v4i32_012z:
1749 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1752 ; SSE3-LABEL: shuffle_v4i32_012z:
1754 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1757 ; SSSE3-LABEL: shuffle_v4i32_012z:
1759 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1762 ; SSE41-LABEL: shuffle_v4i32_012z:
1764 ; SSE41-NEXT: xorps %xmm1, %xmm1
1765 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1768 ; AVX-LABEL: shuffle_v4i32_012z:
1770 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1771 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1773 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1774 ret <4 x i32> %shuffle
; Shuffle of (%a, zero) with mask <0, 4, 4, 3>: zeroes lanes 1 and 2 and keeps
; a[0] and a[3] in place. Pre-SSE4.1 runs expect an andps with a
; constant-pool mask; SSE4.1 and AVX runs expect xorps + blendps.
1777 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1778 ; SSE2-LABEL: shuffle_v4i32_0zz3:
1780 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1783 ; SSE3-LABEL: shuffle_v4i32_0zz3:
1785 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1788 ; SSSE3-LABEL: shuffle_v4i32_0zz3:
1790 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1793 ; SSE41-LABEL: shuffle_v4i32_0zz3:
1795 ; SSE41-NEXT: xorps %xmm1, %xmm1
1796 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1799 ; AVX-LABEL: shuffle_v4i32_0zz3:
1801 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1802 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1804 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1805 ret <4 x i32> %shuffle
; Shuffle -> bitcast -> shuffle -> bitcast chain. The i32 shuffle yields
; <a1, b1, a0, b0>; swapping the two 64-bit halves as <2 x double> gives
; <a0, b0, a1, b1>. Every run expects the chain to fold into one unpcklps.
1808 define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1809 ; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1811 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1814 ; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1816 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1818 %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1819 %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1820 %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1821 %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1822 ret <4 x i32> %bitcast32
; Mixed-type chain: %1 is <b0, b0, b1, b1>; as <2 x double> its low element is
; (b0, b0). The 64-bit shuffle <0, 2> pairs that with the low element of %a,
; giving <b0, b0, a0, a1> as floats. Every run expects a single shufps.
1825 define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1826 ; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1828 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1829 ; SSE-NEXT: movaps %xmm1, %xmm0
1832 ; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1834 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
1836 %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1837 %2 = bitcast <4 x i32> %1 to <2 x double>
1838 %3 = bitcast <4 x float> %a to <2 x double>
1839 %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1840 %5 = bitcast <2 x double> %4 to <4 x float>
; %1 is <a0, a0, a1, a1>; mask <1, 0, 4, 5> over (%1, bitcast %b) yields
; <a0, a0, b0, b1>. Every run expects the pair of shuffles to fold into a
; single shufps.
1844 define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1845 ; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1847 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1850 ; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1852 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1854 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1855 %2 = bitcast <4 x i32> %b to <4 x float>
1856 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
; Blend written as bitwise logic: %a is masked to lanes 1-2, %b to lanes 0 and
; 3, and the two are OR'd, giving <b0, a1, a2, b3>. SSE4.1 and AVX runs expect
; this to be recognised as one blendps; earlier SSE levels expect two shufps.
1860 define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1861 ; SSE2-LABEL: mask_v4f32_4127:
1863 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1864 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1865 ; SSE2-NEXT: movaps %xmm1, %xmm0
1868 ; SSE3-LABEL: mask_v4f32_4127:
1870 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1871 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1872 ; SSE3-NEXT: movaps %xmm1, %xmm0
1875 ; SSSE3-LABEL: mask_v4f32_4127:
1877 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1878 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1879 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1882 ; SSE41-LABEL: mask_v4f32_4127:
1884 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1887 ; AVX-LABEL: mask_v4f32_4127:
1889 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1891 %1 = bitcast <4 x float> %a to <4 x i32>
1892 %2 = bitcast <4 x float> %b to <4 x i32>
1893 %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
1894 %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
1895 %5 = or <4 x i32> %4, %3
1896 %6 = bitcast <4 x i32> %5 to <4 x float>
; Blend written as 64-bit bitwise logic. The constant -4294967296 is
; 0xFFFFFFFF00000000, so %a keeps only the upper 32 bits of its high qword;
; %b keeps everything else. Per the expected blendps the result is
; <b0, b1, b2, a3>, i.e. mask <0, 1, 2, 7> over (%b, %a). Pre-SSE4.1 runs
; expect two shufps instead of the blend.
1900 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1901 ; SSE2-LABEL: mask_v4f32_0127:
1903 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1904 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1905 ; SSE2-NEXT: movaps %xmm1, %xmm0
1908 ; SSE3-LABEL: mask_v4f32_0127:
1910 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1911 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1912 ; SSE3-NEXT: movaps %xmm1, %xmm0
1915 ; SSSE3-LABEL: mask_v4f32_0127:
1917 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1918 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1919 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1922 ; SSE41-LABEL: mask_v4f32_0127:
1924 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1927 ; AVX-LABEL: mask_v4f32_0127:
1929 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1931 %1 = bitcast <4 x float> %a to <2 x i64>
1932 %2 = bitcast <4 x float> %b to <2 x i64>
1933 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1934 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1935 %5 = or <2 x i64> %4, %3
1936 %6 = bitcast <2 x i64> %5 to <4 x float>
; Integer-typed blend written as 64-bit bitwise logic. The constant
; -4294967296 is 0xFFFFFFFF00000000, so %a keeps only the upper 32 bits of its
; high qword; %b keeps everything else. Per the expected blendps the result is
; <b0, b1, b2, a3>, i.e. mask <0, 1, 2, 7> over (%b, %a). Pre-SSE4.1 runs
; expect two shufps instead of the blend.
1940 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1941 ; SSE2-LABEL: mask_v4i32_0127:
1943 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1944 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1945 ; SSE2-NEXT: movaps %xmm1, %xmm0
1948 ; SSE3-LABEL: mask_v4i32_0127:
1950 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1951 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1952 ; SSE3-NEXT: movaps %xmm1, %xmm0
1955 ; SSSE3-LABEL: mask_v4i32_0127:
1957 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1958 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1959 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1962 ; SSE41-LABEL: mask_v4i32_0127:
1964 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1967 ; AVX-LABEL: mask_v4i32_0127:
1969 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1971 %1 = bitcast <4 x i32> %a to <2 x i64>
1972 %2 = bitcast <4 x i32> %b to <2 x i64>
1973 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1974 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1975 %5 = or <2 x i64> %4, %3
1976 %6 = bitcast <2 x i64> %5 to <4 x i32>
; Loads a <2 x float> (align 1) and widens it with mask <0, 1, 0, 1>, i.e.
; repeats the 64-bit pair in both halves. The SSE2 run expects movsd +
; movlhps; SSE3 and later runs expect a single movddup/vmovddup from memory.
1980 define <4 x float> @broadcast_v4f32_0101_from_v2f32(ptr %x) {
1981 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
1983 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1984 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
1987 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
1989 ; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
1992 ; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
1994 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
1997 ; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
1999 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2002 ; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2004 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2006 %1 = load <2 x float>, ptr %x, align 1
2007 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; extractelement/insertelement pair: takes lane 3 of %a1 and inserts it into
; lane 0 of %a0, giving <a1[3], a0[1], a0[2], a0[3]>. SSE4.1 and AVX runs
; expect extractps + pinsrd through a GPR; earlier SSE levels expect a
; pshufd, a round trip through %eax, and a movss.
2011 define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
2012 ; SSE2-LABEL: extract3_insert0_v4i32_7123:
2014 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2015 ; SSE2-NEXT: movd %xmm1, %eax
2016 ; SSE2-NEXT: movd %eax, %xmm1
2017 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2020 ; SSE3-LABEL: extract3_insert0_v4i32_7123:
2022 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2023 ; SSE3-NEXT: movd %xmm1, %eax
2024 ; SSE3-NEXT: movd %eax, %xmm1
2025 ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2028 ; SSSE3-LABEL: extract3_insert0_v4i32_7123:
2030 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2031 ; SSSE3-NEXT: movd %xmm1, %eax
2032 ; SSSE3-NEXT: movd %eax, %xmm1
2033 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2036 ; SSE41-LABEL: extract3_insert0_v4i32_7123:
2038 ; SSE41-NEXT: extractps $3, %xmm1, %eax
2039 ; SSE41-NEXT: pinsrd $0, %eax, %xmm0
2042 ; AVX-LABEL: extract3_insert0_v4i32_7123:
2044 ; AVX-NEXT: vextractps $3, %xmm1, %eax
2045 ; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
2047 %1 = extractelement <4 x i32> %a1, i32 3
2048 %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
; extractelement/insertelement pair on the same lane: lane 3 of %a1 replaces
; lane 3 of %a0, giving <a0[0], a0[1], a0[2], a1[3]>. SSE4.1 and AVX runs
; expect a single blendps; earlier SSE levels expect two shufps.
2052 define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
2053 ; SSE2-LABEL: extract3_insert3_v4i32_0127:
2055 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2056 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2059 ; SSE3-LABEL: extract3_insert3_v4i32_0127:
2061 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2062 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2065 ; SSSE3-LABEL: extract3_insert3_v4i32_0127:
2067 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2068 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2071 ; SSE41-LABEL: extract3_insert3_v4i32_0127:
2073 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2076 ; AVX-LABEL: extract3_insert3_v4i32_0127:
2078 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2080 %1 = extractelement <4 x i32> %a1, i32 3
2081 %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
; Inserts the scalar i32 argument into lane 0 and zeroes lanes 1-3 via a
; shuffle with a zero vector (mask <0, 5, 6, 7>). Every run expects a single
; movd/vmovd from %edi.
2085 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2086 ; SSE-LABEL: insert_reg_and_zero_v4i32:
2088 ; SSE-NEXT: movd %edi, %xmm0
2091 ; AVX-LABEL: insert_reg_and_zero_v4i32:
2093 ; AVX-NEXT: vmovd %edi, %xmm0
2095 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2096 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2097 ret <4 x i32> %shuffle
; Loads an i32, inserts it into lane 0 and zeroes lanes 1-3 via a shuffle
; with a zero vector (mask <0, 5, 6, 7>). Every run expects the load to fold
; into a single zero-extending movss/vmovss from memory.
2100 define <4 x i32> @insert_mem_and_zero_v4i32(ptr %ptr) {
2101 ; SSE-LABEL: insert_mem_and_zero_v4i32:
2103 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2106 ; AVX-LABEL: insert_mem_and_zero_v4i32:
2108 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2110 %a = load i32, ptr %ptr
2111 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2112 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2113 ret <4 x i32> %shuffle
; Places the scalar float argument in lane 0 and takes lanes 1-3 from a zero
; vector (mask <0,5,6,7>). The SSE2/SSE3/SSSE3 assertions expect
; xorps + movss + movaps; the SSE4.1 and AVX assertions expect a zeroing xor
; followed by a blend that keeps only lane 0 of xmm0.
2116 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2117 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
2119 ; SSE2-NEXT: xorps %xmm1, %xmm1
2120 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2121 ; SSE2-NEXT: movaps %xmm1, %xmm0
2124 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
2126 ; SSE3-NEXT: xorps %xmm1, %xmm1
2127 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2128 ; SSE3-NEXT: movaps %xmm1, %xmm0
2131 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2133 ; SSSE3-NEXT: xorps %xmm1, %xmm1
2134 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2135 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2138 ; SSE41-LABEL: insert_reg_and_zero_v4f32:
2140 ; SSE41-NEXT: xorps %xmm1, %xmm1
2141 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2144 ; AVX-LABEL: insert_reg_and_zero_v4f32:
2146 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
2147 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2149 %v = insertelement <4 x float> undef, float %a, i32 0
2150 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2151 ret <4 x float> %shuffle
; Loads a float from %ptr into lane 0 and takes lanes 1-3 from a zero vector
; (mask <0,5,6,7>). The assertions expect one zero-extending scalar load
; (movss/vmovss).
2154 define <4 x float> @insert_mem_and_zero_v4f32(ptr %ptr) {
2155 ; SSE-LABEL: insert_mem_and_zero_v4f32:
2157 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2160 ; AVX-LABEL: insert_mem_and_zero_v4f32:
2162 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2164 %a = load float, ptr %ptr
2165 %v = insertelement <4 x float> undef, float %a, i32 0
2166 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2167 ret <4 x float> %shuffle
; Reinterprets the i64 argument as <2 x i32>, widens it to four lanes (upper
; lanes undef) and merges it into the low half of %b: mask <0,1,6,7> takes
; lanes 0-1 from the widened value and lanes 2-3 from %b.
; Expected lowering is movq followed by movsd (SSE2/SSE3/SSSE3), pblendw
; (SSE4.1 and AVX1), or vpblendd (AVX2 and AVX512VL runs).
2170 define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2171 ; SSE2-LABEL: insert_reg_lo_v4i32:
2173 ; SSE2-NEXT: movq %rdi, %xmm1
2174 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2177 ; SSE3-LABEL: insert_reg_lo_v4i32:
2179 ; SSE3-NEXT: movq %rdi, %xmm1
2180 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2183 ; SSSE3-LABEL: insert_reg_lo_v4i32:
2185 ; SSSE3-NEXT: movq %rdi, %xmm1
2186 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2189 ; SSE41-LABEL: insert_reg_lo_v4i32:
2191 ; SSE41-NEXT: movq %rdi, %xmm1
2192 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2195 ; AVX1-LABEL: insert_reg_lo_v4i32:
2197 ; AVX1-NEXT: vmovq %rdi, %xmm1
2198 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2201 ; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2202 ; AVX2OR512VL: # %bb.0:
2203 ; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
2204 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2205 ; AVX2OR512VL-NEXT: retq
2206 %a.cast = bitcast i64 %a to <2 x i32>
2207 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2208 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2209 ret <4 x i32> %shuffle
; Loads <2 x i32> from %ptr, widens it to four lanes (upper lanes undef) and
; merges it into the low half of %b (mask <0,1,6,7>).
; The SSE2/SSE3/SSSE3 assertions expect a single movlps from memory; the
; SSE4.1 and AVX assertions expect a 64-bit movsd load followed by a blend.
2212 define <4 x i32> @insert_mem_lo_v4i32(ptr %ptr, <4 x i32> %b) {
2213 ; SSE2-LABEL: insert_mem_lo_v4i32:
2215 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2218 ; SSE3-LABEL: insert_mem_lo_v4i32:
2220 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2223 ; SSSE3-LABEL: insert_mem_lo_v4i32:
2225 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2228 ; SSE41-LABEL: insert_mem_lo_v4i32:
2230 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2231 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2234 ; AVX-LABEL: insert_mem_lo_v4i32:
2236 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2237 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2239 %a = load <2 x i32>, ptr %ptr
2240 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2241 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2242 ret <4 x i32> %shuffle
; Reinterprets the i64 argument as <2 x i32>, widens it to four lanes and
; places it in the high half of the result: mask <4,5,0,1> takes lanes 0-1
; from %b and lanes 2-3 from the widened value.
; The assertions expect movq followed by punpcklqdq.
2245 define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2246 ; SSE-LABEL: insert_reg_hi_v4i32:
2248 ; SSE-NEXT: movq %rdi, %xmm1
2249 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2252 ; AVX-LABEL: insert_reg_hi_v4i32:
2254 ; AVX-NEXT: vmovq %rdi, %xmm1
2255 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2257 %a.cast = bitcast i64 %a to <2 x i32>
2258 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2259 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2260 ret <4 x i32> %shuffle
; Loads <2 x i32> from %ptr, widens it to four lanes and places it in the high
; half of the result (mask <4,5,0,1>: lanes 0-1 from %b, lanes 2-3 from the
; loaded value). The assertions expect a 64-bit movsd load followed by movlhps.
2263 define <4 x i32> @insert_mem_hi_v4i32(ptr %ptr, <4 x i32> %b) {
2264 ; SSE-LABEL: insert_mem_hi_v4i32:
2266 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2267 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2270 ; AVX-LABEL: insert_mem_hi_v4i32:
2272 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2273 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2275 %a = load <2 x i32>, ptr %ptr
2276 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2277 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2278 ret <4 x i32> %shuffle
; Reinterprets the double argument as <2 x float>, widens it to four lanes
; (upper lanes undef) and merges it into the low half of %b (mask <0,1,6,7>).
; The SSE2/SSE3/SSSE3 assertions expect one shufps; the SSE4.1 and AVX
; assertions expect one blend.
2281 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2282 ; SSE2-LABEL: insert_reg_lo_v4f32:
2284 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2287 ; SSE3-LABEL: insert_reg_lo_v4f32:
2289 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2292 ; SSSE3-LABEL: insert_reg_lo_v4f32:
2294 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2297 ; SSE41-LABEL: insert_reg_lo_v4f32:
2299 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2302 ; AVX-LABEL: insert_reg_lo_v4f32:
2304 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2306 %a.cast = bitcast double %a to <2 x float>
2307 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2308 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2309 ret <4 x float> %shuffle
; Loads <2 x float> from %ptr, widens it to four lanes (upper lanes undef) and
; merges it into the low half of %b (mask <0,1,6,7>).
; The assertions expect a single movlps/vmovlps with a memory operand.
2312 define <4 x float> @insert_mem_lo_v4f32(ptr %ptr, <4 x float> %b) {
2313 ; SSE-LABEL: insert_mem_lo_v4f32:
2315 ; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2318 ; AVX-LABEL: insert_mem_lo_v4f32:
2320 ; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2322 %a = load <2 x float>, ptr %ptr
2323 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2324 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2325 ret <4 x float> %shuffle
; Reinterprets the double argument as <2 x float>, widens it to four lanes and
; places it in the high half of the result: mask <4,5,0,1> takes lanes 0-1
; from %b and lanes 2-3 from the widened value.
; The SSE assertions expect movlhps into xmm1 plus a movaps copy back to xmm0;
; the AVX assertions expect a single three-operand vmovlhps.
2328 define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2329 ; SSE-LABEL: insert_reg_hi_v4f32:
2331 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2332 ; SSE-NEXT: movaps %xmm1, %xmm0
2335 ; AVX-LABEL: insert_reg_hi_v4f32:
2337 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2339 %a.cast = bitcast double %a to <2 x float>
2340 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2341 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2342 ret <4 x float> %shuffle
; Loads <2 x float> from %ptr, widens it to four lanes and places it in the
; high half of the result (mask <4,5,0,1>: lanes 0-1 from %b, lanes 2-3 from
; the loaded value). The assertions expect a single movhps/vmovhps with a
; memory operand.
2345 define <4 x float> @insert_mem_hi_v4f32(ptr %ptr, <4 x float> %b) {
2346 ; SSE-LABEL: insert_mem_hi_v4f32:
2348 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2351 ; AVX-LABEL: insert_mem_hi_v4f32:
2353 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2355 %a = load <2 x float>, ptr %ptr
2356 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2357 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2358 ret <4 x float> %shuffle
; Loads <4 x float> from %ptr and reverses its lanes (mask <3,2,1,0>).
; The SSE assertions expect a movaps load followed by shufps; the AVX
; assertions expect the load to be folded into a single vpermilps.
2362 define <4 x float> @shuffle_mem_v4f32_3210(ptr %ptr) {
2363 ; SSE-LABEL: shuffle_mem_v4f32_3210:
2365 ; SSE-NEXT: movaps (%rdi), %xmm0
2366 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2369 ; AVX-LABEL: shuffle_mem_v4f32_3210:
2371 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2373 %a = load <4 x float>, ptr %ptr
2374 %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2375 ret <4 x float> %shuffle
; Loads an i32 from %ptr, inserts it into lane 0 of a zero vector, then
; splats lane 0 across all four lanes (all-zero shuffle mask).
; The SSE assertions expect a movd load followed by pshufd; the AVX
; assertions expect a single vbroadcastss from memory.
2378 define <4 x i32> @insert_dup_mem_v4i32(ptr %ptr) {
2379 ; SSE-LABEL: insert_dup_mem_v4i32:
2381 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2382 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2385 ; AVX-LABEL: insert_dup_mem_v4i32:
2387 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
2389 %tmp = load i32, ptr %ptr, align 4
2390 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2391 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Loads <2 x float> from %p0 and derives two vectors from it:
;   %3 interleaves the two loaded elements with zeros (element0, 0, element1, 0)
;      and is stored to %p1;
;   %4 splats loaded element 0 across four lanes.
; The assertions expect the interleave as an unpack against a zeroed register
; and the splat as shufps/vshufps (SSE, AVX1) or vbroadcastss (AVX2 and
; AVX512VL runs).
; NOTE(review): the returned value is not visible in this excerpt; presumably
; it is %4 - confirm against the full file.
2396 define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) {
2397 ; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2399 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2400 ; SSE-NEXT: xorps %xmm1, %xmm1
2401 ; SSE-NEXT: movaps %xmm0, %xmm2
2402 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2403 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2404 ; SSE-NEXT: movaps %xmm2, (%rsi)
2407 ; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2409 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2410 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
2411 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2412 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2413 ; AVX1-NEXT: vmovaps %xmm1, (%rsi)
2416 ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2417 ; AVX2OR512VL: # %bb.0:
2418 ; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2419 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
2420 ; AVX2OR512VL-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2421 ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0
2422 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi)
2423 ; AVX2OR512VL-NEXT: retq
2424 %1 = load <2 x float>, ptr %p0
2425 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2426 %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2427 %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2428 store <4 x float> %3, ptr %p1
2433 ; Shuffle to logical bit shifts
; Mask <4,0,4,undef> against a zero second operand: lane 0 is zero, lane 1 is
; %a[0], lane 2 is zero and lane 3 is undef. The assertions expect this to be
; matched as a 64-bit logical left shift by 32 (psllq/vpsllq).
2436 define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2437 ; SSE-LABEL: shuffle_v4i32_z0zX:
2439 ; SSE-NEXT: psllq $32, %xmm0
2442 ; AVX-LABEL: shuffle_v4i32_z0zX:
2444 ; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
2446 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2447 ret <4 x i32> %shuffle
; Mask <1,4,3,4> against a zero second operand: lanes 0 and 2 take %a[1] and
; %a[3], lanes 1 and 3 are zero. The assertions expect this to be matched as a
; 64-bit logical right shift by 32 (psrlq/vpsrlq).
2450 define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2451 ; SSE-LABEL: shuffle_v4i32_1z3z:
2453 ; SSE-NEXT: psrlq $32, %xmm0
2456 ; AVX-LABEL: shuffle_v4i32_1z3z:
2458 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
2460 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2461 ret <4 x i32> %shuffle
; Loads <4 x float> %b from %pb with align 1 and concatenates the low halves:
; mask <0,1,4,5> takes lanes 0-1 from %a and lanes 0-1 from %b.
; The SSE assertions expect a movhps from memory; the AVX assertions expect
; vunpcklpd with a memory operand.
2464 define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
2465 ; SSE-LABEL: shuffle_mem_v4f32_0145:
2467 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2470 ; AVX-LABEL: shuffle_mem_v4f32_0145:
2472 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2474 %b = load <4 x float>, ptr %pb, align 1
2475 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2476 ret <4 x float> %shuffle
; Loads <4 x float> %b from %pb with align 1 and replaces the low half of %a
; with the low half of %b (mask <4,5,2,3>).
; The SSE2/SSE3/SSSE3 assertions expect a movlps from memory; the SSE4.1
; assertions expect an unaligned movups load followed by blendps; the AVX
; assertions expect vblendps with the load folded in.
2479 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
2480 ; SSE2-LABEL: shuffle_mem_v4f32_4523:
2482 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2485 ; SSE3-LABEL: shuffle_mem_v4f32_4523:
2487 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2490 ; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2492 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2495 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
2497 ; SSE41-NEXT: movups (%rdi), %xmm1
2498 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2501 ; AVX-LABEL: shuffle_mem_v4f32_4523:
2503 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2505 %b = load <4 x float>, ptr %pb, align 1
2506 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2507 ret <4 x float> %shuffle
; Loads <4 x float> from %a1 and shuffles it with %a0, the loaded vector being
; the FIRST shuffle operand: mask <0,6,2,4> yields
; (mem[0], %a0[2], mem[2], %a0[0]).
; The SSE and AVX1/AVX2 assertions expect two shufps with the load folded
; into the first; the AVX512VL assertions expect a vpermi2ps driven by the
; constant index vector [0,6,2,4].
2510 define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
2511 ; SSE-LABEL: shuffle_mem_v4f32_0624:
2513 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2514 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2517 ; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2519 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2520 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2521 ; AVX1OR2-NEXT: retq
2523 ; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
2524 ; AVX512VL: # %bb.0:
2525 ; AVX512VL-NEXT: vmovaps (%rdi), %xmm2
2526 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4]
2527 ; AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1
2528 ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
2529 ; AVX512VL-NEXT: retq
2530 %1 = load <4 x float>, ptr %a1
2531 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
2535 define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) {
2536 ; SSE-LABEL: shuffle_mem_v4f32_4760:
2538 ; SSE-NEXT: movaps %xmm0, %xmm1
2539 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
2540 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2543 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
2545 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
2546 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2547 ; AVX1OR2-NEXT: retq
2549 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
2550 ; AVX512VL: # %bb.0:
2551 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4]
2552 ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0
2553 ; AVX512VL-NEXT: retq
2554 %1 = load <4 x float>, ptr %a1
2555 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>