1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
; Single-source <4 x i32> shuffle with mask <0,0,0,1>. Every mask index is
; below 4, so only %a is read (%a[0], %a[0], %a[0], %a[1]) and %b is ignored.
11 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
12 ; SSE-LABEL: shuffle_v4i32_0001:
14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
17 ; AVX-LABEL: shuffle_v4i32_0001:
19 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
21 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
22 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,0,2,0>. Every mask index is
; below 4, so only %a is read (%a[0], %a[0], %a[2], %a[0]) and %b is ignored.
24 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
25 ; SSE-LABEL: shuffle_v4i32_0020:
27 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
30 ; AVX-LABEL: shuffle_v4i32_0020:
32 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
34 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
35 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,1,1,2>. Every mask index is
; below 4, so only %a is read (%a[0], %a[1], %a[1], %a[2]) and %b is ignored.
37 define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
38 ; SSE-LABEL: shuffle_v4i32_0112:
40 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
43 ; AVX-LABEL: shuffle_v4i32_0112:
45 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
47 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
48 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,3,0,0>. Every mask index is
; below 4, so only %a is read (%a[0], %a[3], %a[0], %a[0]) and %b is ignored.
50 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
51 ; SSE-LABEL: shuffle_v4i32_0300:
53 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
56 ; AVX-LABEL: shuffle_v4i32_0300:
58 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
60 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
61 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <1,0,0,0>. Every mask index is
; below 4, so only %a is read (%a[1], %a[0], %a[0], %a[0]) and %b is ignored.
63 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
64 ; SSE-LABEL: shuffle_v4i32_1000:
66 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
69 ; AVX-LABEL: shuffle_v4i32_1000:
71 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
73 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
74 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <2,2,0,0>. Every mask index is
; below 4, so only %a is read (%a[2], %a[2], %a[0], %a[0]) and %b is ignored.
76 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
77 ; SSE-LABEL: shuffle_v4i32_2200:
79 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
82 ; AVX-LABEL: shuffle_v4i32_2200:
84 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
86 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
87 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <3,3,3,0>. Every mask index is
; below 4, so only %a is read (%a[3], %a[3], %a[3], %a[0]) and %b is ignored.
89 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
90 ; SSE-LABEL: shuffle_v4i32_3330:
92 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
95 ; AVX-LABEL: shuffle_v4i32_3330:
97 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
99 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
100 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <3,2,1,0>, i.e. a full lane
; reversal of %a. Every mask index is below 4, so %b is ignored.
102 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
103 ; SSE-LABEL: shuffle_v4i32_3210:
105 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
108 ; AVX-LABEL: shuffle_v4i32_3210:
110 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
112 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
113 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <2,1,2,1>. Every mask index is
; below 4, so only %a is read (%a[2], %a[1], %a[2], %a[1]) and %b is ignored.
116 define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
117 ; SSE-LABEL: shuffle_v4i32_2121:
119 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
122 ; AVX-LABEL: shuffle_v4i32_2121:
124 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
126 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
127 ret <4 x i32> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,0,1>. Every mask index is
; below 4, so only %a is read (%a[0], %a[0], %a[0], %a[1]) and %b is ignored.
130 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
131 ; SSE-LABEL: shuffle_v4f32_0001:
133 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
136 ; AVX-LABEL: shuffle_v4f32_0001:
138 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
140 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
141 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,2,0>. Every mask index is
; below 4, so only %a is read (%a[0], %a[0], %a[2], %a[0]) and %b is ignored.
143 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
144 ; SSE-LABEL: shuffle_v4f32_0020:
146 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
149 ; AVX-LABEL: shuffle_v4f32_0020:
151 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
153 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
154 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,3,0,0>. Every mask index is
; below 4, so only %a is read (%a[0], %a[3], %a[0], %a[0]) and %b is ignored.
156 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
157 ; SSE-LABEL: shuffle_v4f32_0300:
159 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
162 ; AVX-LABEL: shuffle_v4f32_0300:
164 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
166 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
167 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <1,0,0,0>. Every mask index is
; below 4, so only %a is read (%a[1], %a[0], %a[0], %a[0]) and %b is ignored.
169 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
170 ; SSE-LABEL: shuffle_v4f32_1000:
172 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
175 ; AVX-LABEL: shuffle_v4f32_1000:
177 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
179 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
180 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <2,2,0,0>. Every mask index is
; below 4, so only %a is read (%a[2], %a[2], %a[0], %a[0]) and %b is ignored.
182 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
183 ; SSE-LABEL: shuffle_v4f32_2200:
185 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
188 ; AVX-LABEL: shuffle_v4f32_2200:
190 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
192 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
193 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <3,3,3,0>. Every mask index is
; below 4, so only %a is read (%a[3], %a[3], %a[3], %a[0]) and %b is ignored.
195 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
196 ; SSE-LABEL: shuffle_v4f32_3330:
198 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
201 ; AVX-LABEL: shuffle_v4f32_3330:
203 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
205 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
206 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <3,2,1,0>, i.e. a full lane
; reversal of %a. Every mask index is below 4, so %b is ignored.
208 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
209 ; SSE-LABEL: shuffle_v4f32_3210:
211 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
214 ; AVX-LABEL: shuffle_v4f32_3210:
216 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
218 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
219 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,1,1>: each of the two low
; lanes of %a is duplicated. %b is ignored. The SSE checks expect a
; self-unpcklps rather than a shufps for this pattern.
221 define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
222 ; SSE-LABEL: shuffle_v4f32_0011:
224 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
227 ; AVX-LABEL: shuffle_v4f32_0011:
229 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
231 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
232 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <2,2,3,3>: each of the two high
; lanes of %a is duplicated. %b is ignored. The SSE checks expect a
; self-unpckhps rather than a shufps for this pattern.
234 define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
235 ; SSE-LABEL: shuffle_v4f32_2233:
237 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
240 ; AVX-LABEL: shuffle_v4f32_2233:
242 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
244 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
245 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,2,2>: the even lanes of %a
; are duplicated. %b is ignored. The plain SSE2 checks expect shufps, while
; the SSE3, SSSE3, SSE4.1 and AVX checks expect the (v)movsldup form.
247 define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
248 ; SSE2-LABEL: shuffle_v4f32_0022:
250 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
253 ; SSE3-LABEL: shuffle_v4f32_0022:
255 ; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
258 ; SSSE3-LABEL: shuffle_v4f32_0022:
260 ; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
263 ; SSE41-LABEL: shuffle_v4f32_0022:
265 ; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
268 ; AVX-LABEL: shuffle_v4f32_0022:
270 ; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
272 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
273 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <1,1,3,3>: the odd lanes of %a
; are duplicated. %b is ignored. The plain SSE2 checks expect shufps, while
; the SSE3, SSSE3, SSE4.1 and AVX checks expect the (v)movshdup form.
275 define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
276 ; SSE2-LABEL: shuffle_v4f32_1133:
278 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
281 ; SSE3-LABEL: shuffle_v4f32_1133:
283 ; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
286 ; SSSE3-LABEL: shuffle_v4f32_1133:
288 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
291 ; SSE41-LABEL: shuffle_v4f32_1133:
293 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
296 ; AVX-LABEL: shuffle_v4f32_1133:
298 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
300 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
301 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle with mask <0,1,4,5>. Indices 0-3 read %a and
; 4-7 read %b, so the result is the low 64-bit half of %a followed by the low
; 64-bit half of %b (%a[0], %a[1], %b[0], %b[1]).
304 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
305 ; SSE-LABEL: shuffle_v4f32_0145:
307 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
310 ; AVX-LABEL: shuffle_v4f32_0145:
312 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
314 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
315 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle with mask <6,7,2,3>. Indices 0-3 read %a and
; 4-7 read %b, so the result is the high 64-bit half of %b followed by the
; high 64-bit half of %a (%b[2], %b[3], %a[2], %a[3]).
318 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
319 ; SSE-LABEL: shuffle_v4f32_6723:
321 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
324 ; AVX-LABEL: shuffle_v4f32_6723:
326 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
328 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
329 ret <4 x float> %shuffle
; Two-source <4 x i32> shuffle with mask <0,1,2,4>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %a[0], %a[1], %a[2], %b[0]: the low element
; of %b replaces lane 3 of %a. The expected lowering differs per subtarget
; (two shufps before SSE4.1, shuffle+blend on SSE4.1/AVX1/AVX2, and a single
; vpermt2d with a constant index vector on AVX512VL).
332 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
333 ; SSE2-LABEL: shuffle_v4i32_0124:
335 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
336 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
339 ; SSE3-LABEL: shuffle_v4i32_0124:
341 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
342 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
345 ; SSSE3-LABEL: shuffle_v4i32_0124:
347 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
348 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
351 ; SSE41-LABEL: shuffle_v4i32_0124:
353 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
354 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
357 ; AVX1-LABEL: shuffle_v4i32_0124:
359 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
360 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
363 ; AVX2-LABEL: shuffle_v4i32_0124:
365 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
366 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
369 ; AVX512VL-LABEL: shuffle_v4i32_0124:
371 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
372 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
373 ; AVX512VL-NEXT: retq
374 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
375 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <0,1,4,2>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %a[0], %a[1], %b[0], %a[2]: the low element
; of %b is inserted at lane 2 and %a[2] moves up to lane 3.
377 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
378 ; SSE2-LABEL: shuffle_v4i32_0142:
380 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
381 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
384 ; SSE3-LABEL: shuffle_v4i32_0142:
386 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
387 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
390 ; SSSE3-LABEL: shuffle_v4i32_0142:
392 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
393 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
396 ; SSE41-LABEL: shuffle_v4i32_0142:
398 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
399 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
400 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
403 ; AVX1-LABEL: shuffle_v4i32_0142:
405 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
406 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
407 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
410 ; AVX2-LABEL: shuffle_v4i32_0142:
412 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
413 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
414 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
417 ; AVX512VL-LABEL: shuffle_v4i32_0142:
419 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
420 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
421 ; AVX512VL-NEXT: retq
422 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
423 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <0,4,1,2>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %a[0], %b[0], %a[1], %a[2]: the low element
; of %b is inserted at lane 1 and %a[1], %a[2] move up by one lane.
425 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
426 ; SSE2-LABEL: shuffle_v4i32_0412:
428 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
429 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
430 ; SSE2-NEXT: movaps %xmm1, %xmm0
433 ; SSE3-LABEL: shuffle_v4i32_0412:
435 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
436 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
437 ; SSE3-NEXT: movaps %xmm1, %xmm0
440 ; SSSE3-LABEL: shuffle_v4i32_0412:
442 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
443 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
444 ; SSSE3-NEXT: movaps %xmm1, %xmm0
447 ; SSE41-LABEL: shuffle_v4i32_0412:
449 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
450 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
451 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
454 ; AVX1-LABEL: shuffle_v4i32_0412:
456 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
457 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
458 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
461 ; AVX2-LABEL: shuffle_v4i32_0412:
463 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
464 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
465 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
468 ; AVX512VL-LABEL: shuffle_v4i32_0412:
470 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
471 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
472 ; AVX512VL-NEXT: retq
473 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
474 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <4,0,1,2>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %b[0], %a[0], %a[1], %a[2]: the low element
; of %b is inserted at lane 0 and the low three lanes of %a move up by one.
; AVX1 and AVX2 share one set of checks here (the AVX1OR2 prefix).
476 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
477 ; SSE2-LABEL: shuffle_v4i32_4012:
479 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
480 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
481 ; SSE2-NEXT: movaps %xmm1, %xmm0
484 ; SSE3-LABEL: shuffle_v4i32_4012:
486 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
487 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
488 ; SSE3-NEXT: movaps %xmm1, %xmm0
491 ; SSSE3-LABEL: shuffle_v4i32_4012:
493 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
494 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
495 ; SSSE3-NEXT: movaps %xmm1, %xmm0
498 ; SSE41-LABEL: shuffle_v4i32_4012:
500 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
501 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
504 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
506 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
507 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
510 ; AVX512VL-LABEL: shuffle_v4i32_4012:
512 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
513 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
514 ; AVX512VL-NEXT: retq
515 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
516 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <0,1,4,5>. Indices 0-3 read %a and
; 4-7 read %b, so the result is the low 64-bit half of %a followed by the low
; 64-bit half of %b. The checks expect the float-domain (v)movlhps even
; though the element type is i32.
518 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
519 ; SSE-LABEL: shuffle_v4i32_0145:
521 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
524 ; AVX-LABEL: shuffle_v4i32_0145:
526 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
528 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
529 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <0,4,5,1>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %a[0], %b[0], %b[1], %a[1]: the low halves of
; both inputs, with %b's pair placed between the two elements of %a's pair.
531 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
532 ; SSE2-LABEL: shuffle_v4i32_0451:
534 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
535 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
538 ; SSE3-LABEL: shuffle_v4i32_0451:
540 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
541 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
544 ; SSSE3-LABEL: shuffle_v4i32_0451:
546 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
547 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
550 ; SSE41-LABEL: shuffle_v4i32_0451:
552 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
553 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
554 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
557 ; AVX1-LABEL: shuffle_v4i32_0451:
559 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
560 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
561 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
564 ; AVX2-LABEL: shuffle_v4i32_0451:
566 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
567 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
568 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
571 ; AVX512VL-LABEL: shuffle_v4i32_0451:
573 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
574 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
575 ; AVX512VL-NEXT: retq
576 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
577 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <4,5,0,1>. Indices 0-3 read %a and
; 4-7 read %b, so the result is the low 64-bit half of %b followed by the low
; 64-bit half of %a. The SSE checks include an extra movaps because the
; two-operand movlhps writes xmm1 and the result must end up in xmm0.
579 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
580 ; SSE-LABEL: shuffle_v4i32_4501:
582 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
583 ; SSE-NEXT: movaps %xmm1, %xmm0
586 ; AVX-LABEL: shuffle_v4i32_4501:
588 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
590 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
591 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle with mask <4,0,1,5>. Indices 0-3 read %a and
; 4-7 read %b, so the result is %b[0], %a[0], %a[1], %b[1]: the low halves of
; both inputs, with %a's pair placed between the two elements of %b's pair.
593 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
594 ; SSE2-LABEL: shuffle_v4i32_4015:
596 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
597 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
600 ; SSE3-LABEL: shuffle_v4i32_4015:
602 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
603 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
606 ; SSSE3-LABEL: shuffle_v4i32_4015:
608 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
609 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
612 ; SSE41-LABEL: shuffle_v4i32_4015:
614 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
615 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
616 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
619 ; AVX1-LABEL: shuffle_v4i32_4015:
621 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
622 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
623 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
626 ; AVX2-LABEL: shuffle_v4i32_4015:
628 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
629 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
630 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
633 ; AVX512VL-LABEL: shuffle_v4i32_4015:
635 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
636 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
637 ; AVX512VL-NEXT: retq
638 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
639 ret <4 x i32> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask <4,1,2,3>.
; Index 4 selects %a[0]; indices 1-3 select zero lanes, so the result is
; %a[0], 0, 0, 0 (keep the low element, zero the rest).
642 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
643 ; SSE2-LABEL: shuffle_v4f32_4zzz:
645 ; SSE2-NEXT: xorps %xmm1, %xmm1
646 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
647 ; SSE2-NEXT: movaps %xmm1, %xmm0
650 ; SSE3-LABEL: shuffle_v4f32_4zzz:
652 ; SSE3-NEXT: xorps %xmm1, %xmm1
653 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
654 ; SSE3-NEXT: movaps %xmm1, %xmm0
657 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
659 ; SSSE3-NEXT: xorps %xmm1, %xmm1
660 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
661 ; SSSE3-NEXT: movaps %xmm1, %xmm0
664 ; SSE41-LABEL: shuffle_v4f32_4zzz:
666 ; SSE41-NEXT: xorps %xmm1, %xmm1
667 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
670 ; AVX-LABEL: shuffle_v4f32_4zzz:
672 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
673 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
675 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
676 ret <4 x float> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask <2,4,3,0>.
; Indices 2, 3 and 0 all select zero lanes (which zero lane is irrelevant);
; index 4 selects %a[0]. The result is 0, %a[0], 0, 0.
679 define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
680 ; SSE2-LABEL: shuffle_v4f32_z4zz:
682 ; SSE2-NEXT: xorps %xmm1, %xmm1
683 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
684 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
687 ; SSE3-LABEL: shuffle_v4f32_z4zz:
689 ; SSE3-NEXT: xorps %xmm1, %xmm1
690 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
691 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
694 ; SSSE3-LABEL: shuffle_v4f32_z4zz:
696 ; SSSE3-NEXT: xorps %xmm1, %xmm1
697 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
698 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
701 ; SSE41-LABEL: shuffle_v4f32_z4zz:
703 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
706 ; AVX-LABEL: shuffle_v4f32_z4zz:
708 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
710 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
711 ret <4 x float> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask <0,0,4,0>.
; Index 0 selects a zero lane and index 4 selects %a[0], so the result is
; 0, 0, %a[0], 0.
714 define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
715 ; SSE2-LABEL: shuffle_v4f32_zz4z:
717 ; SSE2-NEXT: xorps %xmm1, %xmm1
718 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
719 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
720 ; SSE2-NEXT: movaps %xmm1, %xmm0
723 ; SSE3-LABEL: shuffle_v4f32_zz4z:
725 ; SSE3-NEXT: xorps %xmm1, %xmm1
726 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
727 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
728 ; SSE3-NEXT: movaps %xmm1, %xmm0
731 ; SSSE3-LABEL: shuffle_v4f32_zz4z:
733 ; SSSE3-NEXT: xorps %xmm1, %xmm1
734 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
735 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
736 ; SSSE3-NEXT: movaps %xmm1, %xmm0
739 ; SSE41-LABEL: shuffle_v4f32_zz4z:
741 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
744 ; AVX-LABEL: shuffle_v4f32_zz4z:
746 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
748 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
749 ret <4 x float> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask
; <0,undef,undef,4>. Lane 0 must be zero and lane 3 must be %a[0]; lanes 1
; and 2 are undef, so the backend may put anything there. The SSE4.1 and AVX
; checks show it choosing zero for them, while the older SSE checks leave
; %a[2] in lane 2.
752 define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
753 ; SSE2-LABEL: shuffle_v4f32_zuu4:
755 ; SSE2-NEXT: xorps %xmm1, %xmm1
756 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
757 ; SSE2-NEXT: movaps %xmm1, %xmm0
760 ; SSE3-LABEL: shuffle_v4f32_zuu4:
762 ; SSE3-NEXT: xorps %xmm1, %xmm1
763 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
764 ; SSE3-NEXT: movaps %xmm1, %xmm0
767 ; SSSE3-LABEL: shuffle_v4f32_zuu4:
769 ; SSSE3-NEXT: xorps %xmm1, %xmm1
770 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
771 ; SSSE3-NEXT: movaps %xmm1, %xmm0
774 ; SSE41-LABEL: shuffle_v4f32_zuu4:
776 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
779 ; AVX-LABEL: shuffle_v4f32_zuu4:
781 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
783 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
784 ret <4 x float> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask <0,1,2,7>.
; Indices 0-2 select zero lanes and index 7 selects %a[3], so the result is
; 0, 0, 0, %a[3] (keep the top element in place, zero the rest).
787 define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
788 ; SSE2-LABEL: shuffle_v4f32_zzz7:
790 ; SSE2-NEXT: xorps %xmm1, %xmm1
791 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
792 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
793 ; SSE2-NEXT: movaps %xmm1, %xmm0
796 ; SSE3-LABEL: shuffle_v4f32_zzz7:
798 ; SSE3-NEXT: xorps %xmm1, %xmm1
799 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
800 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
801 ; SSE3-NEXT: movaps %xmm1, %xmm0
804 ; SSSE3-LABEL: shuffle_v4f32_zzz7:
806 ; SSSE3-NEXT: xorps %xmm1, %xmm1
807 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
808 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
809 ; SSSE3-NEXT: movaps %xmm1, %xmm0
812 ; SSE41-LABEL: shuffle_v4f32_zzz7:
814 ; SSE41-NEXT: xorps %xmm1, %xmm1
815 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
818 ; AVX-LABEL: shuffle_v4f32_zzz7:
820 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
821 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
823 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
824 ret <4 x float> %shuffle
; Shuffle of a zero vector (operand 0) with %a (operand 1), mask <0,6,2,3>.
; Indices 0, 2 and 3 select zero lanes and index 6 selects %a[2], so the
; result is 0, %a[2], 0, 0 (a non-low source element moved into lane 1).
827 define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
828 ; SSE2-LABEL: shuffle_v4f32_z6zz:
830 ; SSE2-NEXT: xorps %xmm1, %xmm1
831 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
832 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
835 ; SSE3-LABEL: shuffle_v4f32_z6zz:
837 ; SSE3-NEXT: xorps %xmm1, %xmm1
838 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
839 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
842 ; SSSE3-LABEL: shuffle_v4f32_z6zz:
844 ; SSSE3-NEXT: xorps %xmm1, %xmm1
845 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
846 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
849 ; SSE41-LABEL: shuffle_v4f32_z6zz:
851 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
854 ; AVX-LABEL: shuffle_v4f32_z6zz:
856 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
858 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
859 ret <4 x float> %shuffle
; Shuffle of %a (operand 0) with a zero vector (operand 1), mask <0,4,2,3>.
; Index 4 selects a zero lane; the other indices keep %a's lanes in place, so
; the result is %a[0], 0, %a[2], %a[3] (lane 1 cleared).
862 define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
863 ; SSE2-LABEL: shuffle_v4f32_0z23:
865 ; SSE2-NEXT: xorps %xmm1, %xmm1
866 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
867 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
868 ; SSE2-NEXT: movaps %xmm1, %xmm0
871 ; SSE3-LABEL: shuffle_v4f32_0z23:
873 ; SSE3-NEXT: xorps %xmm1, %xmm1
874 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
875 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
876 ; SSE3-NEXT: movaps %xmm1, %xmm0
879 ; SSSE3-LABEL: shuffle_v4f32_0z23:
881 ; SSSE3-NEXT: xorps %xmm1, %xmm1
882 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
883 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
884 ; SSSE3-NEXT: movaps %xmm1, %xmm0
887 ; SSE41-LABEL: shuffle_v4f32_0z23:
889 ; SSE41-NEXT: xorps %xmm1, %xmm1
890 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
893 ; AVX-LABEL: shuffle_v4f32_0z23:
895 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
896 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
898 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
899 ret <4 x float> %shuffle
; Shuffle of %a (operand 0) with a zero vector (operand 1), mask <0,1,4,3>.
; Index 4 selects a zero lane; the other indices keep %a's lanes in place, so
; the result is %a[0], %a[1], 0, %a[3] (lane 2 cleared).
902 define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
903 ; SSE2-LABEL: shuffle_v4f32_01z3:
905 ; SSE2-NEXT: xorps %xmm1, %xmm1
906 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
907 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
910 ; SSE3-LABEL: shuffle_v4f32_01z3:
912 ; SSE3-NEXT: xorps %xmm1, %xmm1
913 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
914 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
917 ; SSSE3-LABEL: shuffle_v4f32_01z3:
919 ; SSSE3-NEXT: xorps %xmm1, %xmm1
920 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
921 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
924 ; SSE41-LABEL: shuffle_v4f32_01z3:
926 ; SSE41-NEXT: xorps %xmm1, %xmm1
927 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
930 ; AVX-LABEL: shuffle_v4f32_01z3:
932 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
933 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
935 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
936 ret <4 x float> %shuffle
; Shuffle of %a (operand 0) with a zero vector (operand 1), mask <0,1,2,7>.
; Index 7 selects a zero lane; the other indices keep %a's lanes in place, so
; the result is %a[0], %a[1], %a[2], 0 (lane 3 cleared).
939 define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
940 ; SSE2-LABEL: shuffle_v4f32_012z:
942 ; SSE2-NEXT: xorps %xmm1, %xmm1
943 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
944 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
947 ; SSE3-LABEL: shuffle_v4f32_012z:
949 ; SSE3-NEXT: xorps %xmm1, %xmm1
950 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
951 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
954 ; SSSE3-LABEL: shuffle_v4f32_012z:
956 ; SSSE3-NEXT: xorps %xmm1, %xmm1
957 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
958 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
961 ; SSE41-LABEL: shuffle_v4f32_012z:
963 ; SSE41-NEXT: xorps %xmm1, %xmm1
964 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
967 ; AVX-LABEL: shuffle_v4f32_012z:
969 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
970 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
972 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
973 ret <4 x float> %shuffle
; Shuffle of %a (operand 0) with a zero vector (operand 1), mask <0,4,4,3>.
; Index 4 selects a zero lane, so the result is %a[0], 0, 0, %a[3] (the two
; middle lanes cleared).
976 define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
977 ; SSE2-LABEL: shuffle_v4f32_0zz3:
979 ; SSE2-NEXT: xorps %xmm1, %xmm1
980 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
981 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
984 ; SSE3-LABEL: shuffle_v4f32_0zz3:
986 ; SSE3-NEXT: xorps %xmm1, %xmm1
987 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
988 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
991 ; SSSE3-LABEL: shuffle_v4f32_0zz3:
993 ; SSSE3-NEXT: xorps %xmm1, %xmm1
994 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
995 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
998 ; SSE41-LABEL: shuffle_v4f32_0zz3:
1000 ; SSE41-NEXT: xorps %xmm1, %xmm1
1001 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1004 ; AVX-LABEL: shuffle_v4f32_0zz3:
1006 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1007 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1009 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1010 ret <4 x float> %shuffle
; Shuffle of %v with a partially-undef constant <0.0, undef, undef, undef>,
; mask <0,4,2,4>. Index 4 selects element 0 of the constant, which is the one
; defined (zero) element, so the result is %v[0], 0, %v[2], 0. The constant's
; undef elements are never referenced by the mask.
1013 define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
1014 ; SSE2-LABEL: shuffle_v4f32_0z2z:
1016 ; SSE2-NEXT: xorps %xmm1, %xmm1
1017 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1018 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1021 ; SSE3-LABEL: shuffle_v4f32_0z2z:
1023 ; SSE3-NEXT: xorps %xmm1, %xmm1
1024 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1025 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1028 ; SSSE3-LABEL: shuffle_v4f32_0z2z:
1030 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1031 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1032 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1035 ; SSE41-LABEL: shuffle_v4f32_0z2z:
1037 ; SSE41-NEXT: xorps %xmm1, %xmm1
1038 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1041 ; AVX-LABEL: shuffle_v4f32_0z2z:
1043 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1044 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1046 %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
1047 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle with mask <undef,0,5,1>. Lanes 1-3 must be
; %a[0], %b[1], %a[1]; lane 0 is undef. The checks show the backend filling
; lane 0 with %b[0], which turns the whole pattern into a single unpcklps of
; %b with %a.
1050 define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
1051 ; SSE-LABEL: shuffle_v4f32_u051:
1053 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1054 ; SSE-NEXT: movaps %xmm1, %xmm0
1057 ; AVX-LABEL: shuffle_v4f32_u051:
1059 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1061 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1062 ret <4 x float> %shuffle
; Two chained shuffles: the first builds <undef, 0.0, 0.0, b[0]>, the second
; replaces lane 0 with a[0], giving <a[0], 0.0, 0.0, b[0]>. SSE4.1 and AVX
; runs expect the pair to fold into one insertps.
1065 define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1066 ; SSE2-LABEL: shuffle_v4f32_0zz4:
1068 ; SSE2-NEXT: xorps %xmm2, %xmm2
1069 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
1070 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1071 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1072 ; SSE2-NEXT: movaps %xmm2, %xmm0
1075 ; SSE3-LABEL: shuffle_v4f32_0zz4:
1077 ; SSE3-NEXT: xorps %xmm2, %xmm2
1078 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
1079 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1080 ; SSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1081 ; SSE3-NEXT: movaps %xmm2, %xmm0
1084 ; SSSE3-LABEL: shuffle_v4f32_0zz4:
1086 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1087 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
1088 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1089 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1090 ; SSSE3-NEXT: movaps %xmm2, %xmm0
1093 ; SSE41-LABEL: shuffle_v4f32_0zz4:
1095 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1098 ; AVX-LABEL: shuffle_v4f32_0zz4:
1100 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1102 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1103 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1104 ret <4 x float> %shuffle1
; Two chained shuffles: the first picks <a[0], undef, undef, b[2]>, the second
; fills the two undef lanes from a zero vector, giving <a[0], 0.0, 0.0, b[2]>.
; SSE4.1 and AVX runs expect one insertps.
1107 define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1108 ; SSE2-LABEL: shuffle_v4f32_0zz6:
1110 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1111 ; SSE2-NEXT: xorps %xmm1, %xmm1
1112 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1113 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1114 ; SSE2-NEXT: movaps %xmm1, %xmm0
1117 ; SSE3-LABEL: shuffle_v4f32_0zz6:
1119 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1120 ; SSE3-NEXT: xorps %xmm1, %xmm1
1121 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1122 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1123 ; SSE3-NEXT: movaps %xmm1, %xmm0
1126 ; SSSE3-LABEL: shuffle_v4f32_0zz6:
1128 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1129 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1130 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1131 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1132 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1135 ; SSE41-LABEL: shuffle_v4f32_0zz6:
1137 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1140 ; AVX-LABEL: shuffle_v4f32_0zz6:
1142 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1144 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1145 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1146 ret <4 x float> %shuffle1
; Two chained shuffles: the first picks <a[0], undef, a[2], b[0]>, the second
; replaces lane 1 with zero, giving <a[0], 0.0, a[2], b[0]>. SSE4.1 and AVX
; runs expect one insertps that both inserts b[0] and zeroes lane 1.
1149 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1150 ; SSE2-LABEL: shuffle_v4f32_0z24:
1152 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
1153 ; SSE2-NEXT: xorps %xmm2, %xmm2
1154 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
1155 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1156 ; SSE2-NEXT: movaps %xmm2, %xmm0
1159 ; SSE3-LABEL: shuffle_v4f32_0z24:
1161 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
1162 ; SSE3-NEXT: xorps %xmm2, %xmm2
1163 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
1164 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1165 ; SSE3-NEXT: movaps %xmm2, %xmm0
1168 ; SSSE3-LABEL: shuffle_v4f32_0z24:
1170 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
1171 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1172 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
1173 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1174 ; SSSE3-NEXT: movaps %xmm2, %xmm0
1177 ; SSE41-LABEL: shuffle_v4f32_0z24:
1179 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1182 ; AVX-LABEL: shuffle_v4f32_0z24:
1184 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1186 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1187 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1188 ret <4 x float> %shuffle1
; The zero vector is the FIRST shuffle operand here, so mask index 4 selects
; a[0] and indices 1-3 select zeros. Result lanes are <a[0], 0, 0, 0>.
1191 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1192 ; SSE2-LABEL: shuffle_v4i32_4zzz:
1194 ; SSE2-NEXT: xorps %xmm1, %xmm1
1195 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1196 ; SSE2-NEXT: movaps %xmm1, %xmm0
1199 ; SSE3-LABEL: shuffle_v4i32_4zzz:
1201 ; SSE3-NEXT: xorps %xmm1, %xmm1
1202 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1203 ; SSE3-NEXT: movaps %xmm1, %xmm0
1206 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
1208 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1209 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1210 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1213 ; SSE41-LABEL: shuffle_v4i32_4zzz:
1215 ; SSE41-NEXT: xorps %xmm1, %xmm1
1216 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1219 ; AVX-LABEL: shuffle_v4i32_4zzz:
1221 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1222 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1224 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1225 ret <4 x i32> %shuffle
; Zero vector is the first operand; mask index 4 selects a[0]. Result lanes
; are <0, a[0], 0, 0>. The fast-variable-shuffle AVX2 run and the AVX512VL run
; expect a single vpshufb; the other runs expect a zero-merge plus a permute.
1228 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1229 ; SSE2-LABEL: shuffle_v4i32_z4zz:
1231 ; SSE2-NEXT: xorps %xmm1, %xmm1
1232 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1233 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1236 ; SSE3-LABEL: shuffle_v4i32_z4zz:
1238 ; SSE3-NEXT: xorps %xmm1, %xmm1
1239 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1240 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1243 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
1245 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1246 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1247 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1250 ; SSE41-LABEL: shuffle_v4i32_z4zz:
1252 ; SSE41-NEXT: pxor %xmm1, %xmm1
1253 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1254 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1257 ; AVX1-LABEL: shuffle_v4i32_z4zz:
1259 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1260 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1261 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1264 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1265 ; AVX2-SLOW: # %bb.0:
1266 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1267 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1268 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1269 ; AVX2-SLOW-NEXT: retq
1271 ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1272 ; AVX2-FAST: # %bb.0:
1273 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1274 ; AVX2-FAST-NEXT: retq
1276 ; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1277 ; AVX512VL: # %bb.0:
1278 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1279 ; AVX512VL-NEXT: retq
1280 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1281 ret <4 x i32> %shuffle
; Zero vector is the first operand; mask index 4 selects a[0]. Result lanes
; are <0, 0, a[0], 0>. The fast-variable-shuffle AVX2 run and the AVX512VL run
; expect a single vpshufb; the other runs expect a zero-merge plus a permute.
1284 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1285 ; SSE2-LABEL: shuffle_v4i32_zz4z:
1287 ; SSE2-NEXT: xorps %xmm1, %xmm1
1288 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1289 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1292 ; SSE3-LABEL: shuffle_v4i32_zz4z:
1294 ; SSE3-NEXT: xorps %xmm1, %xmm1
1295 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1296 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1299 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
1301 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1302 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1303 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1306 ; SSE41-LABEL: shuffle_v4i32_zz4z:
1308 ; SSE41-NEXT: pxor %xmm1, %xmm1
1309 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1310 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1313 ; AVX1-LABEL: shuffle_v4i32_zz4z:
1315 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1316 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1317 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1320 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1321 ; AVX2-SLOW: # %bb.0:
1322 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1323 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1324 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1325 ; AVX2-SLOW-NEXT: retq
1327 ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1328 ; AVX2-FAST: # %bb.0:
1329 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1330 ; AVX2-FAST-NEXT: retq
1332 ; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1333 ; AVX512VL: # %bb.0:
1334 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1335 ; AVX512VL-NEXT: retq
1336 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1337 ret <4 x i32> %shuffle
; Zero vector is the first operand. Result lanes are <0, undef, undef, a[0]>.
; The checks expect one 12-byte left byte-shift, which moves a[0] to lane 3
; and fills the three lower lanes (including the two undef ones) with zero.
1340 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1341 ; SSE-LABEL: shuffle_v4i32_zuu4:
1343 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1346 ; AVX-LABEL: shuffle_v4i32_zuu4:
1348 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1350 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1351 ret <4 x i32> %shuffle
; Zero vector is the first operand; mask index 6 selects a[2]. Result lanes
; are <0, a[2], 0, 0>. The fast-variable-shuffle AVX2 run and the AVX512VL run
; expect a single vpshufb; the other runs need a permute plus a zero-merge.
1354 define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1355 ; SSE2-LABEL: shuffle_v4i32_z6zz:
1357 ; SSE2-NEXT: xorps %xmm1, %xmm1
1358 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
1359 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1362 ; SSE3-LABEL: shuffle_v4i32_z6zz:
1364 ; SSE3-NEXT: xorps %xmm1, %xmm1
1365 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
1366 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1369 ; SSSE3-LABEL: shuffle_v4i32_z6zz:
1371 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1372 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
1373 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1376 ; SSE41-LABEL: shuffle_v4i32_z6zz:
1378 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1379 ; SSE41-NEXT: pxor %xmm0, %xmm0
1380 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1383 ; AVX1-LABEL: shuffle_v4i32_z6zz:
1385 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1386 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1387 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1390 ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1391 ; AVX2-SLOW: # %bb.0:
1392 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1393 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1394 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1395 ; AVX2-SLOW-NEXT: retq
1397 ; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1398 ; AVX2-FAST: # %bb.0:
1399 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1400 ; AVX2-FAST-NEXT: retq
1402 ; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1403 ; AVX512VL: # %bb.0:
1404 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1405 ; AVX512VL-NEXT: retq
1406 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1407 ret <4 x i32> %shuffle
; Result lanes are <b[3], a[0], a[1], a[2]>, i.e. a rotation across the b:a
; pair by one element. Runs with SSSE3 or later expect one palignr; the SSE2
; and SSE3 runs expect a two-shufps sequence.
1410 define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1411 ; SSE2-LABEL: shuffle_v4i32_7012:
1413 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1414 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1415 ; SSE2-NEXT: movaps %xmm1, %xmm0
1418 ; SSE3-LABEL: shuffle_v4i32_7012:
1420 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1421 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1422 ; SSE3-NEXT: movaps %xmm1, %xmm0
1425 ; SSSE3-LABEL: shuffle_v4i32_7012:
1427 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1430 ; SSE41-LABEL: shuffle_v4i32_7012:
1432 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1435 ; AVX-LABEL: shuffle_v4i32_7012:
1437 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1439 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1440 ret <4 x i32> %shuffle
; Result lanes are <b[2], b[3], a[0], a[1]>: the high half of b followed by
; the low half of a. Runs with SSSE3 or later expect one palignr; the SSE2
; and SSE3 runs expect one shufps plus a register copy.
1443 define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1444 ; SSE2-LABEL: shuffle_v4i32_6701:
1446 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1447 ; SSE2-NEXT: movaps %xmm1, %xmm0
1450 ; SSE3-LABEL: shuffle_v4i32_6701:
1452 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1453 ; SSE3-NEXT: movaps %xmm1, %xmm0
1456 ; SSSE3-LABEL: shuffle_v4i32_6701:
1458 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1461 ; SSE41-LABEL: shuffle_v4i32_6701:
1463 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1466 ; AVX-LABEL: shuffle_v4i32_6701:
1468 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1470 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1471 ret <4 x i32> %shuffle
; Result lanes are <b[1], b[2], b[3], a[0]>. Runs with SSSE3 or later expect
; one palignr; the SSE2 and SSE3 runs expect a two-shufps sequence.
1474 define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1475 ; SSE2-LABEL: shuffle_v4i32_5670:
1477 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1478 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1479 ; SSE2-NEXT: movaps %xmm1, %xmm0
1482 ; SSE3-LABEL: shuffle_v4i32_5670:
1484 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1485 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1486 ; SSE3-NEXT: movaps %xmm1, %xmm0
1489 ; SSSE3-LABEL: shuffle_v4i32_5670:
1491 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1494 ; SSE41-LABEL: shuffle_v4i32_5670:
1496 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1499 ; AVX-LABEL: shuffle_v4i32_5670:
1501 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1503 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1504 ret <4 x i32> %shuffle
; Result lanes are <a[1], a[2], a[3], b[0]>. Runs with SSSE3 or later expect
; one palignr (plus a register copy on the non-AVX runs); the SSE2 and SSE3
; runs expect a two-shufps sequence.
1507 define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1508 ; SSE2-LABEL: shuffle_v4i32_1234:
1510 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1511 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1514 ; SSE3-LABEL: shuffle_v4i32_1234:
1516 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1517 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1520 ; SSSE3-LABEL: shuffle_v4i32_1234:
1522 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1523 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1526 ; SSE41-LABEL: shuffle_v4i32_1234:
1528 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1529 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1532 ; AVX-LABEL: shuffle_v4i32_1234:
1534 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1536 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1537 ret <4 x i32> %shuffle
; Result lanes are <a[2], a[3], b[0], b[1]>: the high half of a followed by
; the low half of b. The SSE2 and SSE3 runs expect a single shufps; runs with
; SSSE3 or later expect a palignr.
1540 define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1541 ; SSE2-LABEL: shuffle_v4i32_2345:
1543 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1546 ; SSE3-LABEL: shuffle_v4i32_2345:
1548 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1551 ; SSSE3-LABEL: shuffle_v4i32_2345:
1553 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1554 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1557 ; SSE41-LABEL: shuffle_v4i32_2345:
1559 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1560 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1563 ; AVX-LABEL: shuffle_v4i32_2345:
1565 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1567 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1568 ret <4 x i32> %shuffle
; Two chained shuffles: %s1 duplicates a[2] into lane 3, then %s2 takes
; <%s1[3], b[0], b[1], b[2]>, so %s2 is <a[2], b[0], b[1], b[2]>. Runs with
; SSSE3 or later are expected to keep a pshufd ahead of the palignr.
1572 define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1573 ; SSE2-LABEL: shuffle_v4i32_2456:
1575 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
1576 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1579 ; SSE3-LABEL: shuffle_v4i32_2456:
1581 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
1582 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1585 ; SSSE3-LABEL: shuffle_v4i32_2456:
1587 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
1588 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1589 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1592 ; SSE41-LABEL: shuffle_v4i32_2456:
1594 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
1595 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1596 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1599 ; AVX-LABEL: shuffle_v4i32_2456:
1601 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
1602 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1604 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1605 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Result lanes are <b[0], a[0], undef, a[1]>. With lane 2 unconstrained the
; checks expect a single low unpack of b with a on every run line.
1609 define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1610 ; SSE-LABEL: shuffle_v4i32_40u1:
1612 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1613 ; SSE-NEXT: movaps %xmm1, %xmm0
1616 ; AVX-LABEL: shuffle_v4i32_40u1:
1618 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1620 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1621 ret <4 x i32> %shuffle
; Result lanes are <a[3], b[0], b[1], b[2]>. Runs with SSSE3 or later expect
; one palignr (plus a register copy on the non-AVX runs); the SSE2 and SSE3
; runs expect a two-shufps sequence.
1624 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1625 ; SSE2-LABEL: shuffle_v4i32_3456:
1627 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1628 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1631 ; SSE3-LABEL: shuffle_v4i32_3456:
1633 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1634 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1637 ; SSSE3-LABEL: shuffle_v4i32_3456:
1639 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1640 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1643 ; SSE41-LABEL: shuffle_v4i32_3456:
1645 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1646 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1649 ; AVX-LABEL: shuffle_v4i32_3456:
1651 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1653 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1654 ret <4 x i32> %shuffle
; Result lanes are <a[0], undef, a[1], undef>; %b is not referenced by the
; mask. SSE4.1 and AVX runs expect the undef lanes to be filled with zeros
; via pmovzxdq, while the older runs expect a plain pshufd.
1657 define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1658 ; SSE2-LABEL: shuffle_v4i32_0u1u:
1660 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1663 ; SSE3-LABEL: shuffle_v4i32_0u1u:
1665 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1668 ; SSSE3-LABEL: shuffle_v4i32_0u1u:
1670 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1673 ; SSE41-LABEL: shuffle_v4i32_0u1u:
1675 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1678 ; AVX-LABEL: shuffle_v4i32_0u1u:
1680 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1682 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1683 ret <4 x i32> %shuffle
; Result lanes are <a[0], 0, a[1], 0>, i.e. the low two elements of a
; zero-extended to 64 bits each. SSE4.1 and AVX runs expect pmovzxdq; the
; older runs expect an unpack against a zeroed register.
1686 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1687 ; SSE2-LABEL: shuffle_v4i32_0z1z:
1689 ; SSE2-NEXT: xorps %xmm1, %xmm1
1690 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1693 ; SSE3-LABEL: shuffle_v4i32_0z1z:
1695 ; SSE3-NEXT: xorps %xmm1, %xmm1
1696 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1699 ; SSSE3-LABEL: shuffle_v4i32_0z1z:
1701 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1702 ; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1705 ; SSE41-LABEL: shuffle_v4i32_0z1z:
1707 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1710 ; AVX-LABEL: shuffle_v4i32_0z1z:
1712 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1714 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1715 ret <4 x i32> %shuffle
; Result lanes are <a[0], a[1], 0, undef>. The checks expect a single movq,
; which keeps the low 64 bits and zeroes the upper half (covering the undef
; lane as well).
1718 define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1719 ; SSE-LABEL: shuffle_v4i32_01zu:
1721 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1724 ; AVX-LABEL: shuffle_v4i32_01zu:
1726 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1728 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1729 ret <4 x i32> %shuffle
; Result lanes are <a[0], 0, a[2], a[3]> (lane 1 cleared). Runs before SSE4.1
; expect an AND with a constant-pool mask; SSE4.1 and AVX runs expect a blend
; against a zeroed register.
1732 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1733 ; SSE2-LABEL: shuffle_v4i32_0z23:
1735 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
1738 ; SSE3-LABEL: shuffle_v4i32_0z23:
1740 ; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
1743 ; SSSE3-LABEL: shuffle_v4i32_0z23:
1745 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
1748 ; SSE41-LABEL: shuffle_v4i32_0z23:
1750 ; SSE41-NEXT: xorps %xmm1, %xmm1
1751 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1754 ; AVX-LABEL: shuffle_v4i32_0z23:
1756 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1757 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1759 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1760 ret <4 x i32> %shuffle
; Result lanes are <a[0], a[1], 0, a[3]> (lane 2 cleared). Runs before SSE4.1
; expect an AND with a constant-pool mask; SSE4.1 and AVX runs expect a blend
; against a zeroed register.
1763 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1764 ; SSE2-LABEL: shuffle_v4i32_01z3:
1766 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
1769 ; SSE3-LABEL: shuffle_v4i32_01z3:
1771 ; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
1774 ; SSSE3-LABEL: shuffle_v4i32_01z3:
1776 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
1779 ; SSE41-LABEL: shuffle_v4i32_01z3:
1781 ; SSE41-NEXT: xorps %xmm1, %xmm1
1782 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1785 ; AVX-LABEL: shuffle_v4i32_01z3:
1787 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1788 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1790 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1791 ret <4 x i32> %shuffle
; Result lanes are <a[0], a[1], a[2], 0> (lane 3 cleared). Runs before SSE4.1
; expect an AND with a constant-pool mask; SSE4.1 and AVX runs expect a blend
; against a zeroed register.
1794 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1795 ; SSE2-LABEL: shuffle_v4i32_012z:
1797 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
1800 ; SSE3-LABEL: shuffle_v4i32_012z:
1802 ; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
1805 ; SSSE3-LABEL: shuffle_v4i32_012z:
1807 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
1810 ; SSE41-LABEL: shuffle_v4i32_012z:
1812 ; SSE41-NEXT: xorps %xmm1, %xmm1
1813 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1816 ; AVX-LABEL: shuffle_v4i32_012z:
1818 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1819 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1821 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1822 ret <4 x i32> %shuffle
; Result lanes are <a[0], 0, 0, a[3]> (lanes 1 and 2 cleared). Runs before
; SSE4.1 expect an AND with a constant-pool mask; SSE4.1 and AVX runs expect
; a blend against a zeroed register.
1825 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1826 ; SSE2-LABEL: shuffle_v4i32_0zz3:
1828 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
1831 ; SSE3-LABEL: shuffle_v4i32_0zz3:
1833 ; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
1836 ; SSSE3-LABEL: shuffle_v4i32_0zz3:
1838 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
1841 ; SSE41-LABEL: shuffle_v4i32_0zz3:
1843 ; SSE41-NEXT: xorps %xmm1, %xmm1
1844 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1847 ; AVX-LABEL: shuffle_v4i32_0zz3:
1849 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1850 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1852 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1853 ret <4 x i32> %shuffle
; A 32-bit shuffle <a[1], b[1], a[0], b[0]> followed by a swap of the two
; 64-bit halves through a <2 x double> bitcast. The net effect is
; <a[0], b[0], a[1], b[1]>, which the checks expect as one low unpack.
1856 define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1857 ; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1859 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1862 ; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1864 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1866 %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1867 %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1868 %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1869 %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1870 ret <4 x i32> %bitcast32
; %1 widens b to <b[0], b[0], b[1], b[1]>; the 64-bit shuffle then joins the
; low half of %1 with the low half of a, so %5 is <b[0], b[0], a[0], a[1]>
; seen as four floats. The checks expect a single shufps.
1873 define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1874 ; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1876 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1877 ; SSE-NEXT: movaps %xmm1, %xmm0
1880 ; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1882 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
1884 %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1885 %2 = bitcast <4 x i32> %1 to <2 x double>
1886 %3 = bitcast <4 x float> %a to <2 x double>
1887 %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1888 %5 = bitcast <2 x double> %4 to <4 x float>
; %1 is <a[0], a[0], a[1], a[1]>; %3 picks lanes 1 and 0 of %1 (both a[0])
; and lanes 0 and 1 of b reinterpreted as floats, so %3 is
; <a[0], a[0], b[0], b[1]>. The checks expect a single shufps.
1892 define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1893 ; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1895 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1898 ; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1900 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1902 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1903 %2 = bitcast <4 x i32> %b to <4 x float>
1904 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
; A blend written with and/or on <4 x i32> rather than as a shufflevector.
; a keeps lanes 1 and 2, b keeps lanes 0 and 3, and the OR merges them into
; <b[0], a[1], a[2], b[3]>. SSE4.1 and AVX runs expect one blendps.
1908 define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1909 ; SSE2-LABEL: mask_v4f32_4127:
1911 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1912 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1913 ; SSE2-NEXT: movaps %xmm1, %xmm0
1916 ; SSE3-LABEL: mask_v4f32_4127:
1918 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1919 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1920 ; SSE3-NEXT: movaps %xmm1, %xmm0
1923 ; SSSE3-LABEL: mask_v4f32_4127:
1925 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1926 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1927 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1930 ; SSE41-LABEL: mask_v4f32_4127:
1932 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1935 ; AVX-LABEL: mask_v4f32_4127:
1937 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1939 %1 = bitcast <4 x float> %a to <4 x i32>
1940 %2 = bitcast <4 x float> %b to <4 x i32>
1941 %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
1942 %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
1943 %5 = or <4 x i32> %4, %3
1944 %6 = bitcast <4 x i32> %5 to <4 x float>
; A blend written with and/or on <2 x i64>. The a-mask keeps only the upper
; 32 bits of the high quadword and the b-mask keeps everything else; per the
; expected blendps below that is 32-bit lane 3 from a and lanes 0-2 from b,
; so %6 is <b[0], b[1], b[2], a[3]>.
1948 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1949 ; SSE2-LABEL: mask_v4f32_0127:
1951 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1952 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1953 ; SSE2-NEXT: movaps %xmm1, %xmm0
1956 ; SSE3-LABEL: mask_v4f32_0127:
1958 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1959 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1960 ; SSE3-NEXT: movaps %xmm1, %xmm0
1963 ; SSSE3-LABEL: mask_v4f32_0127:
1965 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1966 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1967 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1970 ; SSE41-LABEL: mask_v4f32_0127:
1972 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1975 ; AVX-LABEL: mask_v4f32_0127:
1977 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1979 %1 = bitcast <4 x float> %a to <2 x i64>
1980 %2 = bitcast <4 x float> %b to <2 x i64>
1981 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1982 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1983 %5 = or <2 x i64> %4, %3
1984 %6 = bitcast <2 x i64> %5 to <4 x float>
; Integer variant of the <2 x i64> and/or blend. The a-mask keeps only the
; upper 32 bits of the high quadword and the b-mask keeps everything else;
; per the expected blendps below that is 32-bit lane 3 from a and lanes 0-2
; from b, so %6 is <b[0], b[1], b[2], a[3]>.
1988 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1989 ; SSE2-LABEL: mask_v4i32_0127:
1991 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1992 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1993 ; SSE2-NEXT: movaps %xmm1, %xmm0
1996 ; SSE3-LABEL: mask_v4i32_0127:
1998 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1999 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2000 ; SSE3-NEXT: movaps %xmm1, %xmm0
2003 ; SSSE3-LABEL: mask_v4i32_0127:
2005 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2006 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2007 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2010 ; SSE41-LABEL: mask_v4i32_0127:
2012 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2015 ; AVX-LABEL: mask_v4i32_0127:
2017 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2019 %1 = bitcast <4 x i32> %a to <2 x i64>
2020 %2 = bitcast <4 x i32> %b to <2 x i64>
2021 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
2022 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
2023 %5 = or <2 x i64> %4, %3
2024 %6 = bitcast <2 x i64> %5 to <4 x float>
; Loads a <2 x float> (align 1) and repeats it, so %2 is
; <x[0], x[1], x[0], x[1]>. Runs with SSE3 or later expect the load to fold
; into a single movddup; the SSE2 run expects movsd followed by movlhps.
2028 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
2029 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
2031 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2032 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2035 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2037 ; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2040 ; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2042 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2045 ; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
2047 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2050 ; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2052 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2054 %1 = load <2 x float>, <2 x float>* %x, align 1
2055 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Inserts the scalar i32 argument into lane 0 and takes lanes 1-3 from a zero
; vector, giving <a, 0, 0, 0>. The checks expect a single movd from %edi.
2059 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2060 ; SSE-LABEL: insert_reg_and_zero_v4i32:
2062 ; SSE-NEXT: movd %edi, %xmm0
2065 ; AVX-LABEL: insert_reg_and_zero_v4i32:
2067 ; AVX-NEXT: vmovd %edi, %xmm0
2069 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2070 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2071 ret <4 x i32> %shuffle
; Loads an i32, inserts it into lane 0 and takes lanes 1-3 from a zero
; vector, giving <*ptr, 0, 0, 0>. The checks expect the load, insert and
; zeroing to fold into one scalar move from memory.
2074 define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
2075 ; SSE-LABEL: insert_mem_and_zero_v4i32:
2077 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2080 ; AVX-LABEL: insert_mem_and_zero_v4i32:
2082 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2084 %a = load i32, i32* %ptr
2085 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2086 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2087 ret <4 x i32> %shuffle
; Inserts the scalar float argument into lane 0 and takes lanes 1-3 from a
; zero vector, giving <a, 0.0, 0.0, 0.0>. Runs before SSE4.1 expect xor +
; movss + a register copy; SSE4.1 and AVX runs expect xor + blend.
2090 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2091 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
2093 ; SSE2-NEXT: xorps %xmm1, %xmm1
2094 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2095 ; SSE2-NEXT: movaps %xmm1, %xmm0
2098 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
2100 ; SSE3-NEXT: xorps %xmm1, %xmm1
2101 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2102 ; SSE3-NEXT: movaps %xmm1, %xmm0
2105 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2107 ; SSSE3-NEXT: xorps %xmm1, %xmm1
2108 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2109 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2112 ; SSE41-LABEL: insert_reg_and_zero_v4f32:
2114 ; SSE41-NEXT: xorps %xmm1, %xmm1
2115 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2118 ; AVX-LABEL: insert_reg_and_zero_v4f32:
2120 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
2121 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2123 %v = insertelement <4 x float> undef, float %a, i32 0
2124 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2125 ret <4 x float> %shuffle
; Memory variant of the float insert-and-zero pattern. The float is loaded
; from %ptr, inserted into lane 0 of an undef vector, and lanes 1-3 are taken
; from zeroinitializer by the <0, 5, 6, 7> mask. The expected lowering is a
; scalar movss / vmovss load showing mem[0],zero,zero,zero.
2128 define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
2129 ; SSE-LABEL: insert_mem_and_zero_v4f32:
2131 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2134 ; AVX-LABEL: insert_mem_and_zero_v4f32:
2136 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2138 %a = load float, float* %ptr
2139 %v = insertelement <4 x float> undef, float %a, i32 0
2140 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2141 ret <4 x float> %shuffle
; Replaces the low half of %b with a 64-bit register value. The i64 %a is
; bitcast to <2 x i32>, widened to four lanes (upper two undef), and combined
; with %b using mask <0, 1, 6, 7>, so lanes 0-1 come from %a and lanes 2-3
; from %b. Expected lowering after the movq is movsd on SSE2, SSE3 and SSSE3,
; pblendw on SSE4.1 and AVX1, and vpblendd on the AVX2 / AVX512VL runs.
2144 define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2145 ; SSE2-LABEL: insert_reg_lo_v4i32:
2147 ; SSE2-NEXT: movq %rdi, %xmm1
2148 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2151 ; SSE3-LABEL: insert_reg_lo_v4i32:
2153 ; SSE3-NEXT: movq %rdi, %xmm1
2154 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2157 ; SSSE3-LABEL: insert_reg_lo_v4i32:
2159 ; SSSE3-NEXT: movq %rdi, %xmm1
2160 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2163 ; SSE41-LABEL: insert_reg_lo_v4i32:
2165 ; SSE41-NEXT: movq %rdi, %xmm1
2166 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2169 ; AVX1-LABEL: insert_reg_lo_v4i32:
2171 ; AVX1-NEXT: vmovq %rdi, %xmm1
2172 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2175 ; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2176 ; AVX2OR512VL: # %bb.0:
2177 ; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
2178 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2179 ; AVX2OR512VL-NEXT: retq
2180 %a.cast = bitcast i64 %a to <2 x i32>
2181 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2182 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2183 ret <4 x i32> %shuffle
; Replaces the low half of %b with a <2 x i32> loaded from %ptr. The loaded
; pair is widened to four lanes (upper two undef) and combined with %b using
; mask <0, 1, 6, 7>, so lanes 0-1 come from memory and lanes 2-3 from %b.
; SSE2, SSE3 and SSSE3 runs expect a single movlps with a memory operand;
; the SSE4.1 and AVX runs expect a movsd load followed by blendps.
2186 define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2187 ; SSE2-LABEL: insert_mem_lo_v4i32:
2189 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2192 ; SSE3-LABEL: insert_mem_lo_v4i32:
2194 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2197 ; SSSE3-LABEL: insert_mem_lo_v4i32:
2199 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2202 ; SSE41-LABEL: insert_mem_lo_v4i32:
2204 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2205 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2208 ; AVX-LABEL: insert_mem_lo_v4i32:
2210 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2211 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2213 %a = load <2 x i32>, <2 x i32>* %ptr
2214 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2215 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2216 ret <4 x i32> %shuffle
; Places a 64-bit register value in the high half of the result. The i64 %a
; is bitcast to <2 x i32> and widened to four lanes; mask <4, 5, 0, 1> then
; takes lanes 0-1 from the low half of %b and lanes 2-3 from %a. The expected
; lowering is movq followed by punpcklqdq / vpunpcklqdq.
2219 define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2220 ; SSE-LABEL: insert_reg_hi_v4i32:
2222 ; SSE-NEXT: movq %rdi, %xmm1
2223 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2226 ; AVX-LABEL: insert_reg_hi_v4i32:
2228 ; AVX-NEXT: vmovq %rdi, %xmm1
2229 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2231 %a.cast = bitcast i64 %a to <2 x i32>
2232 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2233 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2234 ret <4 x i32> %shuffle
; Places a <2 x i32> loaded from %ptr in the high half of the result. The
; loaded pair is widened to four lanes; mask <4, 5, 0, 1> takes lanes 0-1
; from the low half of %b and lanes 2-3 from the loaded value. The expected
; lowering is a movsd load followed by movlhps / vmovlhps.
2237 define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2238 ; SSE-LABEL: insert_mem_hi_v4i32:
2240 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2241 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2244 ; AVX-LABEL: insert_mem_hi_v4i32:
2246 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2247 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2249 %a = load <2 x i32>, <2 x i32>* %ptr
2250 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2251 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2252 ret <4 x i32> %shuffle
; Float counterpart of the register low-half insert. The double %a is bitcast
; to <2 x float>, widened to four lanes (upper two undef), and combined with
; %b using mask <0, 1, 6, 7>, so lanes 0-1 come from %a and lanes 2-3 from %b.
; SSE2, SSE3 and SSSE3 runs expect shufps; SSE4.1 and AVX runs expect blendps.
2255 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2256 ; SSE2-LABEL: insert_reg_lo_v4f32:
2258 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2261 ; SSE3-LABEL: insert_reg_lo_v4f32:
2263 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2266 ; SSSE3-LABEL: insert_reg_lo_v4f32:
2268 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2271 ; SSE41-LABEL: insert_reg_lo_v4f32:
2273 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2276 ; AVX-LABEL: insert_reg_lo_v4f32:
2278 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2280 %a.cast = bitcast double %a to <2 x float>
2281 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2282 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2283 ret <4 x float> %shuffle
; Replaces the low half of %b with a <2 x float> loaded from %ptr. Mask
; <0, 1, 6, 7> takes lanes 0-1 from the widened load and lanes 2-3 from %b.
; The expected lowering is a single movlps / vmovlps with a memory operand.
2286 define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2287 ; SSE-LABEL: insert_mem_lo_v4f32:
2289 ; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2292 ; AVX-LABEL: insert_mem_lo_v4f32:
2294 ; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2296 %a = load <2 x float>, <2 x float>* %ptr
2297 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2298 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2299 ret <4 x float> %shuffle
; Float counterpart of the register high-half insert. The double %a is bitcast
; to <2 x float> and widened to four lanes; mask <4, 5, 0, 1> takes lanes 0-1
; from the low half of %b and lanes 2-3 from %a. The SSE runs expect movlhps
; into %xmm1 plus a movaps copy back to %xmm0; the AVX runs expect a single
; vmovlhps.
2302 define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2303 ; SSE-LABEL: insert_reg_hi_v4f32:
2305 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2306 ; SSE-NEXT: movaps %xmm1, %xmm0
2309 ; AVX-LABEL: insert_reg_hi_v4f32:
2311 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2313 %a.cast = bitcast double %a to <2 x float>
2314 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2315 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2316 ret <4 x float> %shuffle
; Places a <2 x float> loaded from %ptr in the high half of the result. Mask
; <4, 5, 0, 1> takes lanes 0-1 from the low half of %b and lanes 2-3 from the
; widened load. The expected lowering is a single movhps / vmovhps with a
; memory operand.
2319 define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2320 ; SSE-LABEL: insert_mem_hi_v4f32:
2322 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2325 ; AVX-LABEL: insert_mem_hi_v4f32:
2327 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2329 %a = load <2 x float>, <2 x float>* %ptr
2330 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2331 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2332 ret <4 x float> %shuffle
; Reverses the four lanes of a <4 x float> loaded from %ptr (mask
; <3, 2, 1, 0>). The SSE runs expect a separate movaps load followed by
; shufps; the AVX runs expect vpermilps taking the memory operand directly.
2336 define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
2337 ; SSE-LABEL: shuffle_mem_v4f32_3210:
2339 ; SSE-NEXT: movaps (%rdi), %xmm0
2340 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2343 ; AVX-LABEL: shuffle_mem_v4f32_3210:
2345 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2347 %a = load <4 x float>, <4 x float>* %ptr
2348 %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2349 ret <4 x float> %shuffle
; Splat of a scalar loaded from memory. The i32 is loaded from %ptr (align 4),
; inserted into lane 0 of a zero vector, and %tmp2 broadcasts lane 0 to all
; four lanes via an all-zero shuffle mask. The SSE runs expect a movd load
; followed by pshufd [0,0,0,0]; the AVX runs expect a vbroadcastss directly
; from (%rdi).
2352 define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
2353 ; SSE-LABEL: insert_dup_mem_v4i32:
2355 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2356 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2359 ; AVX-LABEL: insert_dup_mem_v4i32:
2361 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
2363 %tmp = load i32, i32* %ptr, align 4
2364 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2365 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Uses one <2 x float> load from %p0 in two different shuffles.
;   %2 places the loaded pair in lanes 2-3 (lanes 0-1 undef).
;   %3 interleaves those lanes with 0.0 using mask <2, 6, 3, 7>, giving
;      [%1[0], 0.0, %1[1], 0.0]; this value is stored to %p1.
;   %4 broadcasts element 0 of the loaded pair to all four lanes.
; All runs expect a movsd load, an unpcklps against a zeroed register for the
; stored value, and a separate splat of lane 0 (shufps on the SSE runs,
; vpermilps on AVX1, vbroadcastss on the AVX2 / AVX512VL runs).
2370 define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) {
2371 ; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2373 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2374 ; SSE-NEXT: xorps %xmm1, %xmm1
2375 ; SSE-NEXT: movaps %xmm0, %xmm2
2376 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2377 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2378 ; SSE-NEXT: movaps %xmm2, (%rsi)
2381 ; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2383 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2384 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
2385 ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2386 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2387 ; AVX1-NEXT: vmovaps %xmm1, (%rsi)
2390 ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2391 ; AVX2OR512VL: # %bb.0:
2392 ; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2393 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
2394 ; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2395 ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0
2396 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi)
2397 ; AVX2OR512VL-NEXT: retq
2398 %1 = load <2 x float>, <2 x float>* %p0
2399 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2400 %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2401 %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2402 store <4 x float> %3, <4 x float>* %p1
2407 ; Shuffle to logical bit shifts
; Shuffle of %a with a zero vector using mask <4, 0, 4, undef>. Lanes 0 and 2
; are zero, lane 1 is %a[0], and lane 3 is left undef. The expected lowering
; is a single 64-bit logical left shift by 32 bits (psllq / vpsllq $32).
2410 define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2411 ; SSE-LABEL: shuffle_v4i32_z0zX:
2413 ; SSE-NEXT: psllq $32, %xmm0
2416 ; AVX-LABEL: shuffle_v4i32_z0zX:
2418 ; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
2420 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2421 ret <4 x i32> %shuffle
; Shuffle of %a with a zero vector using mask <1, 4, 3, 4>. The result is
; [%a[1], 0, %a[3], 0]. The expected lowering is a single 64-bit logical
; right shift by 32 bits (psrlq / vpsrlq $32).
2424 define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2425 ; SSE-LABEL: shuffle_v4i32_1z3z:
2427 ; SSE-NEXT: psrlq $32, %xmm0
2430 ; AVX-LABEL: shuffle_v4i32_1z3z:
2432 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
2434 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2435 ret <4 x i32> %shuffle
; Combines the low half of %a with the low half of a vector loaded from %pb.
; The load is declared with align 1, and mask <0, 1, 4, 5> yields
; [%a[0], %a[1], %b[0], %b[1]]. The SSE runs expect movhps with a memory
; operand; the AVX runs expect vunpcklpd with a memory operand.
2438 define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
2439 ; SSE-LABEL: shuffle_mem_v4f32_0145:
2441 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2444 ; AVX-LABEL: shuffle_mem_v4f32_0145:
2446 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2448 %b = load <4 x float>, <4 x float>* %pb, align 1
2449 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2450 ret <4 x float> %shuffle
; Replaces the low half of %a with the low half of a vector loaded from %pb.
; The load is declared with align 1, and mask <4, 5, 2, 3> yields
; [%b[0], %b[1], %a[2], %a[3]]. SSE2, SSE3 and SSSE3 runs expect movlps with
; a memory operand; the SSE4.1 runs expect a movups load followed by blendps;
; the AVX runs expect vblendps with a memory operand.
2453 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
2454 ; SSE2-LABEL: shuffle_mem_v4f32_4523:
2456 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2459 ; SSE3-LABEL: shuffle_mem_v4f32_4523:
2461 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2464 ; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2466 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2469 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
2471 ; SSE41-NEXT: movups (%rdi), %xmm1
2472 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2475 ; AVX-LABEL: shuffle_mem_v4f32_4523:
2477 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2479 %b = load <4 x float>, <4 x float>* %pb, align 1
2480 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2481 ret <4 x float> %shuffle