1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
10 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
11 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
; Single-source <4 x i32> shuffle with mask <0,0,0,1>; every index is below 4, so only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
13 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
14 ; SSE-LABEL: shuffle_v4i32_0001:
16 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
19 ; AVX-LABEL: shuffle_v4i32_0001:
21 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
23 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
24 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,0,2,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
26 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
27 ; SSE-LABEL: shuffle_v4i32_0020:
29 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
32 ; AVX-LABEL: shuffle_v4i32_0020:
34 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
36 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
37 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,1,1,2>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
39 define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
40 ; SSE-LABEL: shuffle_v4i32_0112:
42 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
45 ; AVX-LABEL: shuffle_v4i32_0112:
47 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
49 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
50 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <0,3,0,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
52 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
53 ; SSE-LABEL: shuffle_v4i32_0300:
55 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
58 ; AVX-LABEL: shuffle_v4i32_0300:
60 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
62 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
63 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <1,0,0,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
65 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
66 ; SSE-LABEL: shuffle_v4i32_1000:
68 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
71 ; AVX-LABEL: shuffle_v4i32_1000:
73 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
75 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
76 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <2,2,0,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
78 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
79 ; SSE-LABEL: shuffle_v4i32_2200:
81 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
84 ; AVX-LABEL: shuffle_v4i32_2200:
86 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
88 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
89 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with mask <3,3,3,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
91 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
92 ; SSE-LABEL: shuffle_v4i32_3330:
94 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
97 ; AVX-LABEL: shuffle_v4i32_3330:
99 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
101 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
102 ret <4 x i32> %shuffle
; Single-source <4 x i32> lane reversal, mask <3,2,1,0>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
104 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
105 ; SSE-LABEL: shuffle_v4i32_3210:
107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
110 ; AVX-LABEL: shuffle_v4i32_3210:
112 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
114 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
115 ret <4 x i32> %shuffle
; Single-source <4 x i32> shuffle with the repeating mask <2,1,2,1>; only %a is read.
; The checks expect one pshufd on the SSE runs and one vshufps on the AVX runs.
118 define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
119 ; SSE-LABEL: shuffle_v4i32_2121:
121 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
124 ; AVX-LABEL: shuffle_v4i32_2121:
126 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,1]
128 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
129 ret <4 x i32> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,0,1>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
132 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
133 ; SSE-LABEL: shuffle_v4f32_0001:
135 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
138 ; AVX-LABEL: shuffle_v4f32_0001:
140 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
142 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
143 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,0,2,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
145 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
146 ; SSE-LABEL: shuffle_v4f32_0020:
148 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
151 ; AVX-LABEL: shuffle_v4f32_0020:
153 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
155 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
156 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <0,3,0,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
158 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
159 ; SSE-LABEL: shuffle_v4f32_0300:
161 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
164 ; AVX-LABEL: shuffle_v4f32_0300:
166 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
168 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
169 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <1,0,0,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
171 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
172 ; SSE-LABEL: shuffle_v4f32_1000:
174 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
177 ; AVX-LABEL: shuffle_v4f32_1000:
179 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
181 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
182 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <2,2,0,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
184 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
185 ; SSE-LABEL: shuffle_v4f32_2200:
187 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
190 ; AVX-LABEL: shuffle_v4f32_2200:
192 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
194 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
195 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle with mask <3,3,3,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
197 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
198 ; SSE-LABEL: shuffle_v4f32_3330:
200 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
203 ; AVX-LABEL: shuffle_v4f32_3330:
205 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
207 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
208 ret <4 x float> %shuffle
; Single-source <4 x float> lane reversal, mask <3,2,1,0>; only %a is read.
; The checks expect one shufps on the SSE runs and one vshufps on the AVX runs.
210 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
211 ; SSE-LABEL: shuffle_v4f32_3210:
213 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
216 ; AVX-LABEL: shuffle_v4f32_3210:
218 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
220 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
221 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle duplicating the two low lanes, mask <0,0,1,1>; only %a is read.
; The checks expect a self-unpcklps on the SSE runs and a vshufps on the AVX runs.
223 define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
224 ; SSE-LABEL: shuffle_v4f32_0011:
226 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
229 ; AVX-LABEL: shuffle_v4f32_0011:
231 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
233 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
234 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle duplicating the two high lanes, mask <2,2,3,3>; only %a is read.
; The checks expect a self-unpckhps on the SSE runs and a vshufps on the AVX runs.
236 define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
237 ; SSE-LABEL: shuffle_v4f32_2233:
239 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
242 ; AVX-LABEL: shuffle_v4f32_2233:
244 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
246 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
247 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle duplicating the even lanes, mask <0,0,2,2>; only %a is read.
; The checks expect shufps for the plain SSE2 run and movsldup / vmovsldup for the
; SSE3, SSSE3, SSE4.1 and AVX runs.
249 define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
250 ; SSE2-LABEL: shuffle_v4f32_0022:
252 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
255 ; SSE3-LABEL: shuffle_v4f32_0022:
257 ; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
260 ; SSSE3-LABEL: shuffle_v4f32_0022:
262 ; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
265 ; SSE41-LABEL: shuffle_v4f32_0022:
267 ; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
270 ; AVX-LABEL: shuffle_v4f32_0022:
272 ; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
274 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
275 ret <4 x float> %shuffle
; Single-source <4 x float> shuffle duplicating the odd lanes, mask <1,1,3,3>; only %a is read.
; The checks expect shufps for the plain SSE2 run and movshdup / vmovshdup for the
; SSE3, SSSE3, SSE4.1 and AVX runs.
277 define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
278 ; SSE2-LABEL: shuffle_v4f32_1133:
280 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
283 ; SSE3-LABEL: shuffle_v4f32_1133:
285 ; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
288 ; SSSE3-LABEL: shuffle_v4f32_1133:
290 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
293 ; SSE41-LABEL: shuffle_v4f32_1133:
295 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
298 ; AVX-LABEL: shuffle_v4f32_1133:
300 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
302 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
303 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle, mask <0,1,4,5>: low half of %a followed by low half of %b.
; The checks expect a single movlhps / vmovlhps.
306 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
307 ; SSE-LABEL: shuffle_v4f32_0145:
309 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
312 ; AVX-LABEL: shuffle_v4f32_0145:
314 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
316 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
317 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle, mask <6,7,2,3>: high half of %b followed by high half of %a.
; The checks expect movhlps on the SSE runs and vunpckhpd on the AVX runs.
320 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
321 ; SSE-LABEL: shuffle_v4f32_6723:
323 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
326 ; AVX-LABEL: shuffle_v4f32_6723:
328 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
330 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
331 ret <4 x float> %shuffle
; Two-source <4 x i32> shuffle, mask <0,1,2,4>: lanes 0-2 of %a with lane 0 of %b placed in lane 3.
; Expected sequences per run: two shufps (SSE2/SSE3/SSSE3), pshufd + pblendw (SSE4.1),
; splat of %b lane 0 + vblendps (AVX1 uses vshufps, AVX2 uses vbroadcastss), and an
; index vector [0,1,2,4] feeding vpermt2d (AVX512VL).
334 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
335 ; SSE2-LABEL: shuffle_v4i32_0124:
337 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
341 ; SSE3-LABEL: shuffle_v4i32_0124:
343 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
344 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
347 ; SSSE3-LABEL: shuffle_v4i32_0124:
349 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
350 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
353 ; SSE41-LABEL: shuffle_v4i32_0124:
355 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
356 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
359 ; AVX1-LABEL: shuffle_v4i32_0124:
361 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
362 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
365 ; AVX2-LABEL: shuffle_v4i32_0124:
367 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
368 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
371 ; AVX512VL-LABEL: shuffle_v4i32_0124:
373 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,4]
374 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
375 ; AVX512VL-NEXT: retq
376 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
377 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <0,1,4,2>: lane 0 of %b is inserted at lane 2, pushing %a lane 2 to lane 3.
; Expected sequences per run: two shufps (SSE2/SSE3/SSSE3), two pshufd + pblendw (SSE4.1),
; two permutes + vblendps (AVX1/AVX2), and an index vector [0,1,4,2] feeding vpermt2d (AVX512VL).
379 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
380 ; SSE2-LABEL: shuffle_v4i32_0142:
382 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
383 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
386 ; SSE3-LABEL: shuffle_v4i32_0142:
388 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
389 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
392 ; SSSE3-LABEL: shuffle_v4i32_0142:
394 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
395 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
398 ; SSE41-LABEL: shuffle_v4i32_0142:
400 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
401 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
402 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
405 ; AVX1-LABEL: shuffle_v4i32_0142:
407 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
408 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
409 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
412 ; AVX2-LABEL: shuffle_v4i32_0142:
414 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
415 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
416 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
419 ; AVX512VL-LABEL: shuffle_v4i32_0142:
421 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,4,2]
422 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
423 ; AVX512VL-NEXT: retq
424 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
425 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <0,4,1,2>: lane 0 of %b is inserted at lane 1, shifting %a lanes 1-2 up.
; Expected sequences per run: movlhps + shufps + movaps (SSE2/SSE3/SSSE3), two pshufd + pblendw (SSE4.1),
; two permutes + vblendps (AVX1/AVX2), and an index vector [0,4,1,2] feeding vpermt2d (AVX512VL).
427 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
428 ; SSE2-LABEL: shuffle_v4i32_0412:
430 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
431 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
432 ; SSE2-NEXT: movaps %xmm1, %xmm0
435 ; SSE3-LABEL: shuffle_v4i32_0412:
437 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
438 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
439 ; SSE3-NEXT: movaps %xmm1, %xmm0
442 ; SSSE3-LABEL: shuffle_v4i32_0412:
444 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
445 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
446 ; SSSE3-NEXT: movaps %xmm1, %xmm0
449 ; SSE41-LABEL: shuffle_v4i32_0412:
451 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
452 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
453 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
456 ; AVX1-LABEL: shuffle_v4i32_0412:
458 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
459 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
460 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
463 ; AVX2-LABEL: shuffle_v4i32_0412:
465 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
466 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
467 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
470 ; AVX512VL-LABEL: shuffle_v4i32_0412:
472 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,1,2]
473 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
474 ; AVX512VL-NEXT: retq
475 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
476 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <4,0,1,2>: lane 0 of %b goes to lane 0, followed by %a lanes 0-2.
; Expected sequences per run: movlhps + shufps + movaps (SSE2/SSE3/SSSE3), pshufd + pblendw (SSE4.1),
; vshufps + vblendps (shared AVX1/AVX2 prefix), and an index vector [4,0,1,2] feeding vpermt2d (AVX512VL).
478 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
479 ; SSE2-LABEL: shuffle_v4i32_4012:
481 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
482 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
483 ; SSE2-NEXT: movaps %xmm1, %xmm0
486 ; SSE3-LABEL: shuffle_v4i32_4012:
488 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
489 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
490 ; SSE3-NEXT: movaps %xmm1, %xmm0
493 ; SSSE3-LABEL: shuffle_v4i32_4012:
495 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
496 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
497 ; SSSE3-NEXT: movaps %xmm1, %xmm0
500 ; SSE41-LABEL: shuffle_v4i32_4012:
502 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
503 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
506 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
508 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
509 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
512 ; AVX512VL-LABEL: shuffle_v4i32_4012:
514 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,2]
515 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
516 ; AVX512VL-NEXT: retq
517 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
518 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <0,1,4,5>: low half of %a followed by low half of %b.
; The checks expect a single movlhps / vmovlhps even though the element type is integer.
520 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
521 ; SSE-LABEL: shuffle_v4i32_0145:
523 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
526 ; AVX-LABEL: shuffle_v4i32_0145:
528 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
530 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
531 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <0,4,5,1>: result is a[0], b[0], b[1], a[1].
; The checks expect an interleave of the low halves (punpckldq / vunpcklps) followed by a
; permute [0,1,3,2]; the AVX512VL runs instead expect an index vector [0,4,5,1] feeding vpermt2d.
533 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
534 ; SSE-LABEL: shuffle_v4i32_0451:
536 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
540 ; AVX1OR2-LABEL: shuffle_v4i32_0451:
542 ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
543 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
546 ; AVX512VL-LABEL: shuffle_v4i32_0451:
548 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,5,1]
549 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
550 ; AVX512VL-NEXT: retq
551 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
552 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <4,5,0,1>: low half of %b followed by low half of %a.
; The checks expect movlhps into xmm1 plus a movaps copy on the SSE runs, and a single
; three-operand vmovlhps on the AVX runs.
554 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
555 ; SSE-LABEL: shuffle_v4i32_4501:
557 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
558 ; SSE-NEXT: movaps %xmm1, %xmm0
561 ; AVX-LABEL: shuffle_v4i32_4501:
563 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
565 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
566 ret <4 x i32> %shuffle
; Two-source <4 x i32> shuffle, mask <4,0,1,5>: result is b[0], a[0], a[1], b[1].
; The checks expect an interleave of the low halves (punpckldq / vunpcklps) followed by a
; permute [1,0,2,3]; the AVX512VL runs instead expect an index vector [4,0,1,5] feeding vpermt2d.
568 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
569 ; SSE-LABEL: shuffle_v4i32_4015:
571 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
572 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
575 ; AVX1OR2-LABEL: shuffle_v4i32_4015:
577 ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
578 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
581 ; AVX512VL-LABEL: shuffle_v4i32_4015:
583 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,5]
584 ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
585 ; AVX512VL-NEXT: retq
586 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
587 ret <4 x i32> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <4,1,2,3>: lane 0 is a[0], lanes 1-3 are zero.
; The checks expect xorps + movss + movaps before SSE4.1, and a zeroing xor plus blendps / vblendps
; on the SSE4.1 and AVX runs.
590 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
591 ; SSE2-LABEL: shuffle_v4f32_4zzz:
593 ; SSE2-NEXT: xorps %xmm1, %xmm1
594 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
595 ; SSE2-NEXT: movaps %xmm1, %xmm0
598 ; SSE3-LABEL: shuffle_v4f32_4zzz:
600 ; SSE3-NEXT: xorps %xmm1, %xmm1
601 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
602 ; SSE3-NEXT: movaps %xmm1, %xmm0
605 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
607 ; SSSE3-NEXT: xorps %xmm1, %xmm1
608 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
609 ; SSSE3-NEXT: movaps %xmm1, %xmm0
612 ; SSE41-LABEL: shuffle_v4f32_4zzz:
614 ; SSE41-NEXT: xorps %xmm1, %xmm1
615 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
618 ; AVX-LABEL: shuffle_v4f32_4zzz:
620 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
621 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
623 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
624 ret <4 x float> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <2,4,3,0>: lane 1 is a[0], all other lanes
; come from the zero vector. The checks expect movq + xorps + shufps before SSE4.1 and a single
; zeroing insertps / vinsertps on the SSE4.1 and AVX runs.
627 define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
628 ; SSE2-LABEL: shuffle_v4f32_z4zz:
630 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
631 ; SSE2-NEXT: xorps %xmm1, %xmm1
632 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
635 ; SSE3-LABEL: shuffle_v4f32_z4zz:
637 ; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
638 ; SSE3-NEXT: xorps %xmm1, %xmm1
639 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
642 ; SSSE3-LABEL: shuffle_v4f32_z4zz:
644 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
645 ; SSSE3-NEXT: xorps %xmm1, %xmm1
646 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
649 ; SSE41-LABEL: shuffle_v4f32_z4zz:
651 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
654 ; AVX-LABEL: shuffle_v4f32_z4zz:
656 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
658 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
659 ret <4 x float> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <0,0,4,0>: lane 2 is a[0], all other lanes
; are zero. The checks expect movq + pxor + shufps before SSE4.1 and a single zeroing
; insertps / vinsertps on the SSE4.1 and AVX runs.
662 define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
663 ; SSE2-LABEL: shuffle_v4f32_zz4z:
665 ; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
666 ; SSE2-NEXT: pxor %xmm0, %xmm0
667 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
670 ; SSE3-LABEL: shuffle_v4f32_zz4z:
672 ; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
673 ; SSE3-NEXT: pxor %xmm0, %xmm0
674 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
677 ; SSSE3-LABEL: shuffle_v4f32_zz4z:
679 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
680 ; SSSE3-NEXT: pxor %xmm0, %xmm0
681 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
684 ; SSE41-LABEL: shuffle_v4f32_zz4z:
686 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
689 ; AVX-LABEL: shuffle_v4f32_zz4z:
691 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
693 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
694 ret <4 x float> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <0,undef,undef,4>: lane 0 is zero, lane 3 is
; a[0], lanes 1-2 are undefined. The checks expect xorps + shufps + movaps before SSE4.1 and a single
; insertps / vinsertps (which zeroes the undefined lanes as well) on the SSE4.1 and AVX runs.
697 define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
698 ; SSE2-LABEL: shuffle_v4f32_zuu4:
700 ; SSE2-NEXT: xorps %xmm1, %xmm1
701 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
702 ; SSE2-NEXT: movaps %xmm1, %xmm0
705 ; SSE3-LABEL: shuffle_v4f32_zuu4:
707 ; SSE3-NEXT: xorps %xmm1, %xmm1
708 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
709 ; SSE3-NEXT: movaps %xmm1, %xmm0
712 ; SSSE3-LABEL: shuffle_v4f32_zuu4:
714 ; SSSE3-NEXT: xorps %xmm1, %xmm1
715 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
716 ; SSSE3-NEXT: movaps %xmm1, %xmm0
719 ; SSE41-LABEL: shuffle_v4f32_zuu4:
721 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
724 ; AVX-LABEL: shuffle_v4f32_zuu4:
726 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
728 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
729 ret <4 x float> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <0,1,2,7>: lanes 0-2 are zero, lane 3 is a[3].
; The checks expect xorps + two shufps + movaps before SSE4.1, and a zeroing xor plus
; blendps / vblendps on the SSE4.1 and AVX runs.
732 define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
733 ; SSE2-LABEL: shuffle_v4f32_zzz7:
735 ; SSE2-NEXT: xorps %xmm1, %xmm1
736 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
737 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
738 ; SSE2-NEXT: movaps %xmm1, %xmm0
741 ; SSE3-LABEL: shuffle_v4f32_zzz7:
743 ; SSE3-NEXT: xorps %xmm1, %xmm1
744 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
745 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
746 ; SSE3-NEXT: movaps %xmm1, %xmm0
749 ; SSSE3-LABEL: shuffle_v4f32_zzz7:
751 ; SSSE3-NEXT: xorps %xmm1, %xmm1
752 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
753 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
754 ; SSSE3-NEXT: movaps %xmm1, %xmm0
757 ; SSE41-LABEL: shuffle_v4f32_zzz7:
759 ; SSE41-NEXT: xorps %xmm1, %xmm1
760 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
763 ; AVX-LABEL: shuffle_v4f32_zzz7:
765 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
766 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
768 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
769 ret <4 x float> %shuffle
; Shuffle of a zero vector (first operand) with %a, mask <0,6,2,3>: lane 1 is a[2], all other lanes
; are zero. The checks expect xorps + unpckhpd + shufps before SSE4.1 and a single zeroing
; insertps / vinsertps on the SSE4.1 and AVX runs.
772 define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
773 ; SSE2-LABEL: shuffle_v4f32_z6zz:
775 ; SSE2-NEXT: xorps %xmm1, %xmm1
776 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
777 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
780 ; SSE3-LABEL: shuffle_v4f32_z6zz:
782 ; SSE3-NEXT: xorps %xmm1, %xmm1
783 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
784 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
787 ; SSSE3-LABEL: shuffle_v4f32_z6zz:
789 ; SSSE3-NEXT: xorps %xmm1, %xmm1
790 ; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
791 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
794 ; SSE41-LABEL: shuffle_v4f32_z6zz:
796 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
799 ; AVX-LABEL: shuffle_v4f32_z6zz:
801 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
803 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
804 ret <4 x float> %shuffle
; Shuffle of %a with a zero vector (second operand), mask <0,4,2,3>: lane 1 is zeroed, the rest of %a
; is kept. The checks expect xorps + movlhps + shufps + movaps before SSE4.1, and a zeroing xor plus
; blendps / vblendps on the SSE4.1 and AVX runs.
807 define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
808 ; SSE2-LABEL: shuffle_v4f32_0z23:
810 ; SSE2-NEXT: xorps %xmm1, %xmm1
811 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
812 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
813 ; SSE2-NEXT: movaps %xmm1, %xmm0
816 ; SSE3-LABEL: shuffle_v4f32_0z23:
818 ; SSE3-NEXT: xorps %xmm1, %xmm1
819 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
820 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
821 ; SSE3-NEXT: movaps %xmm1, %xmm0
824 ; SSSE3-LABEL: shuffle_v4f32_0z23:
826 ; SSSE3-NEXT: xorps %xmm1, %xmm1
827 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
828 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
829 ; SSSE3-NEXT: movaps %xmm1, %xmm0
832 ; SSE41-LABEL: shuffle_v4f32_0z23:
834 ; SSE41-NEXT: xorps %xmm1, %xmm1
835 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
838 ; AVX-LABEL: shuffle_v4f32_0z23:
840 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
841 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
843 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
844 ret <4 x float> %shuffle
; Shuffle of %a with a zero vector (second operand), mask <0,1,4,3>: lane 2 is zeroed, the rest of %a
; is kept. The checks expect xorps + two shufps before SSE4.1, and a zeroing xor plus
; blendps / vblendps on the SSE4.1 and AVX runs.
847 define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
848 ; SSE2-LABEL: shuffle_v4f32_01z3:
850 ; SSE2-NEXT: xorps %xmm1, %xmm1
851 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
852 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
855 ; SSE3-LABEL: shuffle_v4f32_01z3:
857 ; SSE3-NEXT: xorps %xmm1, %xmm1
858 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
859 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
862 ; SSSE3-LABEL: shuffle_v4f32_01z3:
864 ; SSSE3-NEXT: xorps %xmm1, %xmm1
865 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
866 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
869 ; SSE41-LABEL: shuffle_v4f32_01z3:
871 ; SSE41-NEXT: xorps %xmm1, %xmm1
872 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
875 ; AVX-LABEL: shuffle_v4f32_01z3:
877 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
878 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
880 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
881 ret <4 x float> %shuffle
; Shuffle of %a with a zero vector (second operand), mask <0,1,2,7>: lane 3 is zeroed, lanes 0-2 of %a
; are kept. The checks expect xorps + two shufps before SSE4.1, and a zeroing xor plus
; blendps / vblendps on the SSE4.1 and AVX runs.
884 define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
885 ; SSE2-LABEL: shuffle_v4f32_012z:
887 ; SSE2-NEXT: xorps %xmm1, %xmm1
888 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
889 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
892 ; SSE3-LABEL: shuffle_v4f32_012z:
894 ; SSE3-NEXT: xorps %xmm1, %xmm1
895 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
896 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
899 ; SSSE3-LABEL: shuffle_v4f32_012z:
901 ; SSSE3-NEXT: xorps %xmm1, %xmm1
902 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
903 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
906 ; SSE41-LABEL: shuffle_v4f32_012z:
908 ; SSE41-NEXT: xorps %xmm1, %xmm1
909 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
912 ; AVX-LABEL: shuffle_v4f32_012z:
914 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
915 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
917 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
918 ret <4 x float> %shuffle
; Shuffle of %a with a zero vector (second operand), mask <0,4,4,3>: lanes 1-2 are zeroed, lanes 0 and 3
; of %a are kept. The checks expect xorps + two shufps before SSE4.1, and a zeroing xor plus
; blendps / vblendps on the SSE4.1 and AVX runs.
921 define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
922 ; SSE2-LABEL: shuffle_v4f32_0zz3:
924 ; SSE2-NEXT: xorps %xmm1, %xmm1
925 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
926 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
929 ; SSE3-LABEL: shuffle_v4f32_0zz3:
931 ; SSE3-NEXT: xorps %xmm1, %xmm1
932 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
933 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
936 ; SSSE3-LABEL: shuffle_v4f32_0zz3:
938 ; SSSE3-NEXT: xorps %xmm1, %xmm1
939 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
940 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
943 ; SSE41-LABEL: shuffle_v4f32_0zz3:
945 ; SSE41-NEXT: xorps %xmm1, %xmm1
946 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
949 ; AVX-LABEL: shuffle_v4f32_0zz3:
951 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
952 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
954 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
955 ret <4 x float> %shuffle
; Shuffle of %v with the constant <0.0, undef, undef, undef>, mask <0,4,2,4>: index 4 selects the
; constant's zero lane, so lanes 1 and 3 are zero and lanes 0 and 2 of %v are kept.
; The checks expect xorps + two shufps before SSE4.1, and a zeroing xor plus blendps / vblendps
; on the SSE4.1 and AVX runs.
958 define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
959 ; SSE2-LABEL: shuffle_v4f32_0z2z:
961 ; SSE2-NEXT: xorps %xmm1, %xmm1
962 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
963 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
966 ; SSE3-LABEL: shuffle_v4f32_0z2z:
968 ; SSE3-NEXT: xorps %xmm1, %xmm1
969 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
970 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
973 ; SSSE3-LABEL: shuffle_v4f32_0z2z:
975 ; SSSE3-NEXT: xorps %xmm1, %xmm1
976 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
977 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
980 ; SSE41-LABEL: shuffle_v4f32_0z2z:
982 ; SSE41-NEXT: xorps %xmm1, %xmm1
983 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
986 ; AVX-LABEL: shuffle_v4f32_0z2z:
988 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
989 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
991 %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
992 ret <4 x float> %shuffle
; Two-source <4 x float> shuffle, mask <undef,0,5,1>: lane 0 is undefined, then a[0], b[1], a[1].
; The checks expect the undefined lane to be filled so that one unpcklps / vunpcklps of (%b, %a)
; suffices; the SSE runs need an extra movaps to move the result into xmm0.
995 define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
996 ; SSE-LABEL: shuffle_v4f32_u051:
998 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
999 ; SSE-NEXT: movaps %xmm1, %xmm0
1002 ; AVX-LABEL: shuffle_v4f32_u051:
1004 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1006 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1007 ret <4 x float> %shuffle
; Chain of two shuffles. The first builds <undef, 0, 0, b[0]> from %b and a zero vector; the second
; takes lane 0 from %a and lanes 1-3 from that intermediate, giving <a[0], 0, 0, b[0]>.
; The checks expect movq + pxor + shufps + movss + movaps before SSE4.1 and a single
; insertps / vinsertps on the SSE4.1 and AVX runs.
1010 define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1011 ; SSE2-LABEL: shuffle_v4f32_0zz4:
1013 ; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1014 ; SSE2-NEXT: pxor %xmm1, %xmm1
1015 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1016 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1017 ; SSE2-NEXT: movaps %xmm1, %xmm0
1020 ; SSE3-LABEL: shuffle_v4f32_0zz4:
1022 ; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1023 ; SSE3-NEXT: pxor %xmm1, %xmm1
1024 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1025 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1026 ; SSE3-NEXT: movaps %xmm1, %xmm0
1029 ; SSSE3-LABEL: shuffle_v4f32_0zz4:
1031 ; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
1032 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1033 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1034 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1035 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1038 ; SSE41-LABEL: shuffle_v4f32_0zz4:
1040 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1043 ; AVX-LABEL: shuffle_v4f32_0zz4:
1045 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1047 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1048 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1049 ret <4 x float> %shuffle1
; Chain of two shuffles. The first builds <a[0], undef, undef, b[2]>; the second keeps its lanes 0 and 3
; and takes lanes 1-2 from a zero vector, giving <a[0], 0, 0, b[2]>.
; The checks expect three shufps + xorps + movaps before SSE4.1 and a single insertps / vinsertps
; on the SSE4.1 and AVX runs.
1052 define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1053 ; SSE2-LABEL: shuffle_v4f32_0zz6:
1055 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1056 ; SSE2-NEXT: xorps %xmm1, %xmm1
1057 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1058 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1059 ; SSE2-NEXT: movaps %xmm1, %xmm0
1062 ; SSE3-LABEL: shuffle_v4f32_0zz6:
1064 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1065 ; SSE3-NEXT: xorps %xmm1, %xmm1
1066 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1067 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1068 ; SSE3-NEXT: movaps %xmm1, %xmm0
1071 ; SSSE3-LABEL: shuffle_v4f32_0zz6:
1073 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1074 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1075 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1076 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1077 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1080 ; SSE41-LABEL: shuffle_v4f32_0zz6:
1082 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1085 ; AVX-LABEL: shuffle_v4f32_0zz6:
1087 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1089 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1090 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1091 ret <4 x float> %shuffle1
; Chains two shuffles so that the returned vector is <a[0], 0, a[2], b[0]>.
; The SSE4.1 and AVX check lines expect one insertps / vinsertps; the older
; SSE levels are checked against a shufps + xorps + movlhps + shufps sequence.
1094 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1095 ; SSE2-LABEL: shuffle_v4f32_0z24:
1097 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1098 ; SSE2-NEXT: xorps %xmm2, %xmm2
1099 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1100 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1101 ; SSE2-NEXT: movaps %xmm2, %xmm0
1104 ; SSE3-LABEL: shuffle_v4f32_0z24:
1106 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1107 ; SSE3-NEXT: xorps %xmm2, %xmm2
1108 ; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1109 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1110 ; SSE3-NEXT: movaps %xmm2, %xmm0
1113 ; SSSE3-LABEL: shuffle_v4f32_0z24:
1115 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1116 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1117 ; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1118 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1119 ; SSSE3-NEXT: movaps %xmm2, %xmm0
1122 ; SSE41-LABEL: shuffle_v4f32_0z24:
1124 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1127 ; AVX-LABEL: shuffle_v4f32_0z24:
1129 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; %shuffle is <a[0], undef, a[2], b[0]>; %shuffle1 replaces the undef lane 1
; with a zero taken from the zeroinitializer operand.
1131 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1132 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1133 ret <4 x float> %shuffle1
; Shuffles zeroinitializer with %a using mask <4, 1, 2, 3>, i.e. the result is
; <a[0], 0, 0, 0>. The check lines expect a zeroed register combined with
; xmm0 via movss (pre-SSE4.1) or blendps / vblendps (SSE4.1 and AVX).
1136 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1137 ; SSE2-LABEL: shuffle_v4i32_4zzz:
1139 ; SSE2-NEXT: xorps %xmm1, %xmm1
1140 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1141 ; SSE2-NEXT: movaps %xmm1, %xmm0
1144 ; SSE3-LABEL: shuffle_v4i32_4zzz:
1146 ; SSE3-NEXT: xorps %xmm1, %xmm1
1147 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1148 ; SSE3-NEXT: movaps %xmm1, %xmm0
1151 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
1153 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1154 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1155 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1158 ; SSE41-LABEL: shuffle_v4i32_4zzz:
1160 ; SSE41-NEXT: xorps %xmm1, %xmm1
1161 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1164 ; AVX-LABEL: shuffle_v4i32_4zzz:
1166 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1167 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1169 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1170 ret <4 x i32> %shuffle
; Shuffles zeroinitializer with %a using mask <2, 4, 3, 0>, i.e. the result is
; <0, a[0], 0, 0>. Most configurations are checked against "isolate a[0] next
; to zeros, then permute"; the AVX2-FAST and AVX512VL groups expect a single
; vpshufb instead.
1173 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1174 ; SSE2-LABEL: shuffle_v4i32_z4zz:
1176 ; SSE2-NEXT: xorps %xmm1, %xmm1
1177 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1178 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1181 ; SSE3-LABEL: shuffle_v4i32_z4zz:
1183 ; SSE3-NEXT: xorps %xmm1, %xmm1
1184 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1185 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1188 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
1190 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1191 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1192 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1195 ; SSE41-LABEL: shuffle_v4i32_z4zz:
1197 ; SSE41-NEXT: pxor %xmm1, %xmm1
1198 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1199 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1202 ; AVX1-LABEL: shuffle_v4i32_z4zz:
1204 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1205 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1206 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1209 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1210 ; AVX2-SLOW: # %bb.0:
1211 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1212 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1213 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1214 ; AVX2-SLOW-NEXT: retq
1216 ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1217 ; AVX2-FAST: # %bb.0:
1218 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1219 ; AVX2-FAST-NEXT: retq
1221 ; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1222 ; AVX512VL: # %bb.0:
1223 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1224 ; AVX512VL-NEXT: retq
1225 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1226 ret <4 x i32> %shuffle
; Shuffles zeroinitializer with %a using mask <0, 0, 4, 0>, i.e. the result is
; <0, 0, a[0], 0>. Most configurations are checked against "isolate a[0] next
; to zeros, then permute"; the AVX2-FAST and AVX512VL groups expect a single
; vpshufb instead.
1229 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1230 ; SSE2-LABEL: shuffle_v4i32_zz4z:
1232 ; SSE2-NEXT: xorps %xmm1, %xmm1
1233 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1234 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1237 ; SSE3-LABEL: shuffle_v4i32_zz4z:
1239 ; SSE3-NEXT: xorps %xmm1, %xmm1
1240 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1241 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1244 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
1246 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1247 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1248 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1251 ; SSE41-LABEL: shuffle_v4i32_zz4z:
1253 ; SSE41-NEXT: pxor %xmm1, %xmm1
1254 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1255 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1258 ; AVX1-LABEL: shuffle_v4i32_zz4z:
1260 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1261 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1262 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1265 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1266 ; AVX2-SLOW: # %bb.0:
1267 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1268 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1269 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1270 ; AVX2-SLOW-NEXT: retq
1272 ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1273 ; AVX2-FAST: # %bb.0:
1274 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1275 ; AVX2-FAST-NEXT: retq
1277 ; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1278 ; AVX512VL: # %bb.0:
1279 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1280 ; AVX512VL-NEXT: retq
1281 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1282 ret <4 x i32> %shuffle
; Shuffles zeroinitializer with %a using mask <0, undef, undef, 4>: lane 0 is
; zero, lane 3 is a[0], and the middle lanes are undef. The check lines expect
; one byte shift (pslldq / vpslldq) that moves xmm0[0..3] into the top four
; bytes and zero-fills the rest.
1285 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1286 ; SSE-LABEL: shuffle_v4i32_zuu4:
1288 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1291 ; AVX-LABEL: shuffle_v4i32_zuu4:
1293 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1295 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1296 ret <4 x i32> %shuffle
; Shuffles zeroinitializer with %a using mask <0, 6, 2, 3>, i.e. the result is
; <0, a[2], 0, 0>. The expected lowering differs per feature level; the
; AVX2-FAST and AVX512VL groups expect a single vpshufb.
1299 define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1300 ; SSE2-LABEL: shuffle_v4i32_z6zz:
1302 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1303 ; SSE2-NEXT: xorps %xmm1, %xmm1
1304 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1307 ; SSE3-LABEL: shuffle_v4i32_z6zz:
1309 ; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1310 ; SSE3-NEXT: xorps %xmm1, %xmm1
1311 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1314 ; SSSE3-LABEL: shuffle_v4i32_z6zz:
1316 ; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1317 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1318 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1321 ; SSE41-LABEL: shuffle_v4i32_z6zz:
1323 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1324 ; SSE41-NEXT: pxor %xmm0, %xmm0
1325 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1328 ; AVX1-LABEL: shuffle_v4i32_z6zz:
1330 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1331 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1332 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1335 ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1336 ; AVX2-SLOW: # %bb.0:
1337 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1338 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1339 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1340 ; AVX2-SLOW-NEXT: retq
1342 ; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1343 ; AVX2-FAST: # %bb.0:
1344 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1345 ; AVX2-FAST-NEXT: retq
1347 ; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1348 ; AVX512VL: # %bb.0:
1349 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1350 ; AVX512VL-NEXT: retq
1351 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1352 ret <4 x i32> %shuffle
; Two-input shuffle with mask <7, 0, 1, 2>: the result is <b[3], a[0], a[1], a[2]>,
; a one-element rotation across the b:a pair. From SSSE3 upward the check
; lines expect a single palignr / vpalignr; SSE2 and SSE3 expect two shufps.
1355 define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1356 ; SSE2-LABEL: shuffle_v4i32_7012:
1358 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1359 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1360 ; SSE2-NEXT: movaps %xmm1, %xmm0
1363 ; SSE3-LABEL: shuffle_v4i32_7012:
1365 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1366 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1367 ; SSE3-NEXT: movaps %xmm1, %xmm0
1370 ; SSSE3-LABEL: shuffle_v4i32_7012:
1372 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1375 ; SSE41-LABEL: shuffle_v4i32_7012:
1377 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1380 ; AVX-LABEL: shuffle_v4i32_7012:
1382 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1384 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1385 ret <4 x i32> %shuffle
; Two-input shuffle with mask <6, 7, 0, 1>: the result is <b[2], b[3], a[0], a[1]>.
; From SSSE3 upward the check lines expect a single palignr / vpalignr;
; SSE2 and SSE3 expect one shufps followed by a register move.
1388 define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1389 ; SSE2-LABEL: shuffle_v4i32_6701:
1391 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1392 ; SSE2-NEXT: movaps %xmm1, %xmm0
1395 ; SSE3-LABEL: shuffle_v4i32_6701:
1397 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1398 ; SSE3-NEXT: movaps %xmm1, %xmm0
1401 ; SSSE3-LABEL: shuffle_v4i32_6701:
1403 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1406 ; SSE41-LABEL: shuffle_v4i32_6701:
1408 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1411 ; AVX-LABEL: shuffle_v4i32_6701:
1413 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1415 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1416 ret <4 x i32> %shuffle
; Two-input shuffle with mask <5, 6, 7, 0>: the result is <b[1], b[2], b[3], a[0]>.
; From SSSE3 upward the check lines expect a single palignr / vpalignr;
; SSE2 and SSE3 expect two shufps.
1419 define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1420 ; SSE2-LABEL: shuffle_v4i32_5670:
1422 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1423 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1424 ; SSE2-NEXT: movaps %xmm1, %xmm0
1427 ; SSE3-LABEL: shuffle_v4i32_5670:
1429 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1430 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1431 ; SSE3-NEXT: movaps %xmm1, %xmm0
1434 ; SSSE3-LABEL: shuffle_v4i32_5670:
1436 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1439 ; SSE41-LABEL: shuffle_v4i32_5670:
1441 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1444 ; AVX-LABEL: shuffle_v4i32_5670:
1446 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1448 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1449 ret <4 x i32> %shuffle
; Two-input shuffle with mask <1, 2, 3, 4>: the result is <a[1], a[2], a[3], b[0]>.
; From SSSE3 upward the check lines expect a palignr / vpalignr (the non-VEX
; forms are followed by a movdqa into xmm0); SSE2 and SSE3 expect two shufps.
1452 define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1453 ; SSE2-LABEL: shuffle_v4i32_1234:
1455 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1456 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1459 ; SSE3-LABEL: shuffle_v4i32_1234:
1461 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1462 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1465 ; SSSE3-LABEL: shuffle_v4i32_1234:
1467 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1468 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1471 ; SSE41-LABEL: shuffle_v4i32_1234:
1473 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1474 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1477 ; AVX-LABEL: shuffle_v4i32_1234:
1479 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1481 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1482 ret <4 x i32> %shuffle
; Two-input shuffle with mask <2, 3, 4, 5>: the result is <a[2], a[3], b[0], b[1]>.
; SSE2 and SSE3 are checked against a single shufps; from SSSE3 upward the
; check lines expect palignr / vpalignr.
1485 define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1486 ; SSE2-LABEL: shuffle_v4i32_2345:
1488 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1491 ; SSE3-LABEL: shuffle_v4i32_2345:
1493 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1496 ; SSSE3-LABEL: shuffle_v4i32_2345:
1498 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1499 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1502 ; SSE41-LABEL: shuffle_v4i32_2345:
1504 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1505 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1508 ; AVX-LABEL: shuffle_v4i32_2345:
1510 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1512 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1513 ret <4 x i32> %shuffle
; Builds <a[2], b[0], b[1], b[2]> through two chained shuffles: the first
; duplicates a[2] into lane 3, the second then selects <3, 4, 5, 6> across
; that intermediate and %b. The AVX512VL group expects a vpermi2d with a
; constant index vector; SSSE3, SSE4.1 and AVX1/AVX2 expect a splat of lane 2
; followed by palignr / vpalignr; SSE2 and SSE3 expect two shufps.
1517 define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1518 ; SSE2-LABEL: shuffle_v4i32_2456:
1520 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1521 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1524 ; SSE3-LABEL: shuffle_v4i32_2456:
1526 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1527 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1530 ; SSSE3-LABEL: shuffle_v4i32_2456:
1532 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1533 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1534 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1537 ; SSE41-LABEL: shuffle_v4i32_2456:
1539 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1540 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1541 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1544 ; AVX1OR2-LABEL: shuffle_v4i32_2456:
1546 ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1547 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1548 ; AVX1OR2-NEXT: retq
1550 ; AVX512VL-LABEL: shuffle_v4i32_2456:
1551 ; AVX512VL: # %bb.0:
1552 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,1,2]
1553 ; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2
1554 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
1555 ; AVX512VL-NEXT: retq
; %s1 = <a[0], a[1], a[2], a[2]>; %s2 = <%s1[3], b[0], b[1], b[2]>.
1556 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1557 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Two-input shuffle with mask <4, 0, undef, 1>: lanes are b[0], a[0], undef,
; a[1]. The check lines expect a single low-half interleave (unpcklps /
; vunpcklps) of xmm1 with xmm0.
1561 define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1562 ; SSE-LABEL: shuffle_v4i32_40u1:
1564 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1565 ; SSE-NEXT: movaps %xmm1, %xmm0
1568 ; AVX-LABEL: shuffle_v4i32_40u1:
1570 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1572 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1573 ret <4 x i32> %shuffle
; Two-input shuffle with mask <3, 4, 5, 6>: the result is <a[3], b[0], b[1], b[2]>.
; From SSSE3 upward the check lines expect palignr / vpalignr; SSE2 and SSE3
; expect two shufps.
1576 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1577 ; SSE2-LABEL: shuffle_v4i32_3456:
1579 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1580 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1583 ; SSE3-LABEL: shuffle_v4i32_3456:
1585 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1586 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1589 ; SSSE3-LABEL: shuffle_v4i32_3456:
1591 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1592 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1595 ; SSE41-LABEL: shuffle_v4i32_3456:
1597 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1598 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1601 ; AVX-LABEL: shuffle_v4i32_3456:
1603 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1605 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1606 ret <4 x i32> %shuffle
; Shuffle with mask <0, undef, 1, undef>: lanes 0 and 2 are a[0] and a[1], the
; odd lanes are undef, and no lane is selected from %b. Pre-SSE4.1 levels are
; checked against a pshufd; SSE4.1 and AVX are checked against a zero-extending
; pmovzxdq / vpmovzxdq, which fills the undef lanes with zero.
1609 define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1610 ; SSE2-LABEL: shuffle_v4i32_0u1u:
1612 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1615 ; SSE3-LABEL: shuffle_v4i32_0u1u:
1617 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1620 ; SSSE3-LABEL: shuffle_v4i32_0u1u:
1622 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1625 ; SSE41-LABEL: shuffle_v4i32_0u1u:
1627 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1630 ; AVX-LABEL: shuffle_v4i32_0u1u:
1632 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1634 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1635 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 5, 1, 7>, i.e. the result is
; <a[0], 0, a[1], 0>. Pre-SSE4.1 levels are checked against xorps + unpcklps;
; SSE4.1 and AVX are checked against a single pmovzxdq / vpmovzxdq.
1638 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1639 ; SSE2-LABEL: shuffle_v4i32_0z1z:
1641 ; SSE2-NEXT: xorps %xmm1, %xmm1
1642 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1645 ; SSE3-LABEL: shuffle_v4i32_0z1z:
1647 ; SSE3-NEXT: xorps %xmm1, %xmm1
1648 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1651 ; SSSE3-LABEL: shuffle_v4i32_0z1z:
1653 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1654 ; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1657 ; SSE41-LABEL: shuffle_v4i32_0z1z:
1659 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1662 ; AVX-LABEL: shuffle_v4i32_0z1z:
1664 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1666 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1667 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 1, 7, undef>: lanes are
; a[0], a[1], 0, undef. The check lines expect a single movq / vmovq that keeps
; the low 64 bits of xmm0 and zeroes the upper half.
1670 define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1671 ; SSE-LABEL: shuffle_v4i32_01zu:
1673 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1676 ; AVX-LABEL: shuffle_v4i32_01zu:
1678 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1680 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1681 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 4, 2, 3>, i.e. only lane 1
; is zeroed: <a[0], 0, a[2], a[3]>. Pre-SSE4.1 levels are checked against an
; andps with a constant-pool operand; SSE4.1 and AVX against xorps + blendps.
1684 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1685 ; SSE2-LABEL: shuffle_v4i32_0z23:
1687 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1690 ; SSE3-LABEL: shuffle_v4i32_0z23:
1692 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1695 ; SSSE3-LABEL: shuffle_v4i32_0z23:
1697 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1700 ; SSE41-LABEL: shuffle_v4i32_0z23:
1702 ; SSE41-NEXT: xorps %xmm1, %xmm1
1703 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1706 ; AVX-LABEL: shuffle_v4i32_0z23:
1708 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1709 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1711 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1712 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 1, 4, 3>, i.e. only lane 2
; is zeroed: <a[0], a[1], 0, a[3]>. Pre-SSE4.1 levels are checked against an
; andps with a constant-pool operand; SSE4.1 and AVX against xorps + blendps.
1715 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1716 ; SSE2-LABEL: shuffle_v4i32_01z3:
1718 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1721 ; SSE3-LABEL: shuffle_v4i32_01z3:
1723 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1726 ; SSSE3-LABEL: shuffle_v4i32_01z3:
1728 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1731 ; SSE41-LABEL: shuffle_v4i32_01z3:
1733 ; SSE41-NEXT: xorps %xmm1, %xmm1
1734 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1737 ; AVX-LABEL: shuffle_v4i32_01z3:
1739 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1740 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1742 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1743 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 1, 2, 7>, i.e. only lane 3
; is zeroed: <a[0], a[1], a[2], 0>. Pre-SSE4.1 levels are checked against an
; andps with a constant-pool operand; SSE4.1 and AVX against xorps + blendps.
1746 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1747 ; SSE2-LABEL: shuffle_v4i32_012z:
1749 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1752 ; SSE3-LABEL: shuffle_v4i32_012z:
1754 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1757 ; SSSE3-LABEL: shuffle_v4i32_012z:
1759 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1762 ; SSE41-LABEL: shuffle_v4i32_012z:
1764 ; SSE41-NEXT: xorps %xmm1, %xmm1
1765 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1768 ; AVX-LABEL: shuffle_v4i32_012z:
1770 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1771 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1773 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1774 ret <4 x i32> %shuffle
; Shuffles %a with zeroinitializer using mask <0, 4, 4, 3>, i.e. the two
; middle lanes are zeroed: <a[0], 0, 0, a[3]>. Pre-SSE4.1 levels are checked
; against an andps with a constant-pool operand; SSE4.1 and AVX against
; xorps + blendps.
1777 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1778 ; SSE2-LABEL: shuffle_v4i32_0zz3:
1780 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1783 ; SSE3-LABEL: shuffle_v4i32_0zz3:
1785 ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1788 ; SSSE3-LABEL: shuffle_v4i32_0zz3:
1790 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1793 ; SSE41-LABEL: shuffle_v4i32_0zz3:
1795 ; SSE41-NEXT: xorps %xmm1, %xmm1
1796 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1799 ; AVX-LABEL: shuffle_v4i32_0zz3:
1801 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1802 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1804 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1805 ret <4 x i32> %shuffle
; Shuffle -> bitcast -> shuffle -> bitcast chain whose net effect is the
; interleave <a[0], b[0], a[1], b[1]>. The check lines expect the whole chain
; to fold into a single unpcklps / vunpcklps.
1808 define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1809 ; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1811 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1814 ; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1816 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; %shuffle32 = <a[1], b[1], a[0], b[0]>; viewed as <2 x double>, swapping the
; two 64-bit halves gives <a[0], b[0], a[1], b[1]> once cast back to <4 x i32>.
1818 %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1819 %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1820 %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1821 %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1822 ret <4 x i32> %bitcast32
; Mixes 32-bit and 64-bit element shuffles across bitcasts; %5 ends up as
; <b[0], b[0], a[0], a[1]>. The check lines expect the chain to fold into a
; single shufps / vshufps.
1825 define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1826 ; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1828 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1829 ; SSE-NEXT: movaps %xmm1, %xmm0
1832 ; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1834 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; %1 = <b[0], b[0], b[1], b[1]>; %4 joins the low 64 bits of %1 (as %2) with
; the low 64 bits of %a (as %3).
1836 %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1837 %2 = bitcast <4 x i32> %1 to <2 x double>
1838 %3 = bitcast <4 x float> %a to <2 x double>
1839 %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1840 %5 = bitcast <2 x double> %4 to <4 x float>
; Shuffles a pre-shuffled %a with %b reinterpreted as <4 x float>; %3 ends up
; as <a[0], a[0], b[0], b[1]>. The check lines expect a single
; shufps / vshufps.
1844 define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1845 ; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1847 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1850 ; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1852 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; %1 = <a[0], a[0], a[1], a[1]>, so lanes 1 and 0 of %1 are both a[0].
1854 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1855 %2 = bitcast <4 x i32> %b to <4 x float>
1856 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
; Expresses a lane blend as integer and/or masking: lanes 1 and 2 are kept
; from %a, lanes 0 and 3 from %b, so %6 is <b[0], a[1], a[2], b[3]>.
; SSE4.1 and AVX are checked against a single blendps / vblendps; older SSE
; levels against two shufps.
1860 define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1861 ; SSE2-LABEL: mask_v4f32_4127:
1863 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1864 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1865 ; SSE2-NEXT: movaps %xmm1, %xmm0
1868 ; SSE3-LABEL: mask_v4f32_4127:
1870 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1871 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1872 ; SSE3-NEXT: movaps %xmm1, %xmm0
1875 ; SSSE3-LABEL: mask_v4f32_4127:
1877 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1878 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1879 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1882 ; SSE41-LABEL: mask_v4f32_4127:
1884 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1887 ; AVX-LABEL: mask_v4f32_4127:
1889 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; The two and-masks are complementary per lane, so the or acts as a select.
1891 %1 = bitcast <4 x float> %a to <4 x i32>
1892 %2 = bitcast <4 x float> %b to <4 x i32>
1893 %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
1894 %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
1895 %5 = or <4 x i32> %4, %3
1896 %6 = bitcast <4 x i32> %5 to <4 x float>
; Expresses a 32-bit lane blend as <2 x i64> and/or masking. The masks keep
; the upper 32 bits of the high i64 of %a and everything else from %b; on the
; little-endian x86-64 target used by this test that is 32-bit lane 3 of %a
; plus lanes 0..2 of %b. SSE4.1 and AVX are checked against a single
; blendps / vblendps; older SSE levels against two shufps.
1900 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1901 ; SSE2-LABEL: mask_v4f32_0127:
1903 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1904 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1905 ; SSE2-NEXT: movaps %xmm1, %xmm0
1908 ; SSE3-LABEL: mask_v4f32_0127:
1910 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1911 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1912 ; SSE3-NEXT: movaps %xmm1, %xmm0
1915 ; SSSE3-LABEL: mask_v4f32_0127:
1917 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1918 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1919 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1922 ; SSE41-LABEL: mask_v4f32_0127:
1924 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1927 ; AVX-LABEL: mask_v4f32_0127:
1929 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; -4294967296 is 0xFFFFFFFF00000000 and 4294967295 is 0x00000000FFFFFFFF, so
; the two and-masks are bitwise complements of each other.
1931 %1 = bitcast <4 x float> %a to <2 x i64>
1932 %2 = bitcast <4 x float> %b to <2 x i64>
1933 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1934 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1935 %5 = or <2 x i64> %4, %3
1936 %6 = bitcast <2 x i64> %5 to <4 x float>
; Integer-vector variant of the <2 x i64> and/or lane blend: the masks keep
; the upper 32 bits of the high i64 of %a and everything else from %b; on the
; little-endian x86-64 target used by this test that is 32-bit lane 3 of %a
; plus lanes 0..2 of %b. SSE4.1 and AVX are checked against a single
; blendps / vblendps; older SSE levels against two shufps.
1940 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1941 ; SSE2-LABEL: mask_v4i32_0127:
1943 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1944 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1945 ; SSE2-NEXT: movaps %xmm1, %xmm0
1948 ; SSE3-LABEL: mask_v4i32_0127:
1950 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1951 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1952 ; SSE3-NEXT: movaps %xmm1, %xmm0
1955 ; SSSE3-LABEL: mask_v4i32_0127:
1957 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1958 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1959 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1962 ; SSE41-LABEL: mask_v4i32_0127:
1964 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1967 ; AVX-LABEL: mask_v4i32_0127:
1969 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; -4294967296 is 0xFFFFFFFF00000000 and 4294967295 is 0x00000000FFFFFFFF, so
; the two and-masks are bitwise complements of each other.
1971 %1 = bitcast <4 x i32> %a to <2 x i64>
1972 %2 = bitcast <4 x i32> %b to <2 x i64>
1973 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1974 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1975 %5 = or <2 x i64> %4, %3
1976 %6 = bitcast <2 x i64> %5 to <4 x i32>
; Loads a <2 x float> from %x (align 1) and widens it to <4 x float> by
; repeating the pair: <x[0], x[1], x[0], x[1]>. From SSE3 upward the check
; lines expect a single movddup / vmovddup from memory; SSE2 is checked
; against movsd + movlhps.
1980 define <4 x float> @broadcast_v4f32_0101_from_v2f32(ptr %x) {
1981 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
1983 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1984 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
1987 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
1989 ; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
1992 ; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
1994 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
1997 ; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
1999 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
2002 ; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2004 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2006 %1 = load <2 x float>, ptr %x, align 1
2007 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Extracts lane 3 of %a1 and inserts it into lane 0 of %a0, so %2 is
; <a1[3], a0[1], a0[2], a0[3]>. Pre-SSE4.1 levels are checked against
; pshufd + movss; SSE4.1 and AVX against an extractps to a GPR followed by
; pinsrd.
2011 define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
2012 ; SSE2-LABEL: extract3_insert0_v4i32_7123:
2014 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2015 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2018 ; SSE3-LABEL: extract3_insert0_v4i32_7123:
2020 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2021 ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2024 ; SSSE3-LABEL: extract3_insert0_v4i32_7123:
2026 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2027 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2030 ; SSE41-LABEL: extract3_insert0_v4i32_7123:
2032 ; SSE41-NEXT: extractps $3, %xmm1, %eax
2033 ; SSE41-NEXT: pinsrd $0, %eax, %xmm0
2036 ; AVX-LABEL: extract3_insert0_v4i32_7123:
2038 ; AVX-NEXT: vextractps $3, %xmm1, %eax
2039 ; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
2041 %1 = extractelement <4 x i32> %a1, i32 3
2042 %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
; Extracts lane 3 of %a1 and inserts it into lane 3 of %a0, so %2 is
; <a0[0], a0[1], a0[2], a1[3]>. SSE4.1 and AVX are checked against a single
; blendps / vblendps; older SSE levels against two shufps.
2046 define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
2047 ; SSE2-LABEL: extract3_insert3_v4i32_0127:
2049 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2050 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2053 ; SSE3-LABEL: extract3_insert3_v4i32_0127:
2055 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2056 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2059 ; SSSE3-LABEL: extract3_insert3_v4i32_0127:
2061 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2062 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2065 ; SSE41-LABEL: extract3_insert3_v4i32_0127:
2067 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2070 ; AVX-LABEL: extract3_insert3_v4i32_0127:
2072 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2074 %1 = extractelement <4 x i32> %a1, i32 3
2075 %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
; Places the scalar i32 argument in lane 0 and zeroes lanes 1..3, returning
; <a, 0, 0, 0>. The check lines expect a single movd / vmovd from edi.
2079 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2080 ; SSE-LABEL: insert_reg_and_zero_v4i32:
2082 ; SSE-NEXT: movd %edi, %xmm0
2085 ; AVX-LABEL: insert_reg_and_zero_v4i32:
2087 ; AVX-NEXT: vmovd %edi, %xmm0
2089 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2090 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2091 ret <4 x i32> %shuffle
; Loads an i32 from %ptr, places it in lane 0 and zeroes lanes 1..3, returning
; <*ptr, 0, 0, 0>. The check lines expect a single zero-extending scalar load
; (movss / vmovss from memory).
2094 define <4 x i32> @insert_mem_and_zero_v4i32(ptr %ptr) {
2095 ; SSE-LABEL: insert_mem_and_zero_v4i32:
2097 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2100 ; AVX-LABEL: insert_mem_and_zero_v4i32:
2102 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2104 %a = load i32, ptr %ptr
2105 %v = insertelement <4 x i32> undef, i32 %a, i32 0
2106 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2107 ret <4 x i32> %shuffle
; Places the scalar float argument in lane 0 and zeroes lanes 1..3, returning
; <a, 0, 0, 0>. The check lines expect a zeroed register combined with xmm0
; via movss (pre-SSE4.1) or blendps / vblendps (SSE4.1 and AVX).
2110 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2111 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
2113 ; SSE2-NEXT: xorps %xmm1, %xmm1
2114 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2115 ; SSE2-NEXT: movaps %xmm1, %xmm0
2118 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
2120 ; SSE3-NEXT: xorps %xmm1, %xmm1
2121 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2122 ; SSE3-NEXT: movaps %xmm1, %xmm0
2125 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2127 ; SSSE3-NEXT: xorps %xmm1, %xmm1
2128 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2129 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2132 ; SSE41-LABEL: insert_reg_and_zero_v4f32:
2134 ; SSE41-NEXT: xorps %xmm1, %xmm1
2135 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2138 ; AVX-LABEL: insert_reg_and_zero_v4f32:
2140 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
2141 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2143 %v = insertelement <4 x float> undef, float %a, i32 0
2144 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2145 ret <4 x float> %shuffle
; Loads a float from %ptr into lane 0 and takes lanes 1-3 from
; zeroinitializer (mask 0,5,6,7).
; Expected lowering: a movss/vmovss load annotated "mem[0],zero,zero,zero".
2148 define <4 x float> @insert_mem_and_zero_v4f32(ptr %ptr) {
2149 ; SSE-LABEL: insert_mem_and_zero_v4f32:
2151 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2154 ; AVX-LABEL: insert_mem_and_zero_v4f32:
2156 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2158 %a = load float, ptr %ptr
2159 %v = insertelement <4 x float> undef, float %a, i32 0
2160 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2161 ret <4 x float> %shuffle
; Bitcasts the i64 argument to <2 x i32> and places it in lanes 0-1, keeping
; lanes 2-3 of %b (mask 0,1,6,7).
; Expected lowering: movq from %rdi followed by movsd (SSE2/SSE3/SSSE3),
; pblendw/vpblendw (SSE4.1 and AVX1) or vpblendd (AVX2 and AVX512VL runs).
2164 define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2165 ; SSE2-LABEL: insert_reg_lo_v4i32:
2167 ; SSE2-NEXT: movq %rdi, %xmm1
2168 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2171 ; SSE3-LABEL: insert_reg_lo_v4i32:
2173 ; SSE3-NEXT: movq %rdi, %xmm1
2174 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2177 ; SSSE3-LABEL: insert_reg_lo_v4i32:
2179 ; SSSE3-NEXT: movq %rdi, %xmm1
2180 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2183 ; SSE41-LABEL: insert_reg_lo_v4i32:
2185 ; SSE41-NEXT: movq %rdi, %xmm1
2186 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2189 ; AVX1-LABEL: insert_reg_lo_v4i32:
2191 ; AVX1-NEXT: vmovq %rdi, %xmm1
2192 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2195 ; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2196 ; AVX2OR512VL: # %bb.0:
2197 ; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
2198 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2199 ; AVX2OR512VL-NEXT: retq
2200 %a.cast = bitcast i64 %a to <2 x i32>
2201 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2202 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2203 ret <4 x i32> %shuffle
; Loads a <2 x i32> from %ptr into lanes 0-1 and keeps lanes 2-3 of %b
; (mask 0,1,6,7).
; Expected lowering: a movlps from memory on SSE2/SSE3/SSSE3; a movsd load
; followed by blendps/vblendps on SSE4.1 and AVX.
2206 define <4 x i32> @insert_mem_lo_v4i32(ptr %ptr, <4 x i32> %b) {
2207 ; SSE2-LABEL: insert_mem_lo_v4i32:
2209 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2212 ; SSE3-LABEL: insert_mem_lo_v4i32:
2214 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2217 ; SSSE3-LABEL: insert_mem_lo_v4i32:
2219 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2222 ; SSE41-LABEL: insert_mem_lo_v4i32:
2224 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2225 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2228 ; AVX-LABEL: insert_mem_lo_v4i32:
2230 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2231 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2233 %a = load <2 x i32>, ptr %ptr
2234 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2235 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2236 ret <4 x i32> %shuffle
; Bitcasts the i64 argument to <2 x i32> and places it in lanes 2-3, with
; lanes 0-1 taken from the low half of %b (mask 4,5,0,1).
; Expected lowering: movq from %rdi followed by punpcklqdq/vpunpcklqdq.
2239 define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2240 ; SSE-LABEL: insert_reg_hi_v4i32:
2242 ; SSE-NEXT: movq %rdi, %xmm1
2243 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2246 ; AVX-LABEL: insert_reg_hi_v4i32:
2248 ; AVX-NEXT: vmovq %rdi, %xmm1
2249 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2251 %a.cast = bitcast i64 %a to <2 x i32>
2252 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2253 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2254 ret <4 x i32> %shuffle
; Loads a <2 x i32> from %ptr into lanes 2-3, with lanes 0-1 taken from the
; low half of %b (mask 4,5,0,1).
; Expected lowering: a movsd load followed by movlhps/vmovlhps.
2257 define <4 x i32> @insert_mem_hi_v4i32(ptr %ptr, <4 x i32> %b) {
2258 ; SSE-LABEL: insert_mem_hi_v4i32:
2260 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2261 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2264 ; AVX-LABEL: insert_mem_hi_v4i32:
2266 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2267 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2269 %a = load <2 x i32>, ptr %ptr
2270 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2271 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2272 ret <4 x i32> %shuffle
; Bitcasts the double argument to <2 x float> and places it in lanes 0-1,
; keeping lanes 2-3 of %b (mask 0,1,6,7).
; Expected lowering: one shufps on SSE2/SSE3/SSSE3, one blendps/vblendps from
; SSE4.1 onwards.
2275 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2276 ; SSE2-LABEL: insert_reg_lo_v4f32:
2278 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2281 ; SSE3-LABEL: insert_reg_lo_v4f32:
2283 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2286 ; SSSE3-LABEL: insert_reg_lo_v4f32:
2288 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2291 ; SSE41-LABEL: insert_reg_lo_v4f32:
2293 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2296 ; AVX-LABEL: insert_reg_lo_v4f32:
2298 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2300 %a.cast = bitcast double %a to <2 x float>
2301 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2302 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2303 ret <4 x float> %shuffle
; Loads a <2 x float> from %ptr into lanes 0-1 and keeps lanes 2-3 of %b
; (mask 0,1,6,7).
; Expected lowering: a single movlps/vmovlps with a memory operand.
2306 define <4 x float> @insert_mem_lo_v4f32(ptr %ptr, <4 x float> %b) {
2307 ; SSE-LABEL: insert_mem_lo_v4f32:
2309 ; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2312 ; AVX-LABEL: insert_mem_lo_v4f32:
2314 ; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2316 %a = load <2 x float>, ptr %ptr
2317 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2318 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2319 ret <4 x float> %shuffle
; Bitcasts the double argument to <2 x float> and places it in lanes 2-3, with
; lanes 0-1 taken from the low half of %b (mask 4,5,0,1).
; Expected lowering: movlhps into xmm1 plus a movaps copy on SSE; a single
; three-operand vmovlhps on AVX.
2322 define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2323 ; SSE-LABEL: insert_reg_hi_v4f32:
2325 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2326 ; SSE-NEXT: movaps %xmm1, %xmm0
2329 ; AVX-LABEL: insert_reg_hi_v4f32:
2331 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2333 %a.cast = bitcast double %a to <2 x float>
2334 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2335 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2336 ret <4 x float> %shuffle
; Loads a <2 x float> from %ptr into lanes 2-3, with lanes 0-1 taken from the
; low half of %b (mask 4,5,0,1).
; Expected lowering: a single movhps/vmovhps with a memory operand.
2339 define <4 x float> @insert_mem_hi_v4f32(ptr %ptr, <4 x float> %b) {
2340 ; SSE-LABEL: insert_mem_hi_v4f32:
2342 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2345 ; AVX-LABEL: insert_mem_hi_v4f32:
2347 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2349 %a = load <2 x float>, ptr %ptr
2350 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2351 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2352 ret <4 x float> %shuffle
; Loads a <4 x float> from %ptr and reverses its lanes (mask 3,2,1,0).
; Expected lowering: a movaps load followed by shufps on SSE; a vpermilps with
; the load folded into its memory operand on AVX.
2356 define <4 x float> @shuffle_mem_v4f32_3210(ptr %ptr) {
2357 ; SSE-LABEL: shuffle_mem_v4f32_3210:
2359 ; SSE-NEXT: movaps (%rdi), %xmm0
2360 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2363 ; AVX-LABEL: shuffle_mem_v4f32_3210:
2365 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2367 %a = load <4 x float>, ptr %ptr
2368 %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2369 ret <4 x float> %shuffle
; Loads an i32 from %ptr, inserts it into lane 0 of a zero vector and then
; splats lane 0 across all four lanes (all-zero shuffle mask).
; Expected lowering: a movd load followed by pshufd [0,0,0,0] on SSE; a
; vbroadcastss from (%rdi) on AVX.
2372 define <4 x i32> @insert_dup_mem_v4i32(ptr %ptr) {
2373 ; SSE-LABEL: insert_dup_mem_v4i32:
2375 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2376 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2379 ; AVX-LABEL: insert_dup_mem_v4i32:
2381 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
2383 %tmp = load i32, ptr %ptr, align 4
2384 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2385 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Loads a <2 x float> from %p0 and uses it twice:
;  - %3 interleaves the two loaded elements with 0.0 (lanes m0, 0.0, m1, 0.0)
;    and is stored to %p1;
;  - %4 splats loaded element 0 across all four lanes.
; The expected code keeps both results live: an unpack with a zeroed register
; feeds the store to (%rsi) while xmm0 receives the splat (shufps/vshufps, or
; vbroadcastss on the AVX2 and AVX512VL runs).
2390 define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) {
2391 ; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2393 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2394 ; SSE-NEXT: xorps %xmm1, %xmm1
2395 ; SSE-NEXT: movaps %xmm0, %xmm2
2396 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2397 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2398 ; SSE-NEXT: movaps %xmm2, (%rsi)
2401 ; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2403 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2404 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
2405 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2406 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2407 ; AVX1-NEXT: vmovaps %xmm1, (%rsi)
2410 ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2411 ; AVX2OR512VL: # %bb.0:
2412 ; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
2413 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
2414 ; AVX2OR512VL-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2415 ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0
2416 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi)
2417 ; AVX2OR512VL-NEXT: retq
2418 %1 = load <2 x float>, ptr %p0
2419 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2420 %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2421 %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2422 store <4 x float> %3, ptr %p1
2427 ; Shuffle to logical bit shifts
; Mask 4,0,4,undef against a zero vector: lanes are zero, a[0], zero, undef.
; Expected lowering: a 64-bit logical left shift by 32 (psllq/vpsllq $32),
; which moves each even i32 lane into the odd lane above it.
2430 define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2431 ; SSE-LABEL: shuffle_v4i32_z0zX:
2433 ; SSE-NEXT: psllq $32, %xmm0
2436 ; AVX-LABEL: shuffle_v4i32_z0zX:
2438 ; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
2440 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2441 ret <4 x i32> %shuffle
; Mask 1,4,3,4 against a zero vector: lanes are a[1], zero, a[3], zero.
; Expected lowering: a 64-bit logical right shift by 32 (psrlq/vpsrlq $32),
; which moves each odd i32 lane into the even lane below it.
2444 define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2445 ; SSE-LABEL: shuffle_v4i32_1z3z:
2447 ; SSE-NEXT: psrlq $32, %xmm0
2450 ; AVX-LABEL: shuffle_v4i32_1z3z:
2452 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
2454 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2455 ret <4 x i32> %shuffle
; Loads %b from %pb with align 1 and combines the low halves of both vectors:
; lanes are a[0], a[1], b[0], b[1] (mask 0,1,4,5).
; The SSE expectation is a movhps with a memory operand, while the AVX
; expectation is a vunpcklpd with a memory operand, so the two differ in form.
; NOTE(review): these assertions are autogenerated; if the two forms look
; inconsistent, regenerate with utils/update_llc_test_checks.py rather than
; hand-editing.
2458 define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
2459 ; SSE-LABEL: shuffle_mem_v4f32_0145:
2461 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2464 ; AVX-LABEL: shuffle_mem_v4f32_0145:
2466 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2468 %b = load <4 x float>, ptr %pb, align 1
2469 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2470 ret <4 x float> %shuffle
; Loads %b from %pb with align 1 and replaces the low half of %a with the low
; half of %b: lanes are b[0], b[1], a[2], a[3] (mask 4,5,2,3).
; Expected lowering: a movlps from memory on SSE2/SSE3/SSSE3; a movups load
; followed by blendps on SSE4.1; a vblendps with a memory operand on AVX.
2473 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
2474 ; SSE2-LABEL: shuffle_mem_v4f32_4523:
2476 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2479 ; SSE3-LABEL: shuffle_mem_v4f32_4523:
2481 ; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2484 ; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2486 ; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2489 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
2491 ; SSE41-NEXT: movups (%rdi), %xmm1
2492 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2495 ; AVX-LABEL: shuffle_mem_v4f32_4523:
2497 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2499 %b = load <4 x float>, ptr %pb, align 1
2500 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2501 ret <4 x float> %shuffle
; Two-input shuffle where the loaded vector is the FIRST shuffle operand and
; the register argument %a0 is the second, so mask 0,6,2,4 selects
; mem[0], a0[2], mem[2], a0[0].
; Expected lowering: two shufps/vshufps (the first with the load folded) on
; SSE and on the AVX1/AVX2 runs; on the AVX512VL runs a vpermi2ps driven by
; the constant index vector [0,6,2,4].
2504 define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
2505 ; SSE-LABEL: shuffle_mem_v4f32_0624:
2507 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2508 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2511 ; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2513 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2514 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2515 ; AVX1OR2-NEXT: retq
2517 ; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
2518 ; AVX512VL: # %bb.0:
2519 ; AVX512VL-NEXT: vmovaps (%rdi), %xmm2
2520 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4]
2521 ; AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1
2522 ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
2523 ; AVX512VL-NEXT: retq
2524 %1 = load <4 x float>, ptr %a1
2525 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
; Two-input shuffle where the loaded vector is the FIRST shuffle operand and
; the register argument %a0 is the second, so mask 4,7,6,0 selects
; a0[0], a0[3], a0[2], mem[0].
; Expected lowering: two shufps/vshufps (the first with the load folded) on
; SSE and on the AVX1/AVX2 runs; on the AVX512VL runs a vpermt2ps with a
; memory operand, driven by the constant index vector [0,3,2,4].
2529 define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) {
2530 ; SSE-LABEL: shuffle_mem_v4f32_4760:
2532 ; SSE-NEXT: movaps %xmm0, %xmm1
2533 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
2534 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2537 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
2539 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
2540 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2541 ; AVX1OR2-NEXT: retq
2543 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
2544 ; AVX512VL: # %bb.0:
2545 ; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4]
2546 ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0
2547 ; AVX512VL-NEXT: retq
2548 %1 = load <4 x float>, ptr %a1
2549 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>