1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
8 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
10 ; Verify that the DAG combiner correctly folds bitwise operations across
11 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
12 ; basic and always-safe patterns. Also test that the DAG combiner will combine
13 ; target-specific shuffle instructions where reasonable.
15 target triple = "x86_64-unknown-unknown"
17 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
18 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
19 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
21 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
22 ; CHECK-LABEL: combine_pshufd1:
23 ; CHECK: # %bb.0: # %entry
26 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
27 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
31 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
32 ; CHECK-LABEL: combine_pshufd2:
33 ; CHECK: # %bb.0: # %entry
36 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
37 %b.cast = bitcast <4 x i32> %b to <8 x i16>
38 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
39 %c.cast = bitcast <8 x i16> %c to <4 x i32>
40 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
44 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
45 ; CHECK-LABEL: combine_pshufd3:
46 ; CHECK: # %bb.0: # %entry
49 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
50 %b.cast = bitcast <4 x i32> %b to <8 x i16>
51 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
52 %c.cast = bitcast <8 x i16> %c to <4 x i32>
53 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
57 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
58 ; SSE-LABEL: combine_pshufd4:
59 ; SSE: # %bb.0: # %entry
60 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
63 ; AVX-LABEL: combine_pshufd4:
64 ; AVX: # %bb.0: # %entry
65 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
68 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
69 %b.cast = bitcast <4 x i32> %b to <8 x i16>
70 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
71 %c.cast = bitcast <8 x i16> %c to <4 x i32>
72 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
76 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
77 ; SSE-LABEL: combine_pshufd5:
78 ; SSE: # %bb.0: # %entry
79 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
82 ; AVX-LABEL: combine_pshufd5:
83 ; AVX: # %bb.0: # %entry
84 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
87 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
88 %b.cast = bitcast <4 x i32> %b to <8 x i16>
89 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
90 %c.cast = bitcast <8 x i16> %c to <4 x i32>
91 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
95 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
96 ; SSE-LABEL: combine_pshufd6:
97 ; SSE: # %bb.0: # %entry
98 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
101 ; AVX1-LABEL: combine_pshufd6:
102 ; AVX1: # %bb.0: # %entry
103 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
106 ; AVX2-LABEL: combine_pshufd6:
107 ; AVX2: # %bb.0: # %entry
108 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
111 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
112 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
116 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
117 ; CHECK-LABEL: combine_pshuflw1:
118 ; CHECK: # %bb.0: # %entry
121 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
122 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
126 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
127 ; CHECK-LABEL: combine_pshuflw2:
128 ; CHECK: # %bb.0: # %entry
131 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
132 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
133 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
137 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
138 ; SSE-LABEL: combine_pshuflw3:
139 ; SSE: # %bb.0: # %entry
140 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
143 ; AVX-LABEL: combine_pshuflw3:
144 ; AVX: # %bb.0: # %entry
145 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
148 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
149 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
150 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
154 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
155 ; SSE-LABEL: combine_pshufhw1:
156 ; SSE: # %bb.0: # %entry
157 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
160 ; AVX-LABEL: combine_pshufhw1:
161 ; AVX: # %bb.0: # %entry
162 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
165 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
166 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
167 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
171 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
172 ; SSE-LABEL: combine_bitwise_ops_test1:
174 ; SSE-NEXT: pand %xmm1, %xmm0
175 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
178 ; AVX-LABEL: combine_bitwise_ops_test1:
180 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
181 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
183 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
184 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
185 %and = and <4 x i32> %shuf1, %shuf2
189 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
190 ; SSE-LABEL: combine_bitwise_ops_test2:
192 ; SSE-NEXT: por %xmm1, %xmm0
193 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
196 ; AVX-LABEL: combine_bitwise_ops_test2:
198 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
199 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
201 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
202 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
203 %or = or <4 x i32> %shuf1, %shuf2
207 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
208 ; SSE-LABEL: combine_bitwise_ops_test3:
210 ; SSE-NEXT: pxor %xmm1, %xmm0
211 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
214 ; AVX-LABEL: combine_bitwise_ops_test3:
216 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
217 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
219 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
220 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
221 %xor = xor <4 x i32> %shuf1, %shuf2
225 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
226 ; SSE-LABEL: combine_bitwise_ops_test4:
228 ; SSE-NEXT: pand %xmm1, %xmm0
229 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
232 ; AVX-LABEL: combine_bitwise_ops_test4:
234 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
235 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
237 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
238 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
239 %and = and <4 x i32> %shuf1, %shuf2
243 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
244 ; SSE-LABEL: combine_bitwise_ops_test5:
246 ; SSE-NEXT: por %xmm1, %xmm0
247 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
250 ; AVX-LABEL: combine_bitwise_ops_test5:
252 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
253 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
255 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
256 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
257 %or = or <4 x i32> %shuf1, %shuf2
261 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
262 ; SSE-LABEL: combine_bitwise_ops_test6:
264 ; SSE-NEXT: pxor %xmm1, %xmm0
265 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
268 ; AVX-LABEL: combine_bitwise_ops_test6:
270 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
271 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
273 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
274 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
275 %xor = xor <4 x i32> %shuf1, %shuf2
280 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
281 ; are not performing a swizzle operations.
283 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
284 ; SSE2-LABEL: combine_bitwise_ops_test1b:
286 ; SSE2-NEXT: pand %xmm1, %xmm0
287 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
288 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
289 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
292 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
294 ; SSSE3-NEXT: pand %xmm1, %xmm0
295 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
296 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
297 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
300 ; SSE41-LABEL: combine_bitwise_ops_test1b:
302 ; SSE41-NEXT: andps %xmm1, %xmm0
303 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
306 ; AVX-LABEL: combine_bitwise_ops_test1b:
308 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
309 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
311 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
312 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
313 %and = and <4 x i32> %shuf1, %shuf2
317 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
318 ; SSE2-LABEL: combine_bitwise_ops_test2b:
320 ; SSE2-NEXT: por %xmm1, %xmm0
321 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
322 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
323 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
326 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
328 ; SSSE3-NEXT: por %xmm1, %xmm0
329 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
330 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
331 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
334 ; SSE41-LABEL: combine_bitwise_ops_test2b:
336 ; SSE41-NEXT: orps %xmm1, %xmm0
337 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
340 ; AVX-LABEL: combine_bitwise_ops_test2b:
342 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
343 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
345 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
346 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
347 %or = or <4 x i32> %shuf1, %shuf2
351 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
352 ; SSE2-LABEL: combine_bitwise_ops_test3b:
354 ; SSE2-NEXT: xorps %xmm1, %xmm0
355 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
358 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
360 ; SSSE3-NEXT: xorps %xmm1, %xmm0
361 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
364 ; SSE41-LABEL: combine_bitwise_ops_test3b:
366 ; SSE41-NEXT: xorps %xmm1, %xmm0
367 ; SSE41-NEXT: xorps %xmm1, %xmm1
368 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
371 ; AVX-LABEL: combine_bitwise_ops_test3b:
373 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
374 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
375 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
377 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
378 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
379 %xor = xor <4 x i32> %shuf1, %shuf2
383 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
384 ; SSE2-LABEL: combine_bitwise_ops_test4b:
386 ; SSE2-NEXT: pand %xmm1, %xmm0
387 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
388 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
389 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
392 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
394 ; SSSE3-NEXT: pand %xmm1, %xmm0
395 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
396 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
397 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
400 ; SSE41-LABEL: combine_bitwise_ops_test4b:
402 ; SSE41-NEXT: andps %xmm1, %xmm0
403 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
406 ; AVX-LABEL: combine_bitwise_ops_test4b:
408 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
409 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
411 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
412 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
413 %and = and <4 x i32> %shuf1, %shuf2
417 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
418 ; SSE2-LABEL: combine_bitwise_ops_test5b:
420 ; SSE2-NEXT: por %xmm1, %xmm0
421 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
422 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
423 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
426 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
428 ; SSSE3-NEXT: por %xmm1, %xmm0
429 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
430 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
431 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
434 ; SSE41-LABEL: combine_bitwise_ops_test5b:
436 ; SSE41-NEXT: orps %xmm1, %xmm0
437 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
440 ; AVX-LABEL: combine_bitwise_ops_test5b:
442 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
443 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
445 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
446 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
447 %or = or <4 x i32> %shuf1, %shuf2
451 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
452 ; SSE2-LABEL: combine_bitwise_ops_test6b:
454 ; SSE2-NEXT: xorps %xmm1, %xmm0
455 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
458 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
460 ; SSSE3-NEXT: xorps %xmm1, %xmm0
461 ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
464 ; SSE41-LABEL: combine_bitwise_ops_test6b:
466 ; SSE41-NEXT: xorps %xmm1, %xmm0
467 ; SSE41-NEXT: xorps %xmm1, %xmm1
468 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
471 ; AVX-LABEL: combine_bitwise_ops_test6b:
473 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
474 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
475 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
477 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
478 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
479 %xor = xor <4 x i32> %shuf1, %shuf2
483 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
484 ; SSE-LABEL: combine_bitwise_ops_test1c:
486 ; SSE-NEXT: andps %xmm1, %xmm0
487 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
490 ; AVX-LABEL: combine_bitwise_ops_test1c:
492 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
493 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
495 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
496 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
497 %and = and <4 x i32> %shuf1, %shuf2
501 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
502 ; SSE-LABEL: combine_bitwise_ops_test2c:
504 ; SSE-NEXT: orps %xmm1, %xmm0
505 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
508 ; AVX-LABEL: combine_bitwise_ops_test2c:
510 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
511 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
513 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
514 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
515 %or = or <4 x i32> %shuf1, %shuf2
519 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
520 ; SSE2-LABEL: combine_bitwise_ops_test3c:
522 ; SSE2-NEXT: xorps %xmm1, %xmm0
523 ; SSE2-NEXT: xorps %xmm1, %xmm1
524 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
527 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
529 ; SSSE3-NEXT: xorps %xmm1, %xmm0
530 ; SSSE3-NEXT: xorps %xmm1, %xmm1
531 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
534 ; SSE41-LABEL: combine_bitwise_ops_test3c:
536 ; SSE41-NEXT: xorps %xmm1, %xmm0
537 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
540 ; AVX-LABEL: combine_bitwise_ops_test3c:
542 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
543 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
545 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
546 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
547 %xor = xor <4 x i32> %shuf1, %shuf2
551 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
552 ; SSE-LABEL: combine_bitwise_ops_test4c:
554 ; SSE-NEXT: andps %xmm1, %xmm0
555 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
556 ; SSE-NEXT: movaps %xmm2, %xmm0
559 ; AVX-LABEL: combine_bitwise_ops_test4c:
561 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
562 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
564 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
565 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
566 %and = and <4 x i32> %shuf1, %shuf2
570 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
571 ; SSE-LABEL: combine_bitwise_ops_test5c:
573 ; SSE-NEXT: orps %xmm1, %xmm0
574 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
575 ; SSE-NEXT: movaps %xmm2, %xmm0
578 ; AVX-LABEL: combine_bitwise_ops_test5c:
580 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
581 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
583 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
584 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
585 %or = or <4 x i32> %shuf1, %shuf2
589 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
590 ; SSE2-LABEL: combine_bitwise_ops_test6c:
592 ; SSE2-NEXT: xorps %xmm1, %xmm0
593 ; SSE2-NEXT: xorps %xmm1, %xmm1
594 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
595 ; SSE2-NEXT: movaps %xmm1, %xmm0
598 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
600 ; SSSE3-NEXT: xorps %xmm1, %xmm0
601 ; SSSE3-NEXT: xorps %xmm1, %xmm1
602 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
603 ; SSSE3-NEXT: movaps %xmm1, %xmm0
606 ; SSE41-LABEL: combine_bitwise_ops_test6c:
608 ; SSE41-NEXT: xorps %xmm1, %xmm0
609 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
612 ; AVX-LABEL: combine_bitwise_ops_test6c:
614 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
615 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
617 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
618 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
619 %xor = xor <4 x i32> %shuf1, %shuf2
623 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
624 ; SSE-LABEL: combine_nested_undef_test1:
626 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
629 ; AVX-LABEL: combine_nested_undef_test1:
631 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
633 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
634 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
638 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
639 ; SSE-LABEL: combine_nested_undef_test2:
641 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
644 ; AVX-LABEL: combine_nested_undef_test2:
646 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
648 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
649 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
653 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
654 ; SSE-LABEL: combine_nested_undef_test3:
656 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
659 ; AVX-LABEL: combine_nested_undef_test3:
661 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
663 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
664 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
668 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
669 ; SSE-LABEL: combine_nested_undef_test4:
671 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
674 ; AVX1-LABEL: combine_nested_undef_test4:
676 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
679 ; AVX2-LABEL: combine_nested_undef_test4:
681 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
683 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
684 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
688 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
689 ; SSE-LABEL: combine_nested_undef_test5:
691 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
694 ; AVX-LABEL: combine_nested_undef_test5:
696 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
698 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
699 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
703 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
704 ; SSE-LABEL: combine_nested_undef_test6:
706 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
709 ; AVX-LABEL: combine_nested_undef_test6:
711 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
713 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
714 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
718 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
719 ; SSE-LABEL: combine_nested_undef_test7:
721 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
724 ; AVX-LABEL: combine_nested_undef_test7:
726 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
728 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
729 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
733 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
734 ; SSE-LABEL: combine_nested_undef_test8:
736 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
739 ; AVX-LABEL: combine_nested_undef_test8:
741 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
743 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
744 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
748 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
749 ; SSE-LABEL: combine_nested_undef_test9:
751 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
754 ; AVX-LABEL: combine_nested_undef_test9:
756 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
758 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
759 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
763 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
764 ; SSE-LABEL: combine_nested_undef_test10:
766 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
769 ; AVX-LABEL: combine_nested_undef_test10:
771 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
773 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
774 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
778 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
779 ; SSE-LABEL: combine_nested_undef_test11:
781 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
784 ; AVX-LABEL: combine_nested_undef_test11:
786 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
788 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
789 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
793 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
794 ; SSE-LABEL: combine_nested_undef_test12:
796 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
799 ; AVX1-LABEL: combine_nested_undef_test12:
801 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
804 ; AVX2-LABEL: combine_nested_undef_test12:
806 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
808 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
809 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
813 ; The following pair of shuffles is folded into vector %A.
814 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
815 ; CHECK-LABEL: combine_nested_undef_test13:
818 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
819 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
823 ; The following pair of shuffles is folded into vector %B.
824 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
825 ; SSE-LABEL: combine_nested_undef_test14:
827 ; SSE-NEXT: movaps %xmm1, %xmm0
830 ; AVX-LABEL: combine_nested_undef_test14:
832 ; AVX-NEXT: vmovaps %xmm1, %xmm0
834 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
835 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
840 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
842 ; FIXME: Many of these already don't make sense, and the rest should stop
843 ; making sense with the new vector shuffle lowering. Revisit at least testing for
846 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
847 ; SSE2-LABEL: combine_nested_undef_test15:
849 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
850 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
851 ; SSE2-NEXT: movaps %xmm1, %xmm0
854 ; SSSE3-LABEL: combine_nested_undef_test15:
856 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
857 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
858 ; SSSE3-NEXT: movaps %xmm1, %xmm0
861 ; SSE41-LABEL: combine_nested_undef_test15:
863 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
864 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
865 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
868 ; AVX1-LABEL: combine_nested_undef_test15:
870 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
871 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
872 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
875 ; AVX2-LABEL: combine_nested_undef_test15:
877 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
878 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
879 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
881 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
882 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
886 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
887 ; SSE2-LABEL: combine_nested_undef_test16:
889 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
890 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
891 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
894 ; SSSE3-LABEL: combine_nested_undef_test16:
896 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
897 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
898 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
901 ; SSE41-LABEL: combine_nested_undef_test16:
903 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
904 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
907 ; AVX-LABEL: combine_nested_undef_test16:
909 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
910 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
912 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
913 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
917 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
918 ; SSE2-LABEL: combine_nested_undef_test17:
920 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
921 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
924 ; SSSE3-LABEL: combine_nested_undef_test17:
926 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
927 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
930 ; SSE41-LABEL: combine_nested_undef_test17:
932 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
933 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
936 ; AVX-LABEL: combine_nested_undef_test17:
938 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
939 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
941 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
942 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
946 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
947 ; SSE-LABEL: combine_nested_undef_test18:
949 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
952 ; AVX-LABEL: combine_nested_undef_test18:
954 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
956 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
957 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
961 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
962 ; SSE2-LABEL: combine_nested_undef_test19:
964 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
965 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
968 ; SSSE3-LABEL: combine_nested_undef_test19:
970 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
971 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
974 ; SSE41-LABEL: combine_nested_undef_test19:
976 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
977 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
980 ; AVX-LABEL: combine_nested_undef_test19:
982 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
983 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
985 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
986 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
990 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
991 ; SSE2-LABEL: combine_nested_undef_test20:
993 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
994 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
995 ; SSE2-NEXT: movaps %xmm1, %xmm0
998 ; SSSE3-LABEL: combine_nested_undef_test20:
1000 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1001 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1002 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1005 ; SSE41-LABEL: combine_nested_undef_test20:
1007 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1008 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1011 ; AVX-LABEL: combine_nested_undef_test20:
1013 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1014 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
1016 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1017 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1021 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1022 ; SSE2-LABEL: combine_nested_undef_test21:
1024 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1025 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1028 ; SSSE3-LABEL: combine_nested_undef_test21:
1030 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1031 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1034 ; SSE41-LABEL: combine_nested_undef_test21:
1036 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1037 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1040 ; AVX1-LABEL: combine_nested_undef_test21:
1042 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1043 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1046 ; AVX2-LABEL: combine_nested_undef_test21:
1048 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1049 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1051 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1052 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1057 ; Test that we correctly combine shuffles according to rule
1058 ; shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1060 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1061 ; SSE-LABEL: combine_nested_undef_test22:
1063 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1066 ; AVX-LABEL: combine_nested_undef_test22:
1068 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
1070 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1071 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1075 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1076 ; SSE-LABEL: combine_nested_undef_test23:
1078 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1081 ; AVX-LABEL: combine_nested_undef_test23:
1083 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
1085 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1086 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1090 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1091 ; SSE-LABEL: combine_nested_undef_test24:
1093 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1096 ; AVX-LABEL: combine_nested_undef_test24:
1098 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
1100 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1101 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1105 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1106 ; SSE-LABEL: combine_nested_undef_test25:
1108 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1111 ; AVX1-LABEL: combine_nested_undef_test25:
1113 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1116 ; AVX2-LABEL: combine_nested_undef_test25:
1118 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1120 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1121 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1125 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1126 ; SSE-LABEL: combine_nested_undef_test26:
1128 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1131 ; AVX-LABEL: combine_nested_undef_test26:
1133 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
1135 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1136 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1140 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1141 ; SSE-LABEL: combine_nested_undef_test27:
1143 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1146 ; AVX1-LABEL: combine_nested_undef_test27:
1148 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1151 ; AVX2-LABEL: combine_nested_undef_test27:
1153 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1155 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1156 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1160 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1161 ; SSE-LABEL: combine_nested_undef_test28:
1163 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1166 ; AVX-LABEL: combine_nested_undef_test28:
1168 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
1170 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1171 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1175 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1176 ; SSE-LABEL: combine_test1:
1178 ; SSE-NEXT: movaps %xmm1, %xmm0
1181 ; AVX-LABEL: combine_test1:
1183 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1185 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1186 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1190 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1191 ; SSE2-LABEL: combine_test2:
1193 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1194 ; SSE2-NEXT: movaps %xmm1, %xmm0
1197 ; SSSE3-LABEL: combine_test2:
1199 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1200 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1203 ; SSE41-LABEL: combine_test2:
1205 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1208 ; AVX-LABEL: combine_test2:
1210 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1212 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1213 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1217 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1218 ; SSE-LABEL: combine_test3:
1220 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1223 ; AVX-LABEL: combine_test3:
1225 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1227 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1228 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1232 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1233 ; SSE-LABEL: combine_test4:
1235 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1238 ; AVX-LABEL: combine_test4:
1240 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1242 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1243 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1247 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1248 ; SSE2-LABEL: combine_test5:
1250 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1251 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1254 ; SSSE3-LABEL: combine_test5:
1256 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1257 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1260 ; SSE41-LABEL: combine_test5:
1262 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1265 ; AVX-LABEL: combine_test5:
1267 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1269 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1270 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1274 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1275 ; SSE-LABEL: combine_test6:
1277 ; SSE-NEXT: movaps %xmm1, %xmm0
1280 ; AVX-LABEL: combine_test6:
1282 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1284 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1285 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1289 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1290 ; SSE2-LABEL: combine_test7:
1292 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1293 ; SSE2-NEXT: movaps %xmm1, %xmm0
1296 ; SSSE3-LABEL: combine_test7:
1298 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1299 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1302 ; SSE41-LABEL: combine_test7:
1304 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1307 ; AVX-LABEL: combine_test7:
1309 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1311 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1312 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1316 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1317 ; SSE-LABEL: combine_test8:
1319 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1322 ; AVX-LABEL: combine_test8:
1324 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1326 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1327 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1331 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1332 ; SSE-LABEL: combine_test9:
1334 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1335 ; SSE-NEXT: movaps %xmm1, %xmm0
1338 ; AVX-LABEL: combine_test9:
1340 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1342 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1343 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1347 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1348 ; SSE2-LABEL: combine_test10:
1350 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1351 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1354 ; SSSE3-LABEL: combine_test10:
1356 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1357 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1360 ; SSE41-LABEL: combine_test10:
1362 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1365 ; AVX-LABEL: combine_test10:
1367 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1369 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1370 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1374 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1375 ; CHECK-LABEL: combine_test11:
1378 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1379 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1383 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1384 ; SSE2-LABEL: combine_test12:
1386 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1387 ; SSE2-NEXT: movaps %xmm1, %xmm0
1390 ; SSSE3-LABEL: combine_test12:
1392 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1393 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1396 ; SSE41-LABEL: combine_test12:
1398 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1401 ; AVX-LABEL: combine_test12:
1403 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1405 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1406 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1410 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1411 ; SSE-LABEL: combine_test13:
1413 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1416 ; AVX-LABEL: combine_test13:
1418 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1420 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1421 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1425 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1426 ; SSE-LABEL: combine_test14:
1428 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1431 ; AVX-LABEL: combine_test14:
1433 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1435 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1436 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1440 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1441 ; SSE2-LABEL: combine_test15:
1443 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1444 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1447 ; SSSE3-LABEL: combine_test15:
1449 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1450 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1453 ; SSE41-LABEL: combine_test15:
1455 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1458 ; AVX-LABEL: combine_test15:
1460 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1462 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1463 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1467 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1468 ; CHECK-LABEL: combine_test16:
1471 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1472 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1476 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1477 ; SSE2-LABEL: combine_test17:
1479 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1480 ; SSE2-NEXT: movaps %xmm1, %xmm0
1483 ; SSSE3-LABEL: combine_test17:
1485 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1486 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1489 ; SSE41-LABEL: combine_test17:
1491 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1494 ; AVX-LABEL: combine_test17:
1496 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1498 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1499 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1503 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1504 ; SSE-LABEL: combine_test18:
1506 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1509 ; AVX-LABEL: combine_test18:
1511 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1513 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1514 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1518 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1519 ; SSE-LABEL: combine_test19:
1521 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1524 ; AVX-LABEL: combine_test19:
1526 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1528 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1529 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1533 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1534 ; SSE2-LABEL: combine_test20:
1536 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1537 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1540 ; SSSE3-LABEL: combine_test20:
1542 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1543 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1546 ; SSE41-LABEL: combine_test20:
1548 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1551 ; AVX-LABEL: combine_test20:
1553 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1555 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1556 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1560 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
1561 ; SSE-LABEL: combine_test21:
1563 ; SSE-NEXT: movaps %xmm0, %xmm2
1564 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1565 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1566 ; SSE-NEXT: movaps %xmm2, (%rdi)
1569 ; AVX-LABEL: combine_test21:
1571 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
1572 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1573 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1574 ; AVX-NEXT: vmovaps %xmm2, (%rdi)
1575 ; AVX-NEXT: vzeroupper
1577 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1578 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1579 store <4 x i32> %1, <4 x i32>* %ptr, align 16
1583 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
1584 ; SSE-LABEL: combine_test22:
1586 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1587 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1590 ; AVX-LABEL: combine_test22:
1592 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1593 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1595 ; Current AVX2 lowering of this is still awful, not adding a test case.
1596 %1 = load <2 x float>, <2 x float>* %a, align 8
1597 %2 = load <2 x float>, <2 x float>* %b, align 8
1598 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1603 define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
1604 ; SSE-LABEL: combine_test23:
1606 ; SSE-NEXT: movups %xmm0, (%rdi)
1609 ; AVX-LABEL: combine_test23:
1611 ; AVX-NEXT: vmovups %xmm0, (%rdi)
1612 ; AVX-NEXT: vzeroupper
1614 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
1615 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
1616 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
1617 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
1618 store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
1622 ; Check some negative cases.
1623 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1625 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1626 ; SSE-LABEL: combine_test1b:
1628 ; SSE-NEXT: movaps %xmm1, %xmm0
1629 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1632 ; AVX-LABEL: combine_test1b:
1634 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1636 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1637 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1641 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1642 ; SSE2-LABEL: combine_test2b:
1644 ; SSE2-NEXT: movaps %xmm1, %xmm0
1645 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1648 ; SSSE3-LABEL: combine_test2b:
1650 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1653 ; SSE41-LABEL: combine_test2b:
1655 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1658 ; AVX-LABEL: combine_test2b:
1660 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1662 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1663 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1667 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1668 ; SSE2-LABEL: combine_test3b:
1670 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1671 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1674 ; SSSE3-LABEL: combine_test3b:
1676 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1677 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1680 ; SSE41-LABEL: combine_test3b:
1682 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1683 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1686 ; AVX-LABEL: combine_test3b:
1688 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1689 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1691 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1692 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1696 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1697 ; SSE-LABEL: combine_test4b:
1699 ; SSE-NEXT: movaps %xmm1, %xmm0
1700 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
1703 ; AVX-LABEL: combine_test4b:
1705 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1707 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1708 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1713 ; Verify that we correctly fold shuffles even when we use illegal vector types.
1715 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
1716 ; SSE2-LABEL: combine_test1c:
1718 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1719 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1720 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1721 ; SSE2-NEXT: andps %xmm0, %xmm2
1722 ; SSE2-NEXT: andnps %xmm1, %xmm0
1723 ; SSE2-NEXT: orps %xmm2, %xmm0
1726 ; SSSE3-LABEL: combine_test1c:
1728 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1729 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1730 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1731 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1734 ; SSE41-LABEL: combine_test1c:
1736 ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1737 ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1738 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1739 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
1740 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1743 ; AVX-LABEL: combine_test1c:
1745 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1746 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1747 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1748 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1750 %A = load <4 x i8>, <4 x i8>* %a
1751 %B = load <4 x i8>, <4 x i8>* %b
1752 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1753 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1757 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
1758 ; SSE-LABEL: combine_test2c:
1760 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1761 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1762 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1765 ; AVX-LABEL: combine_test2c:
1767 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1768 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1769 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1771 %A = load <4 x i8>, <4 x i8>* %a
1772 %B = load <4 x i8>, <4 x i8>* %b
1773 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1774 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1778 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
1779 ; SSE-LABEL: combine_test3c:
1781 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1782 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1783 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1784 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1787 ; AVX-LABEL: combine_test3c:
1789 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1790 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1791 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1792 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1794 %A = load <4 x i8>, <4 x i8>* %a
1795 %B = load <4 x i8>, <4 x i8>* %b
1796 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1797 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1801 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
1802 ; SSE2-LABEL: combine_test4c:
1804 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1805 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1806 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1807 ; SSE2-NEXT: andps %xmm0, %xmm2
1808 ; SSE2-NEXT: andnps %xmm1, %xmm0
1809 ; SSE2-NEXT: orps %xmm2, %xmm0
1812 ; SSSE3-LABEL: combine_test4c:
1814 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1815 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1816 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1817 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1820 ; SSE41-LABEL: combine_test4c:
1822 ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1823 ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1824 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1825 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
1826 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1829 ; AVX-LABEL: combine_test4c:
1831 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1832 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1833 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1834 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1836 %A = load <4 x i8>, <4 x i8>* %a
1837 %B = load <4 x i8>, <4 x i8>* %b
1838 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1839 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1844 ; The following test cases are generated from this C++ code
1846 ;__m128 blend_01(__m128 a, __m128 b)
1849 ; s = _mm_blend_ps( s, b, 1<<0 );
1850 ; s = _mm_blend_ps( s, b, 1<<1 );
1854 ;__m128 blend_02(__m128 a, __m128 b)
1857 ; s = _mm_blend_ps( s, b, 1<<0 );
1858 ; s = _mm_blend_ps( s, b, 1<<2 );
1862 ;__m128 blend_123(__m128 a, __m128 b)
1865 ; s = _mm_blend_ps( s, b, 1<<1 );
1866 ; s = _mm_blend_ps( s, b, 1<<2 );
1867 ; s = _mm_blend_ps( s, b, 1<<3 );
1871 ; Ideally, we should collapse the following shuffles into a single one.
1873 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
1874 ; SSE2-LABEL: combine_blend_01:
1876 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1879 ; SSSE3-LABEL: combine_blend_01:
1881 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1884 ; SSE41-LABEL: combine_blend_01:
1886 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1889 ; AVX-LABEL: combine_blend_01:
1891 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1893 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
1894 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1895 ret <4 x float> %shuffle6
1898 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
1899 ; SSE2-LABEL: combine_blend_02:
1901 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1902 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1903 ; SSE2-NEXT: movaps %xmm1, %xmm0
1906 ; SSSE3-LABEL: combine_blend_02:
1908 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1909 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1910 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1913 ; SSE41-LABEL: combine_blend_02:
1915 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1918 ; AVX-LABEL: combine_blend_02:
1920 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1922 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
1923 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1924 ret <4 x float> %shuffle6
1927 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
1928 ; SSE2-LABEL: combine_blend_123:
1930 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1931 ; SSE2-NEXT: movaps %xmm1, %xmm0
1934 ; SSSE3-LABEL: combine_blend_123:
1936 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1937 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1940 ; SSE41-LABEL: combine_blend_123:
1942 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1945 ; AVX-LABEL: combine_blend_123:
1947 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1949 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
1950 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
1951 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1952 ret <4 x float> %shuffle12
1955 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
1956 ; SSE-LABEL: combine_test_movhl_1:
1958 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1959 ; SSE-NEXT: movaps %xmm1, %xmm0
1962 ; AVX-LABEL: combine_test_movhl_1:
1964 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1966 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
1967 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
1971 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
1972 ; SSE-LABEL: combine_test_movhl_2:
1974 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1975 ; SSE-NEXT: movaps %xmm1, %xmm0
1978 ; AVX-LABEL: combine_test_movhl_2:
1980 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1982 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
1983 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
1987 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
1988 ; SSE-LABEL: combine_test_movhl_3:
1990 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1991 ; SSE-NEXT: movaps %xmm1, %xmm0
1994 ; AVX-LABEL: combine_test_movhl_3:
1996 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1998 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
1999 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2004 ; Verify that we fold shuffles according to rule:
2005 ; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2007 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2008 ; SSE2-LABEL: combine_undef_input_test1:
2010 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2013 ; SSSE3-LABEL: combine_undef_input_test1:
2015 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2018 ; SSE41-LABEL: combine_undef_input_test1:
2020 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2023 ; AVX-LABEL: combine_undef_input_test1:
2025 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2027 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2028 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2032 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2033 ; SSE-LABEL: combine_undef_input_test2:
2035 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2038 ; AVX-LABEL: combine_undef_input_test2:
2040 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2042 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2043 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2047 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2048 ; SSE-LABEL: combine_undef_input_test3:
2050 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2053 ; AVX-LABEL: combine_undef_input_test3:
2055 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2057 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2058 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2062 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2063 ; SSE-LABEL: combine_undef_input_test4:
2065 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2068 ; AVX-LABEL: combine_undef_input_test4:
2070 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2072 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2073 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2077 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2078 ; SSE2-LABEL: combine_undef_input_test5:
2080 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2083 ; SSSE3-LABEL: combine_undef_input_test5:
2085 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2088 ; SSE41-LABEL: combine_undef_input_test5:
2090 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2093 ; AVX-LABEL: combine_undef_input_test5:
2095 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2097 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2098 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2103 ; Verify that we fold shuffles according to rule:
2104 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2106 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2107 ; CHECK-LABEL: combine_undef_input_test6:
2110 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2111 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2115 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2116 ; SSE2-LABEL: combine_undef_input_test7:
2118 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2121 ; SSSE3-LABEL: combine_undef_input_test7:
2123 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2126 ; SSE41-LABEL: combine_undef_input_test7:
2128 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2131 ; AVX-LABEL: combine_undef_input_test7:
2133 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2135 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2136 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2140 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2141 ; SSE2-LABEL: combine_undef_input_test8:
2143 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2146 ; SSSE3-LABEL: combine_undef_input_test8:
2148 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2151 ; SSE41-LABEL: combine_undef_input_test8:
2153 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2156 ; AVX-LABEL: combine_undef_input_test8:
2158 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2160 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2161 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2165 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2166 ; SSE-LABEL: combine_undef_input_test9:
2168 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2171 ; AVX-LABEL: combine_undef_input_test9:
2173 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
2175 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2176 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2180 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2181 ; CHECK-LABEL: combine_undef_input_test10:
2184 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2185 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2189 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2190 ; SSE2-LABEL: combine_undef_input_test11:
2192 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2195 ; SSSE3-LABEL: combine_undef_input_test11:
2197 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2200 ; SSE41-LABEL: combine_undef_input_test11:
2202 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2205 ; AVX-LABEL: combine_undef_input_test11:
2207 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2209 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2210 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2214 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2215 ; SSE-LABEL: combine_undef_input_test12:
2217 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2220 ; AVX-LABEL: combine_undef_input_test12:
2222 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2224 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2225 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2229 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2230 ; SSE-LABEL: combine_undef_input_test13:
2232 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2235 ; AVX-LABEL: combine_undef_input_test13:
2237 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2239 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2240 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
2244 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2245 ; SSE-LABEL: combine_undef_input_test14:
2247 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2250 ; AVX-LABEL: combine_undef_input_test14:
2252 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2254 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2255 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2259 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2260 ; SSE2-LABEL: combine_undef_input_test15:
2262 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2265 ; SSSE3-LABEL: combine_undef_input_test15:
2267 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2270 ; SSE41-LABEL: combine_undef_input_test15:
2272 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2275 ; AVX-LABEL: combine_undef_input_test15:
2277 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2279 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2280 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2285 ; Verify that shuffles are canonicalized according to rules:
2286 ; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2288 ; This allows us to trigger the following combine rule:
2289 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2291 ; As a result, all the shuffle pairs in each function below should be
2292 ; combined into a single legal shuffle operation.
2294 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
2295 ; CHECK-LABEL: combine_undef_input_test16:
2298 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2299 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
2303 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
2304 ; SSE2-LABEL: combine_undef_input_test17:
2306 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2309 ; SSSE3-LABEL: combine_undef_input_test17:
2311 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2314 ; SSE41-LABEL: combine_undef_input_test17:
2316 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2319 ; AVX-LABEL: combine_undef_input_test17:
2321 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2323 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2324 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2328 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
2329 ; SSE2-LABEL: combine_undef_input_test18:
2331 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2334 ; SSSE3-LABEL: combine_undef_input_test18:
2336 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2339 ; SSE41-LABEL: combine_undef_input_test18:
2341 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2344 ; AVX-LABEL: combine_undef_input_test18:
2346 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2348 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2349 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
2353 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
2354 ; SSE-LABEL: combine_undef_input_test19:
2356 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2359 ; AVX-LABEL: combine_undef_input_test19:
2361 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
2363 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2364 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2368 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
2369 ; CHECK-LABEL: combine_undef_input_test20:
2372 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2373 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2377 ; These tests are designed to test the ability to combine away unnecessary
2378 ; operations feeding into a shuffle. The AVX cases are the important ones as
2379 ; they leverage operations which cannot be done naturally on the entire vector
2380 ; and thus are decomposed into multiple smaller operations.
2382 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
2383 ; SSE-LABEL: combine_unneeded_subvector1:
2385 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2386 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
2387 ; SSE-NEXT: movdqa %xmm0, %xmm1
2390 ; AVX1-LABEL: combine_unneeded_subvector1:
2392 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2393 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2394 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2395 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2398 ; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
2399 ; AVX2-SLOW: # %bb.0:
2400 ; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2401 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2402 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2403 ; AVX2-SLOW-NEXT: retq
2405 ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
2406 ; AVX2-FAST-ALL: # %bb.0:
2407 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2408 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
2409 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2410 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
2411 ; AVX2-FAST-ALL-NEXT: retq
2413 ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
2414 ; AVX2-FAST-PERLANE: # %bb.0:
2415 ; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2416 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2417 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2418 ; AVX2-FAST-PERLANE-NEXT: retq
2419 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2420 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
2424 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
2425 ; SSE-LABEL: combine_unneeded_subvector2:
2427 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2428 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
2429 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2432 ; AVX1-LABEL: combine_unneeded_subvector2:
2434 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2435 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2436 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2437 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2438 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2441 ; AVX2-LABEL: combine_unneeded_subvector2:
2443 ; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2444 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2445 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2447 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2448 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
2452 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
2453 ; SSE2-LABEL: combine_insertps1:
2455 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2456 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2457 ; SSE2-NEXT: movaps %xmm1, %xmm0
2460 ; SSSE3-LABEL: combine_insertps1:
2462 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2463 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2464 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2467 ; SSE41-LABEL: combine_insertps1:
2469 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2472 ; AVX-LABEL: combine_insertps1:
2474 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2477 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
2478 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
2482 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
2483 ; SSE2-LABEL: combine_insertps2:
2485 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2486 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2487 ; SSE2-NEXT: movaps %xmm1, %xmm0
2490 ; SSSE3-LABEL: combine_insertps2:
2492 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2493 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2494 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2497 ; SSE41-LABEL: combine_insertps2:
2499 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2502 ; AVX-LABEL: combine_insertps2:
2504 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2507 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
2508 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2512 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
2513 ; SSE2-LABEL: combine_insertps3:
2515 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2516 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2519 ; SSSE3-LABEL: combine_insertps3:
2521 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2522 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2525 ; SSE41-LABEL: combine_insertps3:
2527 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2530 ; AVX-LABEL: combine_insertps3:
2532 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2535 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2536 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
2540 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
2541 ; SSE2-LABEL: combine_insertps4:
2543 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2544 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2547 ; SSSE3-LABEL: combine_insertps4:
2549 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2550 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2553 ; SSE41-LABEL: combine_insertps4:
2555 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2558 ; AVX-LABEL: combine_insertps4:
2560 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2563 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2564 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
2568 define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
2569 ; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
2571 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2572 ; SSE-NEXT: movaps %xmm0, (%rsi)
2575 ; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
2577 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2578 ; AVX-NEXT: vmovaps %xmm0, (%rsi)
2580 %1 = load double, double* %a0, align 8
2581 %2 = insertelement <2 x double> undef, double %1, i32 0
2582 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
2583 %4 = bitcast <2 x double> %3 to <4 x float>
2584 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
2585 store <4 x float> %5, <4 x float>* %a1, align 16
2590 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
2591 ; SSE2-LABEL: combine_constant_insertion_v4f32:
2593 ; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
2594 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2595 ; SSE2-NEXT: movaps %xmm1, %xmm0
2598 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
2600 ; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
2601 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2602 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2605 ; SSE41-LABEL: combine_constant_insertion_v4f32:
2607 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2610 ; AVX-LABEL: combine_constant_insertion_v4f32:
2612 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2614 %a0 = insertelement <4 x float> undef, float %f, i32 0
2615 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2616 ret <4 x float> %ret
2619 define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
2620 ; SSE2-LABEL: combine_constant_insertion_v4i32:
2622 ; SSE2-NEXT: movd %edi, %xmm1
2623 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30>
2624 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2627 ; SSSE3-LABEL: combine_constant_insertion_v4i32:
2629 ; SSSE3-NEXT: movd %edi, %xmm1
2630 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30>
2631 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2634 ; SSE41-LABEL: combine_constant_insertion_v4i32:
2636 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30>
2637 ; SSE41-NEXT: pinsrd $0, %edi, %xmm0
2640 ; AVX-LABEL: combine_constant_insertion_v4i32:
2642 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
2643 ; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
2645 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
2646 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2650 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
2651 ; SSE2-LABEL: PR22377:
2652 ; SSE2: # %bb.0: # %entry
2653 ; SSE2-NEXT: movaps %xmm0, %xmm1
2654 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
2655 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2656 ; SSE2-NEXT: addps %xmm0, %xmm1
2657 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2660 ; SSSE3-LABEL: PR22377:
2661 ; SSSE3: # %bb.0: # %entry
2662 ; SSSE3-NEXT: movaps %xmm0, %xmm1
2663 ; SSSE3-NEXT: haddps %xmm0, %xmm1
2664 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2665 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2668 ; SSE41-LABEL: PR22377:
2669 ; SSE41: # %bb.0: # %entry
2670 ; SSE41-NEXT: movaps %xmm0, %xmm1
2671 ; SSE41-NEXT: haddps %xmm0, %xmm1
2672 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2673 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2676 ; AVX-LABEL: PR22377:
2677 ; AVX: # %bb.0: # %entry
2678 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
2679 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2680 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2683 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
2684 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2685 %r2 = fadd <4 x float> %s1, %s2
2686 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2690 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
2691 ; SSE2-LABEL: PR22390:
2692 ; SSE2: # %bb.0: # %entry
2693 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2694 ; SSE2-NEXT: movaps %xmm0, %xmm2
2695 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2696 ; SSE2-NEXT: addps %xmm0, %xmm2
2697 ; SSE2-NEXT: movaps %xmm2, %xmm0
2700 ; SSSE3-LABEL: PR22390:
2701 ; SSSE3: # %bb.0: # %entry
2702 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2703 ; SSSE3-NEXT: movaps %xmm0, %xmm2
2704 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2705 ; SSSE3-NEXT: addps %xmm0, %xmm2
2706 ; SSSE3-NEXT: movaps %xmm2, %xmm0
2709 ; SSE41-LABEL: PR22390:
2710 ; SSE41: # %bb.0: # %entry
2711 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2712 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2713 ; SSE41-NEXT: addps %xmm1, %xmm0
2716 ; AVX-LABEL: PR22390:
2717 ; AVX: # %bb.0: # %entry
2718 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2719 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2720 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2723 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
2724 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
2725 %r2 = fadd <4 x float> %s1, %s2
2729 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
2730 ; SSE-LABEL: PR22412:
2731 ; SSE: # %bb.0: # %entry
2732 ; SSE-NEXT: movaps %xmm3, %xmm1
2733 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2734 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
2737 ; AVX1-LABEL: PR22412:
2738 ; AVX1: # %bb.0: # %entry
2739 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
2740 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2741 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
2744 ; AVX2-LABEL: PR22412:
2745 ; AVX2: # %bb.0: # %entry
2746 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2747 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
2748 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
2751 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2752 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
2756 define <4 x float> @PR30264(<4 x float> %x) {
2757 ; SSE2-LABEL: PR30264:
2759 ; SSE2-NEXT: xorps %xmm1, %xmm1
2760 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2761 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
2762 ; SSE2-NEXT: movaps %xmm1, %xmm0
2765 ; SSSE3-LABEL: PR30264:
2767 ; SSSE3-NEXT: xorps %xmm1, %xmm1
2768 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2769 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
2770 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2773 ; SSE41-LABEL: PR30264:
2775 ; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
2776 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
2777 ; SSE41-NEXT: movaps %xmm1, %xmm0
2780 ; AVX-LABEL: PR30264:
2782 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
2783 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
2785 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2786 %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2787 ret <4 x float> %shuf2
2790 define <8 x i16> @PR39549(<16 x i8> %x) {
2791 ; SSE-LABEL: PR39549:
2793 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2794 ; SSE-NEXT: psraw $8, %xmm0
2797 ; AVX-LABEL: PR39549:
2799 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2800 ; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
2802 %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
2803 %b = bitcast <16 x i8> %a to <8 x i16>
2804 %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2805 %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2809 define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
2810 ; SSE-LABEL: PR41545:
2812 ; SSE-NEXT: paddd %xmm1, %xmm0
2815 ; AVX-LABEL: PR41545:
2817 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2819 %1 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
2820 %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
2821 %3 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
2822 %4 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
2823 %5 = zext <4 x i8> %1 to <4 x i32>
2824 %6 = zext <4 x i8> %2 to <4 x i32>
2825 %7 = zext <4 x i8> %3 to <4 x i32>
2826 %8 = zext <4 x i8> %4 to <4 x i32>
2827 %9 = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
2828 %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
2829 %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
2830 %12 = or <4 x i32> %5, %9
2831 %13 = or <4 x i32> %12, %10
2832 %14 = or <4 x i32> %13, %11
2833 %15 = add <4 x i32> %a0, %14
2837 define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
2838 ; SSE-LABEL: shuffle_extract_insert:
2840 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2841 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2844 ; AVX1-LABEL: shuffle_extract_insert:
2846 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2847 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2850 ; AVX2-SLOW-LABEL: shuffle_extract_insert:
2851 ; AVX2-SLOW: # %bb.0:
2852 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2853 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2854 ; AVX2-SLOW-NEXT: retq
2856 ; AVX2-FAST-LABEL: shuffle_extract_insert:
2857 ; AVX2-FAST: # %bb.0:
2858 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
2859 ; AVX2-FAST-NEXT: retq
2860 %a0 = extractelement <8 x i16> %a, i32 0
2861 %a1 = extractelement <8 x i16> %a, i32 1
2862 %a3 = extractelement <8 x i16> %a, i32 3
2863 %a4 = extractelement <8 x i16> %a, i32 4
2864 %a5 = extractelement <8 x i16> %a, i32 5
2865 %a6 = extractelement <8 x i16> %a, i32 6
2866 %a7 = extractelement <8 x i16> %a, i32 7
2867 %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2868 %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
2869 %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
2870 %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
2871 %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
2872 %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
2873 %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
2874 %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
2878 define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
2879 ; SSE2-LABEL: shuffle_extract_insert_double:
2881 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
2882 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
2883 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2884 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
2885 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2886 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2887 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2888 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2889 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2892 ; SSSE3-LABEL: shuffle_extract_insert_double:
2894 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2895 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2896 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2899 ; SSE41-LABEL: shuffle_extract_insert_double:
2901 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2902 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2903 ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2906 ; AVX-LABEL: shuffle_extract_insert_double:
2908 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2909 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2910 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2912 %a0 = extractelement <8 x i16> %a, i32 0
2913 %a4 = extractelement <8 x i16> %a, i32 4
2914 %a6 = extractelement <8 x i16> %a, i32 6
2915 %b11 = extractelement <8 x i16> %b, i32 3
2916 %b13 = extractelement <8 x i16> %b, i32 5
2917 %b15 = extractelement <8 x i16> %b, i32 7
2918 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2919 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
2920 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
2921 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
2922 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
2923 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
2924 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
2928 define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
2929 ; SSE2-LABEL: shuffle_extract_concat_insert:
2931 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2932 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2933 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2934 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2935 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2936 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
2937 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
2938 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2939 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
2940 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2943 ; SSSE3-LABEL: shuffle_extract_concat_insert:
2945 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2946 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2947 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2948 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2951 ; SSE41-LABEL: shuffle_extract_concat_insert:
2953 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2954 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2955 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2956 ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2959 ; AVX-LABEL: shuffle_extract_concat_insert:
2961 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2962 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2963 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2964 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2966 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2967 %a0 = extractelement <8 x i16> %a, i32 0
2968 %a4 = extractelement <8 x i16> %a, i32 4
2969 %a6 = extractelement <8 x i16> %a, i32 6
2970 %b11 = extractelement <8 x i16> %b, i32 3
2971 %b13 = extractelement <8 x i16> %b, i32 5
2972 %b15 = extractelement <8 x i16> %b, i32 7
2973 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2974 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
2975 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
2976 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
2977 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
2978 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
2979 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
2983 define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) {
2984 ; SSE2-LABEL: shuffle_scalar_to_vector_extract:
2986 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2987 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2988 ; SSE2-NEXT: psraw $8, %xmm1
2989 ; SSE2-NEXT: pextrw $7, %xmm1, %eax
2990 ; SSE2-NEXT: movd %eax, %xmm2
2991 ; SSE2-NEXT: movsbl (%rsi), %eax
2992 ; SSE2-NEXT: movd %eax, %xmm0
2993 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2994 ; SSE2-NEXT: movsbl (%rdx), %eax
2995 ; SSE2-NEXT: movd %eax, %xmm0
2996 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2997 ; SSE2-NEXT: pxor %xmm0, %xmm0
2998 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2999 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3000 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3003 ; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
3005 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3006 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3007 ; SSSE3-NEXT: psraw $8, %xmm1
3008 ; SSSE3-NEXT: movsbl (%rsi), %eax
3009 ; SSSE3-NEXT: movd %eax, %xmm2
3010 ; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3011 ; SSSE3-NEXT: movsbl (%rdx), %eax
3012 ; SSSE3-NEXT: movd %eax, %xmm0
3013 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3014 ; SSSE3-NEXT: pxor %xmm0, %xmm0
3015 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3016 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3017 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3020 ; SSE41-LABEL: shuffle_scalar_to_vector_extract:
3022 ; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
3023 ; SSE41-NEXT: pextrw $4, %xmm0, %eax
3024 ; SSE41-NEXT: pextrw $7, %xmm0, %ecx
3025 ; SSE41-NEXT: pxor %xmm0, %xmm0
3026 ; SSE41-NEXT: pinsrw $1, %eax, %xmm0
3027 ; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB
3028 ; SSE41-NEXT: pinsrw $2, %eax, %xmm0
3029 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
3030 ; SSE41-NEXT: movsbl (%rsi), %eax
3031 ; SSE41-NEXT: pinsrw $5, %eax, %xmm0
3032 ; SSE41-NEXT: movsbl (%rdx), %eax
3033 ; SSE41-NEXT: pinsrw $6, %eax, %xmm0
3036 ; AVX-LABEL: shuffle_scalar_to_vector_extract:
3038 ; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
3039 ; AVX-NEXT: vpextrw $4, %xmm0, %eax
3040 ; AVX-NEXT: vpextrw $7, %xmm0, %ecx
3041 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3042 ; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
3043 ; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB
3044 ; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
3045 ; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
3046 ; AVX-NEXT: movsbl (%rsi), %eax
3047 ; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
3048 ; AVX-NEXT: movsbl (%rdx), %eax
3049 ; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
3051 %tmp = load <8 x i8>, <8 x i8>* %p0, align 1
3052 %tmp1 = sext <8 x i8> %tmp to <8 x i16>
3053 %tmp2 = load i8, i8* %p1, align 1
3054 %cvt1 = sext i8 %tmp2 to i16
3055 %tmp3 = load i8, i8* %p2, align 1
3056 %cvt2 = sext i8 %tmp3 to i16
3057 %tmp4 = extractelement <8 x i16> %tmp1, i32 4
3058 %tmp5 = extractelement <8 x i16> %tmp1, i32 7
3059 %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
3060 %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
3061 %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
3062 %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
3063 %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
3064 %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
3065 %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
3066 %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
3067 ret <8 x i16> %tmp13
3070 ; Bug noticed in D96345
3071 define i32 @shuffle_binops_with_undef() {
3072 ; SSE-LABEL: shuffle_binops_with_undef:
3073 ; SSE: # %bb.0: # %entry
3074 ; SSE-NEXT: movdqa (%rax), %xmm0
3075 ; SSE-NEXT: paddw %xmm0, %xmm0
3076 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3077 ; SSE-NEXT: psrlw %xmm1, %xmm0
3078 ; SSE-NEXT: movdqa %xmm0, (%rax)
3081 ; AVX-LABEL: shuffle_binops_with_undef:
3082 ; AVX: # %bb.0: # %entry
3083 ; AVX-NEXT: vmovdqa (%rax), %xmm0
3084 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
3085 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3086 ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
3087 ; AVX-NEXT: vmovdqa %xmm0, (%rax)
3090 %load0 = load <8 x i16>, <8 x i16>* undef, align 16
3091 %load1 = load <8 x i16>, <8 x i16>* undef, align 16
3092 %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
3093 %addi = add <8 x i16> %load0, %load1
3094 %bc0 = bitcast <8 x i16> %addi to <2 x i64>
3095 %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16>
3096 %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
3097 %addi24 = add <8 x i16> %shuf1, %bc1
3098 %bc2 = bitcast <8 x i16> %addi24 to <2 x i64>
3099 %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2>
3100 %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16>
3101 %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (i32 ()* @shuffle_binops_with_undef to i32))
3102 store <8 x i16> %psrli, <8 x i16>* undef, align 16
3105 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
3107 define void @PR43024() {
3108 ; SSE-LABEL: PR43024:
3110 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3111 ; SSE-NEXT: movaps %xmm0, (%rax)
3112 ; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3113 ; SSE-NEXT: xorps %xmm1, %xmm1
3114 ; SSE-NEXT: addss %xmm1, %xmm0
3115 ; SSE-NEXT: addss %xmm1, %xmm0
3116 ; SSE-NEXT: movss %xmm0, (%rax)
3119 ; AVX-LABEL: PR43024:
3121 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3122 ; AVX-NEXT: vmovaps %xmm0, (%rax)
3123 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
3124 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
3125 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
3126 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0
3127 ; AVX-NEXT: vmovss %xmm0, (%rax)
3129 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
3130 %1 = load <4 x float>, <4 x float>* undef, align 16
3131 %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
3132 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
3133 %4 = fadd <4 x float> %2, %3
3134 %5 = fadd <4 x float> zeroinitializer, %4
3135 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
3136 %7 = fadd <4 x float> %6, %5
3137 %8 = extractelement <4 x float> %7, i32 0
3138 store float %8, float* undef, align 8
3142 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
3143 ; SSE2-LABEL: PR45604:
3145 ; SSE2-NEXT: movdqa (%rsi), %xmm1
3146 ; SSE2-NEXT: movd %xmm1, %eax
3147 ; SSE2-NEXT: movzwl %ax, %eax
3148 ; SSE2-NEXT: movd %eax, %xmm0
3149 ; SSE2-NEXT: movl $11, %eax
3150 ; SSE2-NEXT: pinsrw $2, %eax, %xmm0
3151 ; SSE2-NEXT: pextrw $1, %xmm1, %ecx
3152 ; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
3153 ; SSE2-NEXT: pinsrw $6, %eax, %xmm0
3154 ; SSE2-NEXT: pextrw $2, %xmm1, %ecx
3155 ; SSE2-NEXT: movd %ecx, %xmm2
3156 ; SSE2-NEXT: pinsrw $2, %eax, %xmm2
3157 ; SSE2-NEXT: pextrw $3, %xmm1, %ecx
3158 ; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
3159 ; SSE2-NEXT: pinsrw $6, %eax, %xmm2
3160 ; SSE2-NEXT: pextrw $4, %xmm1, %ecx
3161 ; SSE2-NEXT: movd %ecx, %xmm3
3162 ; SSE2-NEXT: pinsrw $2, %eax, %xmm3
3163 ; SSE2-NEXT: pextrw $5, %xmm1, %ecx
3164 ; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
3165 ; SSE2-NEXT: pinsrw $6, %eax, %xmm3
3166 ; SSE2-NEXT: pextrw $6, %xmm1, %ecx
3167 ; SSE2-NEXT: movd %ecx, %xmm4
3168 ; SSE2-NEXT: pinsrw $2, %eax, %xmm4
3169 ; SSE2-NEXT: pextrw $7, %xmm1, %ecx
3170 ; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
3171 ; SSE2-NEXT: pinsrw $6, %eax, %xmm4
3172 ; SSE2-NEXT: movdqa %xmm4, 48(%rdi)
3173 ; SSE2-NEXT: movdqa %xmm3, 32(%rdi)
3174 ; SSE2-NEXT: movdqa %xmm2, 16(%rdi)
3175 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
3178 ; SSSE3-LABEL: PR45604:
3180 ; SSSE3-NEXT: movdqa (%rsi), %xmm1
3181 ; SSSE3-NEXT: movd %xmm1, %eax
3182 ; SSSE3-NEXT: movzwl %ax, %eax
3183 ; SSSE3-NEXT: movd %eax, %xmm0
3184 ; SSSE3-NEXT: movl $11, %eax
3185 ; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
3186 ; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
3187 ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
3188 ; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
3189 ; SSSE3-NEXT: pextrw $2, %xmm1, %ecx
3190 ; SSSE3-NEXT: movd %ecx, %xmm2
3191 ; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
3192 ; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
3193 ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
3194 ; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
3195 ; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
3196 ; SSSE3-NEXT: movd %ecx, %xmm3
3197 ; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
3198 ; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
3199 ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
3200 ; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
3201 ; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
3202 ; SSSE3-NEXT: movd %ecx, %xmm4
3203 ; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
3204 ; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
3205 ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
3206 ; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
3207 ; SSSE3-NEXT: movdqa %xmm4, 48(%rdi)
3208 ; SSSE3-NEXT: movdqa %xmm3, 32(%rdi)
3209 ; SSSE3-NEXT: movdqa %xmm2, 16(%rdi)
3210 ; SSSE3-NEXT: movdqa %xmm0, (%rdi)
3213 ; SSE41-LABEL: PR45604:
3215 ; SSE41-NEXT: movdqa (%rsi), %xmm1
3216 ; SSE41-NEXT: pextrw $2, %xmm1, %eax
3217 ; SSE41-NEXT: movd %eax, %xmm0
3218 ; SSE41-NEXT: movl $11, %eax
3219 ; SSE41-NEXT: pinsrw $2, %eax, %xmm0
3220 ; SSE41-NEXT: pextrw $3, %xmm1, %ecx
3221 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
3222 ; SSE41-NEXT: pinsrw $6, %eax, %xmm0
3223 ; SSE41-NEXT: pextrw $4, %xmm1, %ecx
3224 ; SSE41-NEXT: movd %ecx, %xmm2
3225 ; SSE41-NEXT: pinsrw $2, %eax, %xmm2
3226 ; SSE41-NEXT: pextrw $5, %xmm1, %ecx
3227 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
3228 ; SSE41-NEXT: pinsrw $6, %eax, %xmm2
3229 ; SSE41-NEXT: pextrw $6, %xmm1, %ecx
3230 ; SSE41-NEXT: movd %ecx, %xmm3
3231 ; SSE41-NEXT: pinsrw $2, %eax, %xmm3
3232 ; SSE41-NEXT: pextrw $7, %xmm1, %ecx
3233 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
3234 ; SSE41-NEXT: pinsrw $6, %eax, %xmm3
3235 ; SSE41-NEXT: pxor %xmm4, %xmm4
3236 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
3237 ; SSE41-NEXT: pinsrw $2, %eax, %xmm4
3238 ; SSE41-NEXT: pextrw $1, %xmm1, %ecx
3239 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
3240 ; SSE41-NEXT: pinsrw $6, %eax, %xmm4
3241 ; SSE41-NEXT: movdqa %xmm4, (%rdi)
3242 ; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
3243 ; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
3244 ; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
3247 ; AVX1-LABEL: PR45604:
3249 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
3250 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3251 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3252 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
3253 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3254 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3255 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3256 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3257 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
3258 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
3259 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3260 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3261 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3262 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
3263 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
3264 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
3265 ; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
3266 ; AVX1-NEXT: vzeroupper
3269 ; AVX2-LABEL: PR45604:
3271 ; AVX2-NEXT: vmovdqa (%rsi), %xmm0
3272 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
3273 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
3274 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3275 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
3276 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
3277 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
3278 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3279 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
3280 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
3281 ; AVX2-NEXT: vmovdqu %ymm1, (%rdi)
3282 ; AVX2-NEXT: vzeroupper
3284 %v1 = load <8 x i16>, <8 x i16>* %src, align 16
3285 %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3286 %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
3287 store <32 x i16> %v3, <32 x i16>* %dst, align 16
3291 ; Test case reported on D105827
3292 define void @SpinningCube() {
3293 ; SSE2-LABEL: SpinningCube:
3294 ; SSE2: # %bb.0: # %entry
3295 ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
3296 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3297 ; SSE2-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
3298 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3299 ; SSE2-NEXT: movaps %xmm2, %xmm3
3300 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
3301 ; SSE2-NEXT: xorps %xmm4, %xmm4
3302 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
3303 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
3304 ; SSE2-NEXT: addps %xmm4, %xmm2
3305 ; SSE2-NEXT: movaps %xmm2, (%rax)
3306 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3307 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
3308 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
3309 ; SSE2-NEXT: mulps %xmm2, %xmm1
3310 ; SSE2-NEXT: addps %xmm0, %xmm1
3311 ; SSE2-NEXT: movaps %xmm1, (%rax)
3314 ; SSSE3-LABEL: SpinningCube:
3315 ; SSSE3: # %bb.0: # %entry
3316 ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
3317 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3318 ; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
3319 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3320 ; SSSE3-NEXT: movaps %xmm2, %xmm3
3321 ; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
3322 ; SSSE3-NEXT: xorps %xmm4, %xmm4
3323 ; SSSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
3324 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
3325 ; SSSE3-NEXT: addps %xmm4, %xmm2
3326 ; SSSE3-NEXT: movaps %xmm2, (%rax)
3327 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3328 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
3329 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
3330 ; SSSE3-NEXT: mulps %xmm1, %xmm2
3331 ; SSSE3-NEXT: addps %xmm0, %xmm2
3332 ; SSSE3-NEXT: movaps %xmm2, (%rax)
3335 ; SSE41-LABEL: SpinningCube:
3336 ; SSE41: # %bb.0: # %entry
3337 ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
3338 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3339 ; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
3340 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
3341 ; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3342 ; SSE41-NEXT: movaps %xmm1, %xmm3
3343 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
3344 ; SSE41-NEXT: movaps %xmm0, %xmm4
3345 ; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
3346 ; SSE41-NEXT: addps %xmm3, %xmm4
3347 ; SSE41-NEXT: movaps %xmm4, (%rax)
3348 ; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3349 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
3350 ; SSE41-NEXT: mulps %xmm1, %xmm2
3351 ; SSE41-NEXT: addps %xmm0, %xmm2
3352 ; SSE41-NEXT: movaps %xmm2, (%rax)
3355 ; AVX1-LABEL: SpinningCube:
3356 ; AVX1: # %bb.0: # %entry
3357 ; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
3358 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3359 ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
3360 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
3361 ; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
3362 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
3363 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
3364 ; AVX1-NEXT: vaddps %xmm3, %xmm2, %xmm2
3365 ; AVX1-NEXT: vmovaps %xmm2, (%rax)
3366 ; AVX1-NEXT: vbroadcastss (%rax), %xmm2
3367 ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
3368 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
3369 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
3370 ; AVX1-NEXT: vmovaps %xmm0, (%rax)
3373 ; AVX2-LABEL: SpinningCube:
3374 ; AVX2: # %bb.0: # %entry
3375 ; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
3376 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
3377 ; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
3378 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
3379 ; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
3380 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
3381 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
3382 ; AVX2-NEXT: vaddps %xmm3, %xmm2, %xmm2
3383 ; AVX2-NEXT: vmovaps %xmm2, (%rax)
3384 ; AVX2-NEXT: vbroadcastss (%rax), %xmm2
3385 ; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
3386 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
3387 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
3388 ; AVX2-NEXT: vmovaps %xmm0, (%rax)
3391 store float 1.000000e+00, float* undef, align 4
3392 %0 = load float, float* undef, align 4
3393 %1 = fmul float undef, 0.000000e+00
3394 %2 = insertelement <4 x float> poison, float %0, i32 3
3395 %3 = load float, float* undef, align 4
3396 %4 = insertelement <2 x float> poison, float %3, i32 0
3397 %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
3398 %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
3399 %7 = fadd float %1, undef
3400 %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3401 %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3402 %10 = insertelement <4 x float> %9, float %7, i32 3
3403 %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
3404 %12 = insertelement <4 x float> %11, float undef, i32 0
3405 %13 = insertelement <4 x float> %12, float undef, i32 2
3406 %14 = fadd <4 x float> %10, %13
3407 store <4 x float> %14, <4 x float>* undef, align 16
3408 %15 = load float, float* undef, align 4
3409 %16 = insertelement <2 x float> poison, float %15, i32 0
3410 %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
3411 %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
3412 %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3413 %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3414 %21 = fadd <4 x float> %20, %2
3415 store <4 x float> %21, <4 x float>* undef, align 16