; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse | FileCheck %s -check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s -check-prefixes=SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=AVX,AVX512

; Verify that each of the following test cases is folded into a single
; instruction which performs a blend operation.
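;
; Illustrative sketch (informal note, not part of the autogenerated checks):
; each operand of the OR is either a shuffle with a zero vector or an AND with
; a constant mask, so the non-zero lanes of the two operands never overlap,
; e.g. in test1
;   %shuf1 = <a0, 0>,  %shuf2 = <0, b1>
; and the OR is just a per-lane select of %a and %b, which can lower to a
; single blend (blendps/vblendps, or shufps/movsd/movss before SSE4.1).
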
define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
  %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
  %or = or <4 x i32> %and1, %and2
  ret <4 x i32> %or
}

define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  %and1 = and <2 x i64> %a, <i64 -1, i64 0>
  %and2 = and <2 x i64> %b, <i64 0, i64 -1>
  %or = or <2 x i64> %and1, %and2
  ret <2 x i64> %or
}

define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
  %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
  %or = or <4 x i32> %and1, %and2
  ret <4 x i32> %or
}

define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test10:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test10:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %and1 = and <2 x i64> %a, <i64 0, i64 -1>
  %and2 = and <2 x i64> %b, <i64 -1, i64 0>
  %or = or <2 x i64> %and1, %and2
  ret <2 x i64> %or
}

define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test11:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE4-LABEL: test11:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
  %or = or <4 x i32> %and1, %and2
  ret <4 x i32> %or
}

define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test12:
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-LABEL: test12:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
  %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
  %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
  %or = or <4 x i32> %and1, %and2
  ret <4 x i32> %or
}

; Verify that the following test cases are folded into single shuffles.
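;
; Illustrative sketch (informal note, not autogenerated): the non-zero lanes
; still partition cleanly between the two operands, but the selected elements
; are permuted rather than kept in place, e.g. for test13
;   or(<a1, a1, 0, 0>, <0, 0, b2, b3>) == <a1, a1, b2, b3>,
; which is a single shuffle of %a and %b (shufps/movlhps) instead of a blend.
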
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
; SSE-NEXT: movaps %xmm1, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,1],xmm0[2,1]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

; Verify that the dag-combiner does not fold an OR of two shuffles into a single
; shuffle instruction when the shuffle indices are not compatible.
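;
; For example (informal reasoning, not autogenerated): in test17 the OR of
;   %shuf1 = <0, a0, 0, a2> and %shuf2 = <b0, b1, 0, 0>
; needs lane 1 to hold a0 | b1, a genuine bitwise OR of two source elements,
; so no single shuffle or blend of %a and %b can produce the result.
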
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; SSE-NEXT: por %xmm1, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test18:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: orps %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE4-LABEL: test18:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE4-NEXT: por %xmm0, %xmm2
; SSE4-NEXT: movdqa %xmm2, %xmm0
; AVX1-LABEL: test18:
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test18:
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test18:
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test19:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,2]
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE4-LABEL: test19:
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
; SSE4-NEXT: pxor %xmm3, %xmm3
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
; SSE4-NEXT: por %xmm2, %xmm0
; AVX1-LABEL: test19:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-LABEL: test19:
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX512-LABEL: test19:
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[12,13,14,15]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9,10,11,8,9,10,11]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-LABEL: test21:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-LABEL: test21:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX512-LABEL: test21:
; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}

; Verify that the dag-combiner keeps the correct execution domain for float/double
; vectors that are bitcast to integers in order to use the mask-or blend combine.
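;
; Informal note (based on the checks below): even though the mask-and-or logic
; is written on bitcast integer vectors, the expected lowering stays in the
; floating-point domain (movsd/blendps/vblendps) rather than switching to
; integer blends such as pblendw.
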
define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
; SSE2-LABEL: test22:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test22:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %bc1 = bitcast <2 x double> %a0 to <2 x i64>
  %bc2 = bitcast <2 x double> %a1 to <2 x i64>
  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
  %or = or <2 x i64> %and1, %and2
  %bc3 = bitcast <2 x i64> %or to <2 x double>
  ret <2 x double> %bc3
}

define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
; SSE2-LABEL: test23:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE4-LABEL: test23:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
  %bc2 = bitcast <4 x float> %a1 to <4 x i32>
  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
  %or = or <4 x i32> %and1, %and2
  %bc3 = bitcast <4 x i32> %or to <4 x float>
  ret <4 x float> %bc3
}

define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
; SSE2-LABEL: test24:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test24:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %bc1 = bitcast <4 x float> %a0 to <2 x i64>
  %bc2 = bitcast <4 x float> %a1 to <2 x i64>
  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
  %or = or <2 x i64> %and1, %and2
  %bc3 = bitcast <2 x i64> %or to <4 x float>
  ret <4 x float> %bc3
}

define <4 x float> @test25(<4 x float> %a0) {
; SSE2-LABEL: test25:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],mem[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE4-LABEL: test25:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
; AVX1-LABEL: test25:
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
; AVX2-LABEL: test25:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX512-LABEL: test25:
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
  %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
  %or = or <4 x i32> %and1, %and2
  %bc3 = bitcast <4 x i32> %or to <4 x float>
  ret <4 x float> %bc3
}

; Verify that the DAGCombiner doesn't crash when checking whether a shuffle
; with an illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows
; how to handle legal vector value types.
define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; SSE2-LABEL: test_crash:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: orps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE4-LABEL: test_crash:
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX-LABEL: test_crash:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
  %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i8> %shuf1, %shuf2
  ret <4 x i8> %or
}

; Verify that we can fold regardless of which operand is the zeroinitializer

define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test2b:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test2b:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test2c:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test2c:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test2d:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test2d:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Make sure the fold still applies when an undef index appears where an index into the zero vector would otherwise be

define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test2e:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test2e:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test2f:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-LABEL: test2f:
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
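;
; Informal check of the identity:
;   (or X, c2) & (c1 | c2)
;     = (X & c1) | (X & c2) | (c1 & c2) | c2   ; distribute
;     = (X & c1) | c2                          ; the extra terms are subsets of c2
; e.g. for the second lane of or_and_v4i32 below (c1 = 3, c2 = 2):
;   ((x & 3) | 2) == ((x | 2) & 3) for every x.
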
define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
; SSE-LABEL: or_and_v2i64:
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX1-LABEL: or_and_v2i64:
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-LABEL: or_and_v2i64:
; AVX2-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-LABEL: or_and_v2i64:
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
  %1 = and <2 x i64> %a0, <i64 7, i64 7>
  %2 = or <2 x i64> %1, <i64 3, i64 3>
  ret <2 x i64> %2
}

define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
; SSE-LABEL: or_and_v4i32:
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX1-LABEL: or_and_v4i32:
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-LABEL: or_and_v4i32:
; AVX2-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-LABEL: or_and_v4i32:
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
  %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
  %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
  ret <4 x i32> %2
}

; If all masked bits are going to be set, that's a constant fold.
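;
; Informal example: with the constants below, ((x & 1) | 3) == 3 for every x,
; because every bit the AND can keep is already set by the OR, so the whole
; expression folds to the constant splat <i32 3, i32 3, i32 3, i32 3>.
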
define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
; SSE-LABEL: or_and_v4i32_fold:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; AVX-LABEL: or_and_v4i32_fold:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
  %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
  %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}