llvm/test/CodeGen/X86/2012-04-26-sdglue.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
   3
   4 ; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
   5 ;                  N->getNodeId() == -1 && "Node already inserted!"
   6
   ; @func is the IR under test. It loads two <4 x float> values from fixed
   ; addresses, mixes them with the vector arguments through byte/lane shuffles,
   ; the vinsertf128 and hadd AVX intrinsics and float arithmetic, and stores a
   ; single <8 x float> result through %f. The CHECK lines below pin the
   ; assembly llc is expected to emit for the target given on the RUN line.
   7 define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, ptr %f) nounwind ssp {
   8 ; CHECK-LABEL: func:
   9 ; CHECK:       ## %bb.0:
  10 ; CHECK-NEXT:    vmovdqu 0, %xmm0
  11 ; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
  12 ; CHECK-NEXT:    vmulps %xmm1, %xmm1, %xmm1
  13 ; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
  14 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
  15 ; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
  16 ; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
  17 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
  18 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  19 ; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
  20 ; CHECK-NEXT:    vhaddps %ymm4, %ymm0, %ymm0
  21 ; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
  22 ; CHECK-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
  23 ; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
  24 ; CHECK-NEXT:    vzeroupper
  25 ; CHECK-NEXT:    retq
       ; Two loads with align 1 from constant addresses: one from null, one from
       ; two <4 x float> elements past null.
  26   %tmp = load <4 x float>, ptr null, align 1
  27   %tmp14 = getelementptr <4 x float>, ptr null, i32 2
  28   %tmp15 = load <4 x float>, ptr %tmp14, align 1
       ; Widen %tmp to 8 lanes: lanes 0-3 come from %tmp, lanes 4-7 all pick
       ; element 0 of the second operand (0.0). %a is then inserted with
       ; immediate 1.
  29   %tmp16 = shufflevector <4 x float> %tmp, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  30   %tmp17 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp16, <4 x float> %a, i8 1)
       ; Byte shuffle: bytes 4-15 of the first operand followed by bytes 0-3 of
       ; the second (mask indices 16-19). This is the pattern matched by the
       ; vpalignr CHECK line.
  31   %tmp18 = bitcast <4 x float> %tmp to <16 x i8>
  32   %tmp19 = shufflevector <16 x i8> %tmp18, <16 x i8> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  33   %tmp20 = bitcast <16 x i8> %tmp19 to <4 x float>
       ; Same byte-shuffle mask, applied to %c and the second loaded vector.
  34   %tmp21 = bitcast <4 x float> %tmp15 to <16 x i8>
  35   %tmp22 = shufflevector <16 x i8> %c, <16 x i8> %tmp21, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  36   %tmp23 = bitcast <16 x i8> %tmp22 to <4 x float>
       ; Widen %tmp20 the same way as %tmp16 (upper lanes = 0.0), then insert
       ; %tmp23 with immediate 1.
  37   %tmp24 = shufflevector <4 x float> %tmp20, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  38   %tmp25 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp24, <4 x float> %tmp23, i8 1)
       ; Square both 8-lane values, add the squares, then double the sum.
  39   %tmp26 = fmul <8 x float> %tmp17, %tmp17
  40   %tmp27 = fmul <8 x float> %tmp25, %tmp25
  41   %tmp28 = fadd <8 x float> %tmp26, %tmp27
  42   %tmp29 = fadd <8 x float> %tmp28, %tmp28
       ; Keep only lanes 0-3 of %tmp29; the mask never selects a lane of %d.
  43   %tmp30 = shufflevector <8 x float> %tmp29, <8 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  44   %tmp31 = fmul <4 x float> %tmp30, %tmp30
       ; Insert the squared 4-lane value into an all-zero 8-lane vector with
       ; immediate 1, then double it.
  45   %tmp32 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> zeroinitializer, <4 x float> %tmp31, i8 1)
  46   %tmp33 = fadd <8 x float> %tmp32, %tmp32
       ; hadd with %e, subtract the result from itself, then hadd again with an
       ; all-zero first operand.
  47   %tmp34 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %tmp33, <8 x float> %e) nounwind
  48   %tmp35 = fsub <8 x float> %tmp34, %tmp34
  49   %tmp36 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> zeroinitializer, <8 x float> %tmp35) nounwind
       ; The only visible side effect: a 32-byte-aligned store of the result
       ; through %f.
  50   store <8 x float> %tmp36, ptr %f, align 32
  51   ret void
  52 }
  53
  ; Declarations of the two x86 AVX intrinsics called by @func. Both are
  ; marked nounwind readnone.
  ; vinsertf128.ps.256: takes an <8 x float>, a <4 x float> and an i8
  ; immediate; returns an <8 x float>.
  54 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
  55
  ; hadd.ps.256: takes two <8 x float> operands; returns an <8 x float>.
  56 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone