test/CodeGen/X86/sse_reload_fold.ll

   1 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
   2 ; CHECK: fail
   3 ; CHECK-NOT: fail
     ; Taken together, the two check directives above accept exactly one "fail"
     ; in llc's combined stdout and stderr. The stream merge is spelled with
     ; the POSIX "2>&1 |" form rather than the non-portable "|&" shorthand.
   4
   5 declare float @test_f(float %f) ; external helpers: every test function below calls one of these and then reuses its own argument
   6 declare double @test_d(double %f)
   7 declare <4 x float> @test_vf(<4 x float> %f)
   8 declare <2 x double> @test_vd(<2 x double> %f)
   9 declare float @llvm.sqrt.f32(float) ; generic scalar sqrt intrinsics, used by @foo and @doo
  10 declare double @llvm.sqrt.f64(double)
  11
  12 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) ; <4 x float> x86 intrinsics (sse / sse3 namespaces)
  13 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>)
  14 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
  15 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
  16 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
  17 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
  18 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
  19 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
  20 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
  21 declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) ; <2 x double> x86 intrinsics (sse2 / sse3 namespaces)
  22 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
  23 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
  24 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
  25 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
  26 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
  27 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
  28
  29 define float @foo(float %f) { ; scalar f32 case: sqrt of an argument that is used on both sides of a call
  30   %a = call float @test_f(float %f) ; result %a is never read
  31   %t = call float @llvm.sqrt.f32(float %f) ; %f is needed again here, after the call
  32   ret float %t
  33 }
  34 define double @doo(double %f) { ; scalar f64 counterpart of @foo
  35   %a = call double @test_d(double %f) ; result %a is never read
  36   %t = call double @llvm.sqrt.f64(double %f) ; %f is needed again here, after the call
  37   ret double %t
  38 }
  39 define <4 x float> @a0(<4 x float> %f) { ; unary <4 x float> case: rsqrt.ps of %f after a call
  40   %a = call <4 x float> @test_vf(<4 x float> %f) ; result %a is never read
  41   %t = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %f) ; sole operand is the value that spans the call
  42   ret <4 x float> %t
  43 }
  44 define <4 x float> @a1(<4 x float> %f) { ; unary <4 x float> case: sqrt.ps of %f after a call
  45   %a = call <4 x float> @test_vf(<4 x float> %f) ; result %a is never read
  46   %t = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %f) ; sole operand is the value that spans the call
  47   ret <4 x float> %t
  48 }
  49 define <4 x float> @a2(<4 x float> %f) { ; unary <4 x float> case: rcp.ps of %f after a call
  50   %a = call <4 x float> @test_vf(<4 x float> %f) ; result %a is never read
  51   %t = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %f) ; sole operand is the value that spans the call
  52   ret <4 x float> %t
  53 }
  54 define <4 x float> @b3(<4 x float> %f) { ; binary <4 x float> case: min.ps(call result, %f)
  55   %y = call <4 x float> @test_vf(<4 x float> %f)
  56   %t = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %y, <4 x float> %f) ; %f, which spans the call, is the SECOND operand
  57   ret <4 x float> %t
  58 }
  59 define <4 x float> @b4(<4 x float> %f) { ; binary <4 x float> case: max.ps(call result, %f)
  60   %y = call <4 x float> @test_vf(<4 x float> %f)
  61   %t = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %y, <4 x float> %f) ; %f, which spans the call, is the SECOND operand
  62   ret <4 x float> %t
  63 }
  64 define <4 x float> @b5(<4 x float> %f) { ; binary <4 x float> case: cmp.ps(call result, %f, imm)
  65   %y = call <4 x float> @test_vf(<4 x float> %f)
  66   %t = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %y, <4 x float> %f, i8 7) ; %f is the second operand; NOTE(review): i8 7 is presumably the "ordered" predicate, confirm against the ISA manual
  67   ret <4 x float> %t
  68 }
  69 define <4 x float> @b6(<4 x float> %f) { ; binary <4 x float> case: sse3 addsub.ps(call result, %f)
  70   %y = call <4 x float> @test_vf(<4 x float> %f)
  71   %t = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %y, <4 x float> %f) ; %f, which spans the call, is the SECOND operand
  72   ret <4 x float> %t
  73 }
  74 define <4 x float> @b7(<4 x float> %f) { ; binary <4 x float> case: sse3 hadd.ps(call result, %f)
  75   %y = call <4 x float> @test_vf(<4 x float> %f)
  76   %t = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %y, <4 x float> %f) ; %f, which spans the call, is the SECOND operand
  77   ret <4 x float> %t
  78 }
  79 define <4 x float> @b8(<4 x float> %f) { ; binary <4 x float> case: sse3 hsub.ps(call result, %f)
  80   %y = call <4 x float> @test_vf(<4 x float> %f)
  81   %t = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %y, <4 x float> %f) ; %f, which spans the call, is the SECOND operand
  82   ret <4 x float> %t
  83 }
  84 define <2 x double> @c1(<2 x double> %f) { ; unary <2 x double> case: sqrt.pd of %f after a call
  85   %a = call <2 x double> @test_vd(<2 x double> %f) ; result %a is never read
  86   %t = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %f) ; sole operand is the value that spans the call
  87   ret <2 x double> %t
  88 }
  89 define <2 x double> @d3(<2 x double> %f) { ; binary <2 x double> case: min.pd(call result, %f)
  90   %y = call <2 x double> @test_vd(<2 x double> %f)
  91   %t = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %y, <2 x double> %f) ; %f, which spans the call, is the SECOND operand
  92   ret <2 x double> %t
  93 }
  94 define <2 x double> @d4(<2 x double> %f) { ; binary <2 x double> case: max.pd(call result, %f)
  95   %y = call <2 x double> @test_vd(<2 x double> %f)
  96   %t = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %y, <2 x double> %f) ; %f, which spans the call, is the SECOND operand
  97   ret <2 x double> %t
  98 }
  99 define <2 x double> @d5(<2 x double> %f) { ; binary <2 x double> case: cmp.pd(call result, %f, imm)
 100   %y = call <2 x double> @test_vd(<2 x double> %f)
 101   %t = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %y, <2 x double> %f, i8 7) ; %f is the second operand; NOTE(review): i8 7 is presumably the "ordered" predicate, confirm against the ISA manual
 102   ret <2 x double> %t
 103 }
 104 define <2 x double> @d6(<2 x double> %f) { ; binary <2 x double> case: sse3 addsub.pd(call result, %f)
 105   %y = call <2 x double> @test_vd(<2 x double> %f)
 106   %t = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %y, <2 x double> %f) ; %f, which spans the call, is the SECOND operand
 107   ret <2 x double> %t
 108 }
 109 define <2 x double> @d7(<2 x double> %f) { ; binary <2 x double> case: sse3 hadd.pd(call result, %f)
 110   %y = call <2 x double> @test_vd(<2 x double> %f)
 111   %t = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %y, <2 x double> %f) ; %f, which spans the call, is the SECOND operand
 112   ret <2 x double> %t
 113 }
 114 define <2 x double> @d8(<2 x double> %f) { ; binary <2 x double> case: sse3 hsub.pd(call result, %f); compare with @z0, which swaps the operands
 115   %y = call <2 x double> @test_vd(<2 x double> %f)
 116   %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %y, <2 x double> %f) ; %f, which spans the call, is the SECOND operand
 117   ret <2 x double> %t
 118 }
 119
 120 ; This one should fail to fuse, but -regalloc=greedy doesn't even attempt the
 121 ; fold. Instead, greedy produces:
 122 ;   callq       test_vd
 123 ;   movapd      (%rsp), %xmm1           # 16-byte Reload
 124 ;   hsubpd      %xmm0, %xmm1
 125 ;   movapd      %xmm1, %xmm0
 126 ;   addq        $24, %rsp
 127 ;   ret
 128 ; RABasic (-regalloc=basic, which this test runs with) still tries to fold this one.
 129
 130 define <2 x double> @z0(<2 x double> %f) { ; same intrinsic as @d8 but with the operands swapped; the preceding comment marks this as the case expected not to fuse
 131   %y = call <2 x double> @test_vd(<2 x double> %f)
 132   %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y) ; here %f, which spans the call, is the FIRST operand and the call result the second
 133   ret <2 x double> %t
 134 }