; Run llc with -print-failed-fuse-candidates under the basic register
; allocator and feed both stdout and stderr to FileCheck. The redirection is
; spelled `2>&1 |` rather than the bash-only `|&` so the line does not depend
; on which shell executes it.
1 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
; External helper functions (declared only, no body in view). Each test
; function visible in this file calls one of them with its incoming argument.
5 declare float @test_f(float %f)
6 declare double @test_d(double %f)
7 declare <4 x float> @test_vf(<4 x float> %f)
8 declare <2 x double> @test_vd(<2 x double> %f)
; Generic LLVM square-root intrinsics for scalar float and double.
9 declare float @llvm.sqrt.f32(float)
10 declare double @llvm.sqrt.f64(double)
; x86 SSE / SSE3 intrinsics operating on <4 x float>.
12 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
13 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>)
14 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
15 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
16 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
17 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
18 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
19 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
20 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
; x86 SSE2 / SSE3 intrinsics operating on <2 x double>.
21 declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>)
22 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
23 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
24 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
25 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
26 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
27 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
; foo: scalar float case. %f is passed to @test_f and then used again as the
; operand of llvm.sqrt.f32, so %f is live across the call.
; NOTE(review): presumably the point is that the reload of %f can be folded
; into the sqrt; the rest of the body is not visible here -- confirm.
29 define float @foo(float %f) {
30 %a = call float @test_f(float %f)
31 %t = call float @llvm.sqrt.f32(float %f)
; doo: scalar double case. %f is passed to @test_d and then used again as the
; operand of llvm.sqrt.f64, so %f is live across the call.
34 define double @doo(double %f) {
35 %a = call double @test_d(double %f)
36 %t = call double @llvm.sqrt.f64(double %f)
; a0: unary <4 x float> case using sse.rsqrt.ps. The operand is the original
; argument %f, which is also passed to @test_vf first and so is live across
; that call.
39 define <4 x float> @a0(<4 x float> %f) {
40 %a = call <4 x float> @test_vf(<4 x float> %f)
41 %t = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %f)
; a1: unary <4 x float> case using sse.sqrt.ps. The operand is the original
; argument %f, which is also passed to @test_vf first and so is live across
; that call.
44 define <4 x float> @a1(<4 x float> %f) {
45 %a = call <4 x float> @test_vf(<4 x float> %f)
46 %t = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %f)
; a2: unary <4 x float> case using sse.rcp.ps. The operand is the original
; argument %f, which is also passed to @test_vf first and so is live across
; that call.
49 define <4 x float> @a2(<4 x float> %f) {
50 %a = call <4 x float> @test_vf(<4 x float> %f)
51 %t = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %f)
; b3: binary <4 x float> case using sse.min.ps. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vf.
54 define <4 x float> @b3(<4 x float> %f) {
55 %y = call <4 x float> @test_vf(<4 x float> %f)
56 %t = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %y, <4 x float> %f)
; b4: binary <4 x float> case using sse.max.ps. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vf.
59 define <4 x float> @b4(<4 x float> %f) {
60 %y = call <4 x float> @test_vf(<4 x float> %f)
61 %t = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %y, <4 x float> %f)
; b5: binary <4 x float> case using sse.cmp.ps with the i8 operand set to 7.
; First operand is the call result %y; second operand is the original
; argument %f, which is live across the call to @test_vf.
64 define <4 x float> @b5(<4 x float> %f) {
65 %y = call <4 x float> @test_vf(<4 x float> %f)
66 %t = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %y, <4 x float> %f, i8 7)
; b6: binary <4 x float> case using sse3.addsub.ps. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vf.
69 define <4 x float> @b6(<4 x float> %f) {
70 %y = call <4 x float> @test_vf(<4 x float> %f)
71 %t = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %y, <4 x float> %f)
; b7: binary <4 x float> case using sse3.hadd.ps. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vf.
74 define <4 x float> @b7(<4 x float> %f) {
75 %y = call <4 x float> @test_vf(<4 x float> %f)
76 %t = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %y, <4 x float> %f)
; b8: binary <4 x float> case using sse3.hsub.ps. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vf.
79 define <4 x float> @b8(<4 x float> %f) {
80 %y = call <4 x float> @test_vf(<4 x float> %f)
81 %t = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %y, <4 x float> %f)
; c1: unary <2 x double> case using sse2.sqrt.pd. The operand is the original
; argument %f, which is also passed to @test_vd first and so is live across
; that call.
84 define <2 x double> @c1(<2 x double> %f) {
85 %a = call <2 x double> @test_vd(<2 x double> %f)
86 %t = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %f)
; d3: binary <2 x double> case using sse2.min.pd. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vd.
89 define <2 x double> @d3(<2 x double> %f) {
90 %y = call <2 x double> @test_vd(<2 x double> %f)
91 %t = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %y, <2 x double> %f)
; d4: binary <2 x double> case using sse2.max.pd. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vd.
94 define <2 x double> @d4(<2 x double> %f) {
95 %y = call <2 x double> @test_vd(<2 x double> %f)
96 %t = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %y, <2 x double> %f)
; d5: binary <2 x double> case using sse2.cmp.pd with the i8 operand set to 7.
; First operand is the call result %y; second operand is the original
; argument %f, which is live across the call to @test_vd.
99 define <2 x double> @d5(<2 x double> %f) {
100 %y = call <2 x double> @test_vd(<2 x double> %f)
101 %t = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %y, <2 x double> %f, i8 7)
; d6: binary <2 x double> case using sse3.addsub.pd. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vd.
104 define <2 x double> @d6(<2 x double> %f) {
105 %y = call <2 x double> @test_vd(<2 x double> %f)
106 %t = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %y, <2 x double> %f)
; d7: binary <2 x double> case using sse3.hadd.pd. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vd.
109 define <2 x double> @d7(<2 x double> %f) {
110 %y = call <2 x double> @test_vd(<2 x double> %f)
111 %t = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %y, <2 x double> %f)
; d8: binary <2 x double> case using sse3.hsub.pd. First operand is the call
; result %y; second operand is the original argument %f, which is live across
; the call to @test_vd.
114 define <2 x double> @d8(<2 x double> %f) {
115 %y = call <2 x double> @test_vd(<2 x double> %f)
116 %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %y, <2 x double> %f)
120 ; This one should fail to fuse, but -regalloc=greedy isn't even trying. Instead
123 ; movapd (%rsp), %xmm1 # 16-byte Reload
124 ; hsubpd %xmm0, %xmm1
125 ; movapd %xmm1, %xmm0
128 ; RABasic still tries to fold this one.
; z0: uses sse3.hsub.pd like @d8 but with the operands swapped: the original
; argument %f (live across the call to @test_vd) is the FIRST operand and the
; call result %y is the second.
130 define <2 x double> @z0(<2 x double> %f) {
131 %y = call <2 x double> @test_vd(<2 x double> %f)
132 %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)