2 ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
3 ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST
4 ; Check latencies of vmul/vfma accumulate chains.
; Test1: scalar float multiply-accumulate chain. The directives below match
; the machine-scheduler debug output and pin the latencies reported between
; the multiply, the two accumulates, and the final move of the result.
6 define arm_aapcs_vfpcc float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
7 ; CHECK: ********** MI Scheduling **********
11 ; > VMULS common latency = 5
15 ; > VMULS read-advanced latency to VMLAS = 0
16 ; CHECK-SAME: Latency=0
18 ; CHECK-DEFAULT: VMLAS
20 ; > VMLAS common latency = 9
24 ; > VMLAS read-advanced latency to the next VMLAS = 4
25 ; CHECK-SAME: Latency=4
27 ; CHECK-DEFAULT: VMLAS
32 ; > VMLAS not-optimized latency to VMOVRS = 9
33 ; CHECK-SAME: Latency=9
35 ; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
36 %mul1 = fmul float %f1, %f2
37 %mul2 = fmul float %f3, %f4
38 %mul3 = fmul float %f5, %f6
39 %add1 = fadd float %mul1, %mul2
40 %add2 = fadd float %add1, %mul3
; Test2: the same multiply-accumulate chain as a <2 x float> vector
; computation. The directives below pin the scheduler-reported latencies
; for the vector multiply and vector accumulate instructions.
45 define arm_aapcs_vfpcc <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
46 ; CHECK: ********** MI Scheduling **********
50 ; > VMULfd common latency = 5
54 ; > VMULfd read-advanced latency to VMLAfd = 0
55 ; CHECK-SAME: Latency=0
57 ; CHECK-DEFAULT: VMLAfd
59 ; > VMLAfd common latency = 9
63 ; > VMLAfd read-advanced latency to the next VMLAfd = 4
64 ; CHECK-SAME: Latency=4
66 ; CHECK-DEFAULT: VMLAfd
71 ; > VMLAfd not-optimized latency to VMOVRRD = 9
72 ; CHECK-SAME: Latency=9
74 ; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULfd, VMLAfd, VMLAfd
75 %mul1 = fmul <2 x float> %f1, %f2
76 %mul2 = fmul <2 x float> %f3, %f4
77 %mul3 = fmul <2 x float> %f5, %f6
78 %add1 = fadd <2 x float> %mul1, %mul2
79 %add2 = fadd <2 x float> %add1, %mul3
; Test3: scalar float multiply-subtract chain. The directives below pin the
; scheduler-reported latencies between the multiply, the two multiply-subtract
; instructions, and the final move of the result.
83 define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
84 ; CHECK: ********** MI Scheduling **********
88 ; > VMULS common latency = 5
92 ; > VMULS read-advanced latency to VMLSS = 0
93 ; CHECK-SAME: Latency=0
95 ; CHECK-DEFAULT: VMLSS
97 ; > VMLSS common latency = 9
101 ; > VMLSS read-advanced latency to the next VMLSS = 4
102 ; CHECK-SAME: Latency=4
104 ; CHECK-DEFAULT: VMLSS
109 ; > VMLSS not-optimized latency to VMOVRS = 9
110 ; CHECK-SAME: Latency=9
112 ; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULS, VMLSS, VMLSS
113 %mul1 = fmul float %f1, %f2
114 %mul2 = fmul float %f3, %f4
115 %mul3 = fmul float %f5, %f6
116 %sub1 = fsub float %mul1, %mul2
117 %sub2 = fsub float %sub1, %mul3
; Test4: the multiply-subtract chain as a <2 x float> vector computation.
; The directives below pin the scheduler-reported latencies for the vector
; multiply and vector multiply-subtract instructions.
122 define arm_aapcs_vfpcc <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
123 ; CHECK: ********** MI Scheduling **********
127 ; > VMULfd common latency = 5
131 ; > VMULfd read-advanced latency to VMLSfd = 0
132 ; CHECK-SAME: Latency=0
134 ; CHECK-DEFAULT: VMLSfd
136 ; > VMLSfd common latency = 9
140 ; > VMLSfd read-advanced latency to the next VMLSfd = 4
141 ; CHECK-SAME: Latency=4
143 ; CHECK-DEFAULT: VMLSfd
148 ; > VMLSfd not-optimized latency to VMOVRRD = 9
149 ; CHECK-SAME: Latency=9
151 ; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULfd, VMLSfd, VMLSfd
152 %mul1 = fmul <2 x float> %f1, %f2
153 %mul2 = fmul <2 x float> %f3, %f4
154 %mul3 = fmul <2 x float> %f5, %f6
155 %sub1 = fsub <2 x float> %mul1, %mul2
156 %sub2 = fsub <2 x float> %sub1, %mul3
157 ret <2 x float> %sub2
; Test5: a single scalar multiply followed by a subtraction of a third
; operand. The directives below pin the scheduler-reported latency from the
; resulting instruction to the move of the result.
160 define arm_aapcs_vfpcc float @Test5(float %f1, float %f2, float %f3) {
161 ; CHECK: ********** MI Scheduling **********
164 ; CHECK-DEFAULT: VNMLS
169 ; > VNMLS/VFNMS not-optimized latency to VMOVRS = 9
170 ; CHECK-SAME: Latency=9
172 ; f1 * f2 - f3 ==> VNMLS/VFNMS
173 %mul = fmul float %f1, %f2
174 %sub = fsub float %mul, %f3
; Test6: a scalar multiply whose product is negated and then has a further
; operand subtracted from it. The directives below pin the scheduler-reported
; latency from the resulting instruction to the move of the result.
179 define arm_aapcs_vfpcc float @Test6(float %f1, float %f2, float %f3) {
180 ; CHECK: ********** MI Scheduling **********
183 ; CHECK-DEFAULT: VNMLA
188 ; > VNMLA/VFNMA not-optimized latency to VMOVRS = 9
189 ; CHECK-SAME: Latency=9
191 ; -(f1 * f2) - f2 ==> VNMLA/VFNMA
; NOTE(review): %f3 is never used; the final subtraction takes %f2. Confirm
; whether %f3 was intended before changing the IR, since the expected
; scheduler output depends on it.
192 %mul = fmul float %f1, %f2
193 %sub1 = fsub float -0.0, %mul
194 %sub2 = fsub float %sub1, %f2