2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
4 * SPDX-License-Identifier: BSD-3-Clause
7 #include "pico/asm_helper.S"
8 #include "hardware/hazard3.h"
10 // This file reimplements some common single-precision soft float routines
11 // from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
12 // and is tuned for Hazard3 execution timings.
14 // Subnormal values are always flushed to zero on both input and output.
15 // Rounding is always to nearest (even on tie).
17 pico_default_asm_setup
19 .macro float_section name
21 .section RAM_SECTION_NAME(\name), "ax"
23 .section SECTION_NAME(\name), "ax"
27 float_section __addsf3
35 h3.bextmi a2, a0, 23, 8
36 h3.bextmi a3, a1, 23, 8
37 // Flush-to-zero => 0 + y = y applies, including nan, with the sole
38 // exception of y being subnormal (which also needs to be flushed)
39 beqz a2, __addsf_return_y_flushed
40 // Don't have to handle this case for x + 0 = 0 because we already know x
42 beqz a3, __addsf_return_x
43 // Unpack significand, plus 3 extra zeroes for working space:
46 // check nan/inf on input
48 beq a2, t0, __addsf_x_nan_inf
49 beq a3, t0, __addsf_y_nan_inf
50 // (finish unpacking significand)
54 // If we're still on the straight path then we are adding two normal
55 // values. Add implicit one (1.xx...xx000)
58 // Negate if sign bit is set
62 // (tuck this 16-bit here to avoid alignment penalty)
68 bltu a2, a3, __addsf_ye_gt_xe
70 // The main body is repeated twice with different register assignments.
71 // lhs is the more-significant addend:
72 .macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
73 sub \packed_rhs, \exp_lhs, \exp_rhs
74 // If there is a large exponent difference then there is no effect on lhs
76 bgeu \packed_rhs, t1, __addsf_return_y
78 bgeu \packed_rhs, t1, __addsf_return_x
80 // Shift rhs down to correct relative significance
81 sra \packed_lhs, \sig_rhs, \packed_rhs
82 // Set sticky bit if ones were shifted out
83 sll \packed_rhs, \packed_lhs, \packed_rhs
84 sltu \packed_rhs, \packed_rhs, \sig_rhs
85 or \packed_lhs, \packed_lhs, \packed_rhs
87 add \sig_lhs, \sig_lhs, \packed_lhs
88 // Detect exact cancellation (may be beyond max normalisation shift; also
89 // IEEE 754 requires +0 for exact cancellation, no matter input signs)
90 beqz \sig_lhs, __addsf_return_0
91 // Convert two's complement back to sign + magnitude
92 srai \exp_rhs, \sig_lhs, 31
93 xor \sig_lhs, \sig_lhs, \exp_rhs
94 sub \sig_lhs, \sig_lhs, \exp_rhs
95 // Renormalise significand: bit 31 is now implicit one
96 clz \packed_lhs, \sig_lhs
97 sll \sig_lhs, \sig_lhs, \packed_lhs
99 addi \packed_lhs, \packed_lhs, -5
100 sub \exp_lhs, \exp_lhs, \packed_lhs
102 // Round to nearest, even on tie (bias upward if above odd number)
103 bexti \packed_lhs, \sig_lhs, 8
104 addi \sig_lhs, \sig_lhs, 127
105 add \sig_lhs, \sig_lhs, \packed_lhs
106 // Exponent may increase by one due to rounding up from all-ones; this is
107 // detected by clearing of implicit one (there is a carry-out too)
110 // Detect underflow/overflow
111 bgeu \exp_lhs, t0, 1f
114 packh \exp_lhs, \exp_lhs, \exp_rhs
115 slli \exp_lhs, \exp_lhs, 23
116 slli \sig_lhs, \sig_lhs, 1
117 srli \sig_lhs, \sig_lhs, 9
118 add a0, \sig_lhs, \exp_lhs
122 // Signed zero on underflow
123 slli a0, \exp_rhs, 31
126 // Signed infinity on overflow
127 packh a0, t0, \exp_rhs
131 // Exponent increase due to rounding (uncommon)
132 srli \sig_lhs, \sig_lhs, 1
133 addi \exp_lhs, \exp_lhs, 1
138 addsf_core a0, a1, a4, a5, a2, a3, 0
141 addsf_core a1, a0, a5, a4, a3, a2, 1
144 // When at least one operand is nan, we must propagate at least one of
145 // those nan payloads (sign of nan result is unspecified, which we take
146 // advantage of by implementing x - y as x + -y). Check x nan vs inf:
147 bnez a4, __addsf_return_x
149 // If x is +-inf, need to distinguish the following cases:
150 bne a3, t0, __addsf_return_x // y is neither inf nor nan -> return x (propagate inf)
151 bnez a5, __addsf_return_y // y is nan: -> return y (propagate nan)
154 beqz a5, __addsf_return_x // y is inf of same sign -> return either x or y (x is faster)
155 li a0, -1 // y is inf of different sign -> return nan
159 // Mirror of __addsf_x_nan_inf
160 bnez a5, __addsf_return_y
162 bne a2, t0, __addsf_return_y
163 bnez a4, __addsf_return_x
166 beqz a4, __addsf_return_x
170 __addsf_return_y_flushed:
184 float_section __mulsf3
188 // Force y to be positive (by possibly negating x) *before* unpacking.
189 // This allows many special cases to be handled without repacking.
194 h3.bextmi a2, a0, 23, 8
195 h3.bextmi a3, a1, 23, 8
196 // Check special cases
200 beq a2, t0, __mulsf_x_nan_inf
201 beq a3, t0, __mulsf_y_nan_inf
203 // Finish unpacking sign
205 // Unpack significand (with implicit one in MSB)
210 // Get full 64-bit multiply result in a4:a1 (one cycle each half)
211 // Going from Q1.23 to Q2.46 (both left-justified)
214 // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
215 // final significand (ignoring rounding)
219 // After normalising we can calculate the final exponent, since rounding
220 // cannot increase the exponent for multiplication (unlike addition)
222 // Subtract redundant bias term (127), add 1 for normalisation correction
224 blez a2, __mulsf_underflow
225 bge a2, t0, __mulsf_overflow
227 // Gather sticky bits from low fraction:
230 // Round to nearest, even on tie (aka bias upward if odd)
234 // Pack it and ship it
253 // 0 times nan -> propagate nan
254 // 0 times inf -> generate nan
255 // 0 times others -> 0 (need to flush significand too as we are FTZ)
256 bne a3, t0, __mulsf_return_flushed_x
259 // Propagate nan from y
269 // Mirror image of x_0 except we still return x for signed 0, since the
270 // signs were already resolved.
271 bne a2, t0, __mulsf_return_flushed_x
278 __mulsf_return_flushed_x:
279 // If we don't support subnormals we at least need to flush to a canonical
280 // zero. This is just a sign bit in bit 31.
287 // We know that y is not zero and is positive. So...
288 // x is nan -> return x
289 // else y is nan -> return y
290 // else y is inf -> return x
291 // else y is normal -> return x
292 // (the order of the first two clauses is actually our free choice)
294 bnez a4, __mulsf_return_x
295 bne a3, t0, __mulsf_return_x
297 bnez a5, __mulsf_return_y
301 // We know that x is not zero, nan, nor inf. That just leaves normals.
302 // y is nan -> return y
303 // y is inf -> return inf * sgn(x) (since we already merged the signs)
305 bnez a5, __mulsf_return_y
312 // This is a hack to improve soft float performance for the routines we don't
313 // implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
314 float_section __clz2si