lib/main/pico-sdk/rp2_common/pico_float/float_single_hazard3.S

   1 /*
   2  * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
   3  *
   4  * SPDX-License-Identifier: BSD-3-Clause
   5  */
   6
   7 #include "pico/asm_helper.S"
   8 #include "hardware/hazard3.h"
   9
  10 // This file reimplements some common single-precision soft float routines
  11 // from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
  12 // and is tuned for Hazard3 execution timings.
  13
  14 // Subnormal values are always flushed to zero on both input and output.
  15 // Rounding is always to nearest (even on tie).
  16
  17 pico_default_asm_setup
     // ^ Assembler set-up macro defined outside this file (presumably in
     //   pico/asm_helper.S -- its expansion is not visible here).
  18
     // float_section <name>: select the output section for routine <name>.
     // When PICO_FLOAT_IN_RAM is nonzero the section name is produced by
     // RAM_SECTION_NAME(name), otherwise by SECTION_NAME(name). In both cases
     // the section flags are "ax" (allocatable, executable). Neither naming
     // macro is defined in this file; confirm their expansion in the included
     // headers before relying on a particular section name.
  19 .macro float_section name
  20 #if PICO_FLOAT_IN_RAM
  21 .section RAM_SECTION_NAME(\name), "ax"
  22 #else
  23 .section SECTION_NAME(\name), "ax"
  24 #endif
  25 .endm
  26
  27 float_section __addsf3
     // __subsf3: a0 = x - y        __addsf3: a0 = x + y
     // In:      a0 = x, a1 = y (packed IEEE 754 binary32 bit patterns)
     // Out:     a0 = result (subnormals flushed to zero, round to nearest even)
     // Scratch: a1-a5, t0, t1; no stack use.
     //
     // This first part unpacks both operands and dispatches the special cases.
     // On the straight path it leaves:
     //   a2, a3 = biased exponents of x, y (both in 1..254)
     //   a4, a5 = two's complement significands of x, y: implicit one at
     //            bit 26, fraction in bits 25..3, three zero working bits
     //   t0     = 255 (exponent of inf/nan)
     //   t1     = exponent-difference cut-off used by addsf_core
  28 .global __subsf3
  29 .p2align 2
  30 __subsf3:
         // x - y is computed as x + (-y): flip the sign of y and fall through
  31     binvi a1, a1, 31
  32 .global __addsf3
  33 __addsf3:
  34     // Unpack exponent:
  35     h3.bextmi a2, a0, 23, 8
  36     h3.bextmi a3, a1, 23, 8
  37     // Flush-to-zero => 0 + y = y applies, including nan, with the sole
  38     // exception of y being subnormal (which also needs to be flushed)
  39     beqz a2, __addsf_return_y_flushed
  40     // No flushing is needed for the x + 0 = x case because we already know
  41     // x is nonzero (and not subnormal), so x can be returned as-is
  42     beqz a3, __addsf_return_x
  43     // Unpack significand, plus 3 extra zeroes for working space:
  44     slli a4, a0, 9
  45     slli a5, a1, 9
  46     // check nan/inf on input
         // (a4/a5 still hold the left-justified fractions here, which is what the
         // nan/inf handlers test to tell nan from inf)
  47     li t0, 255
  48     beq a2, t0, __addsf_x_nan_inf
  49     beq a3, t0, __addsf_y_nan_inf
  50     // (finish unpacking significand)
  51     srli a4, a4, 6
  52     srli a5, a5, 6
  53
  54     // If we're still on the straight path then we are adding two normal
  55     // values. Add implicit one (1.xx...xx000)
  56     bseti a4, a4, 23 + 3
  57     bseti a5, a5, 23 + 3
  58     // Negate if sign bit is set
  59     bgez a0, 1f
  60     neg a4, a4
  61 1:
  62     // (tuck this 16-bit here to avoid alignment penalty)
         // Exponent-difference cut-off: at or above this difference the smaller
         // operand is dropped and the larger one is returned unchanged. The
         // cut-off must be 26, not 25: with a difference of exactly 25, a larger
         // operand that is an exact power of two and a smaller operand of
         // opposite sign produce a result in the next binade down, where the
         // smaller operand can exceed half an ulp (1.0 - 1.5*2^-25 must round to
         // 1 - 2^-24, not 1.0). From 26 upwards the larger operand is always
         // the correctly rounded result. 26 still fits a 16-bit li encoding.
  63     li t1, 26
  64     bgez a1, 1f
  65     neg a5, a5
  66 1:
  67
         // Pick the expansion of addsf_core in which lhs has the larger exponent
         // (falls through to the x-is-lhs expansion when x's exponent >= y's)
  68     bltu a2, a3, __addsf_ye_gt_xe
  69
  70     // The main body is repeated twice with different register assignments.
  71     // lhs is the more-significant addend:
         //
         // addsf_core arguments (register names, except rhs_is_x):
         //   packed_lhs  must still hold the packed lhs if the early-out branch
         //               is taken; otherwise used as scratch
         //   packed_rhs  scratch, overwritten immediately
         //   sig_lhs     lhs significand, two's complement, implicit one at
         //               bit 26 with three zero working bits below the fraction
         //   sig_rhs     rhs significand, same format
         //   exp_lhs     lhs biased exponent (>= exp_rhs); becomes the result
         //               exponent
         //   exp_rhs     rhs biased exponent; reused as the result sign mask
         //   rhs_is_x    1 if rhs is x (lhs is returned via __addsf_return_y),
         //               0 if rhs is y (lhs is returned via __addsf_return_x)
         // Also expects t0 = 255 and t1 = exponent-difference cut-off. Every path
         // through the macro ends in ret or a branch; the result is left in a0.
  72 .macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
  73     sub \packed_rhs, \exp_lhs, \exp_rhs
  74     // If there is a large exponent difference then there is no effect on lhs
  75 .if \rhs_is_x
  76     bgeu \packed_rhs, t1, __addsf_return_y
  77 .else
  78     bgeu \packed_rhs, t1, __addsf_return_x
  79 .endif
  80     // Shift rhs down to correct relative significance
  81     sra \packed_lhs, \sig_rhs, \packed_rhs
  82     // Set sticky bit if ones were shifted out
         // (shift back up and compare: any difference means bits were lost)
  83     sll \packed_rhs, \packed_lhs, \packed_rhs
  84     sltu \packed_rhs, \packed_rhs, \sig_rhs
  85     or \packed_lhs, \packed_lhs, \packed_rhs
  86     // Add significands
  87     add \sig_lhs, \sig_lhs, \packed_lhs
  88     // Detect exact cancellation (may be beyond max normalisation shift; also
  89     // IEEE 754 requires +0 for exact cancellation, no matter input signs)
  90     beqz \sig_lhs, __addsf_return_0
  91     // Convert two's complement back to sign + magnitude
         // (exp_rhs is no longer needed as an exponent: it becomes 0 or -1
         // according to the sign of the sum)
  92     srai \exp_rhs, \sig_lhs, 31
  93     xor \sig_lhs, \sig_lhs, \exp_rhs
  94     sub \sig_lhs, \sig_lhs, \exp_rhs
  95     // Renormalise significand: bit 31 is now implicit one
  96     clz \packed_lhs, \sig_lhs
  97     sll \sig_lhs, \sig_lhs, \packed_lhs
  98     // Adjust exponent
         // (clz == 5 means the implicit one was still at bit 26: no change)
  99     addi \packed_lhs, \packed_lhs, -5
 100     sub \exp_lhs, \exp_lhs, \packed_lhs
 101
 102     // Round to nearest, even on tie (bias upward if above odd number)
 103     bexti \packed_lhs, \sig_lhs, 8
 104     addi \sig_lhs, \sig_lhs, 127
 105     add \sig_lhs, \sig_lhs, \packed_lhs
 106     // Exponent may increase by one due to rounding up from all-ones; this is
 107     // detected by clearing of implicit one (there is a carry-out too)
 108     bgez \sig_lhs, 3f
 109 4:
 110     // Detect underflow/overflow
         // (unsigned compare: catches exponent >= 255 and negative exponents)
 111     bgeu \exp_lhs, t0, 1f
         // A biased exponent of exactly 0 is also an underflow: packing it would
         // emit a subnormal bit pattern (with the wrong value, since the implicit
         // one is dropped), but results must be flushed to zero.
         beqz \exp_lhs, 5f
 112
 113     // Pack and return
         // (packh puts the low byte of the sign mask above the exponent byte;
         // after the shift only its bottom bit survives, as the sign in bit 31)
 114     packh \exp_lhs, \exp_lhs, \exp_rhs
 115     slli \exp_lhs, \exp_lhs, 23
 116     slli \sig_lhs, \sig_lhs, 1
 117     srli \sig_lhs, \sig_lhs, 9
 118     add a0, \sig_lhs, \exp_lhs
 119     ret
 120 1:
 121     bgez \exp_lhs, 2f
     5:
 122     // Signed zero on underflow
 123     slli a0, \exp_rhs, 31
 124     ret
 125 2:
 126     // Signed infinity on overflow
 127     packh a0, t0, \exp_rhs
 128     slli a0, a0, 23
 129     ret
 130 3:
 131     // Exponent increase due to rounding (uncommon)
 132     srli \sig_lhs, \sig_lhs, 1
 133     addi \exp_lhs, \exp_lhs, 1
 134     j 4b
 135 .endm
 136
 137 __addsf_xe_gte_ye:
         // Reached by fall-through from the exponent comparison in __addsf3:
         // x's exponent is >= y's, so x is lhs (a0/a4/a2) and y is rhs (a1/a5/a3).
         // Every path through addsf_core ends in ret or a branch, so execution
         // never runs off the end of this expansion into the next one.
 138     addsf_core a0, a1, a4, a5, a2, a3, 0
 139 .p2align 2
 140 __addsf_ye_gt_xe:
         // y's exponent is strictly greater: same core with the roles of x and y
         // swapped (y is lhs, x is rhs).
 141     addsf_core a1, a0, a5, a4, a3, a2, 1
 142
 143 __addsf_x_nan_inf:
         // Entered from __addsf3 when x's exponent field is 255. At that point:
         //   a0, a1 = packed x, y      a3 = y's exponent      t0 = 255
         //   a4, a5 = x << 9, y << 9 (nonzero iff the fraction field is nonzero)
 144     // When at least one operand is nan, we must propagate at least one of
 145     // those nan payloads (sign of nan result is unspecified, which we take
 146     // advantage of by implementing x - y as x + -y). Check x nan vs inf:
 147     bnez a4, __addsf_return_x
 148 __addsf_x_inf:
 149     // If x is +-inf, need to distinguish the following cases:
 150     bne  a3, t0, __addsf_return_x // y is neither inf nor nan   -> return x (propagate inf)
 151     bnez a5,     __addsf_return_y // y is nan:                  -> return y (propagate nan)
         // Both operands are inf: a5 = 1 if their sign bits differ, else 0
 152     xor a5, a0, a1
 153     srli a5, a5, 31
 154     beqz a5,     __addsf_return_x // y is inf of same sign      -> return either x or y (x is faster)
         // The generated nan is the all-ones pattern 0xffffffff
 155     li a0, -1                     // y is inf of different sign -> return nan
 156     ret
 157
 158 __addsf_y_nan_inf:
         // Entered from __addsf3 when y's exponent field is 255 and x's is not
         // (x's exponent is tested first). Register state: a0, a1 = packed x, y;
         // a2 = x's exponent; t0 = 255; a4, a5 = x << 9, y << 9.
 159     // Mirror of __addsf_x_nan_inf
         // y is nan (nonzero fraction) -> propagate y
 160     bnez a5, __addsf_return_y
 161 __addsf_y_inf:
         // y is inf. When entered via __addsf3, x's exponent is known not to be
         // 255, so this branch is always taken and y (the inf) is returned. The
         // instructions after it are only reachable if something jumps directly
         // to __addsf_y_inf; no such jump is visible in this file.
 162     bne  a2, t0, __addsf_return_y
         // x is nan -> propagate x
 163     bnez a4,     __addsf_return_x
         // Both inf: same sign -> return x, opposite signs -> all-ones nan
 164     xor a4, a0, a1
 165     srli a4, a4, 31
 166     beqz a4,     __addsf_return_x
 167     li a0, -1
 168     ret
 169
 170 __addsf_return_y_flushed:
         // Shared return tails for __addsf3/__subsf3.
         // Entered when x's exponent field is 0 (x is zero or subnormal, i.e.
         // treated as zero): the result is y, except that a y with exponent
         // field 0 must itself be flushed to a signed zero.
 171     bnez a3, 1f
         // y's exponent is 0: clear the 23 fraction bits, keeping only the sign
 172     srli a1, a1, 23
 173     slli a1, a1, 23
 174 1:
 175 __addsf_return_y:
         // Return y (a1), then fall into the plain return
 176     mv a0, a1
 177 __addsf_return_x:
         // Return x: it is already in a0
 178     ret
 179 __addsf_return_0:
         // Return +0.0 (branch target for exact cancellation in addsf_core)
 180     li a0, 0
 181     ret
 182
 183
 184 float_section __mulsf3
     // __mulsf3: a0 = x * y
     // In:      a0 = x, a1 = y (packed IEEE 754 binary32 bit patterns)
     // Out:     a0 = result (subnormals flushed to zero, round to nearest even)
     // Scratch: a1-a6, t0; no stack use.
     // Zero/subnormal and inf/nan inputs are dispatched to the __mulsf_x_0,
     // __mulsf_y_0, __mulsf_x_nan_inf and __mulsf_y_nan_inf handlers with
     // a0 = x (sign already merged with y's sign), a1 = y, a2/a3 = exponents of
     // x/y and t0 = 255.
 185 .global __mulsf3
 186 .p2align 2
 187 __mulsf3:
 188     // Force y to be positive (by possibly negating x) *before* unpacking.
 189     // This allows many special cases to be handled without repacking.
         // (Only x's sign is changed; bit 31 of a1 itself is left as it was.)
 190     bgez a1, 1f
 191     binvi a0, a0, 31
 192 1:
 193     // Unpack exponent:
 194     h3.bextmi a2, a0, 23, 8
 195     h3.bextmi a3, a1, 23, 8
 196     // Check special cases
 197     li t0, 255
 198     beqz a2, __mulsf_x_0
 199     beqz a3, __mulsf_y_0
 200     beq a2, t0, __mulsf_x_nan_inf
 201     beq a3, t0, __mulsf_y_nan_inf
 202
 203     // Finish unpacking sign
         // (a6 = 0 for a positive result, -1 for a negative result)
 204     srai a6, a0, 31
 205     // Unpack significand (with implicit one in MSB)
 206     slli a4, a0, 8
 207     slli a5, a1, 8
 208     bseti a4, a4, 31
 209     bseti a5, a5, 31
 210     // Get full 64-bit multiply result in a4:a1 (one cycle each half)
 211     // Going from Q1.23 to Q2.46 (both left-justified)
 212     mul a1, a4, a5
 213     mulhu a4, a4, a5
 214     // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
 215     // final significand (ignoring rounding)
         // (The bit shifted out of the top of a1 is not moved into a4; it lands
         // below the rounding bit and is covered by the sticky gather below.)
 216     clz a0, a4
 217     sll a4, a4, a0
 218     sub a2, a2, a0
 219     // After normalising we can calculate the exponent of the unrounded
 220     // product. Rounding up from an all-ones significand can still increase
         // it by one (e.g. 0x3ffffffe * 0x3f800001 rounds up to exactly 2.0);
         // that case is detected after the rounding step below.
 221     add a2, a2, a3
 222     // Subtract redundant bias term (127), add 1 for normalisation correction
 223     addi a2, a2, -126
 224     blez a2, __mulsf_underflow
 225     bge a2, t0, __mulsf_overflow
 226
 227     // Gather sticky bits from low fraction:
 228     snez a1, a1
 229     or a4, a4, a1
 230     // Round to nearest, even on tie (aka bias upward if odd)
 231     bexti a1, a4, 8
 232     add a4, a4, a1
 233     addi a4, a4, 127
         // The implicit one (bit 31) is cleared only if the rounding increment
         // carried out of the significand: the exponent must then go up by one
         bgez a4, 2f
     1:
 234     // Pack it and ship it
         // (packh puts the low byte of the sign mask above the exponent byte;
         // after the shift only its bottom bit survives, as the sign in bit 31)
 235     packh a2, a2, a6
 236     slli a2, a2, 23
 237     slli a4, a4, 1
 238     srli a4, a4, 9
 239     add a0, a4, a2
 240     ret
     2:
         // Exponent increase due to rounding (uncommon). After the carry a4 is
         // at most 127, so the packing sequence above yields a zero fraction
         // without any further adjustment of a4. The exponent was >= 1 before
         // the increment, so only overflow needs to be re-checked.
         addi a2, a2, 1
         bge a2, t0, __mulsf_overflow
         j 1b
 241
 242 __mulsf_underflow:
 243     // Signed zero
 244     slli a0, a6, 31
 245     ret
 246 __mulsf_overflow:
 247     // Signed inf
 248     packh a0, t0, a6
 249     slli a0, a0, 23
 250     ret
 251
 252 __mulsf_x_0:
         // Entered from __mulsf3 when x's exponent field is 0 (zero or
         // subnormal, i.e. treated as zero). Register state for this handler
         // and for __mulsf_y_0: a0 = x with the sign of y already merged in,
         // a1 = y, a2/a3 = exponents of x/y, t0 = 255.
 253     // 0 times nan    -> propagate nan
 254     // 0 times inf    -> generate nan
 255     // 0 times others -> 0 (need to flush significand too as we are FTZ)
 256     bne a3, t0, __mulsf_return_flushed_x
         // y is inf or nan: a5 is nonzero iff y's fraction is nonzero (nan)
 257     slli a5, a1, 9
 258     beqz a5, 1f
 259     // Propagate nan from y
         // (__mulsf_return_y is also a branch target of the nan/inf handlers;
         // it returns a1 exactly as passed in, original sign bit included)
 260 __mulsf_return_y:
 261     mv a0, a1
 262     ret
 263 1:
 264     // Generate new nan
         // (the all-ones pattern 0xffffffff)
 265     li a0, -1
 266     ret
 267
 268 __mulsf_y_0:
         // Entered when y's exponent field is 0 and x's is not.
 269     // Mirror image of x_0 except we still return x for signed 0, since the
 270     // signs were already resolved.
 271     bne a2, t0, __mulsf_return_flushed_x
         // x is inf or nan: a nonzero fraction means nan, which is returned as
         // is (in a0); inf times 0 generates the all-ones nan instead
 272     slli a1, a0, 9
 273     bnez a1, 1f
 274     li a0, -1
 275 1:
 276     ret
 277
 278 __mulsf_return_flushed_x:
 279     // If we don't support subnormals we at least need to flush to a canonical
 280     // zero. This is just a sign bit in bit 31.
 281     srli a0, a0, 31
 282     slli a0, a0, 31
 283 __mulsf_return_x:
         // Plain return of a0 (branch target of __mulsf_x_nan_inf)
 284     ret
 285
 286 __mulsf_x_nan_inf:
         // Entered from __mulsf3 when x's exponent field is 255 and neither
         // exponent field is 0. a0 = x with the sign of y merged in, a1 = y,
         // a3 = y's exponent, t0 = 255.
 287     // We know that y is not zero and is positive. So...
 288     //      x is nan    -> return x
 289     // else y is nan    -> return y
 290     // else y is inf    -> return x
 291     // else y is normal -> return x
 292     // (the order of the first two clauses is actually our free choice)
         // a4 / a5 are nonzero iff the fraction of x / y is nonzero (nan)
 293     slli a4, a0, 9
 294     bnez a4, __mulsf_return_x
 295     bne a3, t0, __mulsf_return_x
 296     slli a5, a1, 9
 297     bnez a5, __mulsf_return_y
 298     ret // return x
 299
 300 __mulsf_y_nan_inf:
         // Entered when y's exponent field is 255 and x's is neither 0 nor 255.
 301     // We know that x is not zero, nan, nor inf. That just leaves normals.
 302     // y is nan -> return y
 303     // y is inf -> return inf * sgn(x) (since we already merged the signs)
 304     slli a5, a1, 9
 305     bnez a5, __mulsf_return_y
         // Build the infinity: sign mask (0 or -1) from a0, exponent byte 255
         // from t0, zero fraction
 306     srai a0, a0, 31
 307     packh a0, t0, a0
 308     slli a0, a0, 23
 309     ret
 310
 311
 312 // This is a hack to improve soft float performance for the routines we don't
 313 // implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
     //
     // In:  a0 = 32-bit value      Out: a0 = number of leading zero bits
     //
     // The libgcc helper that such code calls is named __clzsi2. The symbol
     // originally defined here, __clz2si, is a transposition of that name, so
     // nothing would ever call it and the override would have no effect. The
     // correctly spelt symbol is therefore exported as well; the old spelling
     // is kept so that any existing reference still links.
     // NOTE(review): confirm no other object in the link also defines
     // __clzsi2, otherwise this will be a duplicate-symbol error.
 314 float_section __clz2si
 315 .global __clz2si
     .global __clzsi2
 316 __clz2si:
     __clzsi2:
 317     clz a0, a0
 318     ret