libc/src/__support/FPUtil/dyadic_float.h

   1 //===-- A class to store high precision floating point numbers --*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H
  10 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H
  11
  12 #include "FEnvImpl.h"
  13 #include "FPBits.h"
  14 #include "hdr/errno_macros.h"
  15 #include "hdr/fenv_macros.h"
  16 #include "multiply_add.h"
  17 #include "rounding_mode.h"
  18 #include "src/__support/CPP/type_traits.h"
  19 #include "src/__support/big_int.h"
  20 #include "src/__support/macros/config.h"
  21 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
  22 #include "src/__support/macros/properties/types.h"
  23
  24 #include <stddef.h>
  25
  26 namespace LIBC_NAMESPACE_DECL {
  27 namespace fputil {
  28
  29 // A generic class to perform computations of high precision floating points.
  30 // We store the value in dyadic format, including 3 fields:
  31 //   sign    : boolean value - false means positive, true means negative
  32 //   exponent: the exponent value of the least significant bit of the mantissa.
  33 //   mantissa: unsigned integer of length `Bits`.
  34 // So the real value that is stored is:
  35 //   real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer)
  36 // The stored data is normal if for non-zero mantissa, the leading bit is 1.
  37 // The outputs of the constructors and most functions will be normalized.
  38 // To simplify and improve the efficiency, many functions will assume that the
  39 // inputs are normal.
  40 template <size_t Bits> struct DyadicFloat {
  41   using MantissaType = LIBC_NAMESPACE::UInt<Bits>;
  42
  43   Sign sign = Sign::POS;
  44   int exponent = 0;
  45   MantissaType mantissa = MantissaType(0);
  46
  47   LIBC_INLINE constexpr DyadicFloat() = default;
  48
  49   template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
  50   LIBC_INLINE constexpr DyadicFloat(T x) {
  51     static_assert(FPBits<T>::FRACTION_LEN < Bits);
  52     FPBits<T> x_bits(x);
  53     sign = x_bits.sign();
  54     exponent = x_bits.get_explicit_exponent() - FPBits<T>::FRACTION_LEN;
  55     mantissa = MantissaType(x_bits.get_explicit_mantissa());
  56     normalize();
  57   }
  58
  59   LIBC_INLINE constexpr DyadicFloat(Sign s, int e, MantissaType m)
  60       : sign(s), exponent(e), mantissa(m) {
  61     normalize();
  62   }
  63
  64   // Normalizing the mantissa, bringing the leading 1 bit to the most
  65   // significant bit.
  66   LIBC_INLINE constexpr DyadicFloat &normalize() {
  67     if (!mantissa.is_zero()) {
  68       int shift_length = cpp::countl_zero(mantissa);
  69       exponent -= shift_length;
  70       mantissa <<= static_cast<size_t>(shift_length);
  71     }
  72     return *this;
  73   }
  74
  75   // Used for aligning exponents.  Output might not be normalized.
  76   LIBC_INLINE constexpr DyadicFloat &shift_left(unsigned shift_length) {
  77     if (shift_length < Bits) {
  78       exponent -= static_cast<int>(shift_length);
  79       mantissa <<= shift_length;
  80     } else {
  81       exponent = 0;
  82       mantissa = MantissaType(0);
  83     }
  84     return *this;
  85   }
  86
  87   // Used for aligning exponents.  Output might not be normalized.
  88   LIBC_INLINE constexpr DyadicFloat &shift_right(unsigned shift_length) {
  89     if (shift_length < Bits) {
  90       exponent += static_cast<int>(shift_length);
  91       mantissa >>= shift_length;
  92     } else {
  93       exponent = 0;
  94       mantissa = MantissaType(0);
  95     }
  96     return *this;
  97   }
  98
  99   // Assume that it is already normalized.  Output the unbiased exponent.
 100   LIBC_INLINE constexpr int get_unbiased_exponent() const {
 101     return exponent + (Bits - 1);
 102   }
 103
 104 #ifdef LIBC_TYPES_HAS_FLOAT16
 105   template <typename T, bool ShouldSignalExceptions>
 106   LIBC_INLINE constexpr cpp::enable_if_t<
 107       cpp::is_floating_point_v<T> && (FPBits<T>::FRACTION_LEN < Bits), T>
 108   generic_as() const {
 109     using FPBits = FPBits<float16>;
 110     using StorageType = typename FPBits::StorageType;
 111
 112     constexpr int EXTRA_FRACTION_LEN = Bits - 1 - FPBits::FRACTION_LEN;
 113
 114     if (mantissa == 0)
 115       return FPBits::zero(sign).get_val();
 116
 117     int unbiased_exp = get_unbiased_exponent();
 118
 119     if (unbiased_exp + FPBits::EXP_BIAS >= FPBits::MAX_BIASED_EXPONENT) {
 120       if constexpr (ShouldSignalExceptions) {
 121         set_errno_if_required(ERANGE);
 122         raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
 123       }
 124
 125       switch (quick_get_round()) {
 126       case FE_TONEAREST:
 127         return FPBits::inf(sign).get_val();
 128       case FE_TOWARDZERO:
 129         return FPBits::max_normal(sign).get_val();
 130       case FE_DOWNWARD:
 131         if (sign.is_pos())
 132           return FPBits::max_normal(Sign::POS).get_val();
 133         return FPBits::inf(Sign::NEG).get_val();
 134       case FE_UPWARD:
 135         if (sign.is_neg())
 136           return FPBits::max_normal(Sign::NEG).get_val();
 137         return FPBits::inf(Sign::POS).get_val();
 138       default:
 139         __builtin_unreachable();
 140       }
 141     }
 142
 143     StorageType out_biased_exp = 0;
 144     StorageType out_mantissa = 0;
 145     bool round = false;
 146     bool sticky = false;
 147     bool underflow = false;
 148
 149     if (unbiased_exp < -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
 150       sticky = true;
 151       underflow = true;
 152     } else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
 153       round = true;
 154       MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1;
 155       sticky = (mantissa & sticky_mask) != 0;
 156     } else {
 157       int extra_fraction_len = EXTRA_FRACTION_LEN;
 158
 159       if (unbiased_exp < 1 - FPBits::EXP_BIAS) {
 160         underflow = true;
 161         extra_fraction_len += 1 - FPBits::EXP_BIAS - unbiased_exp;
 162       } else {
 163         out_biased_exp =
 164             static_cast<StorageType>(unbiased_exp + FPBits::EXP_BIAS);
 165       }
 166
 167       MantissaType round_mask = MantissaType(1) << (extra_fraction_len - 1);
 168       round = (mantissa & round_mask) != 0;
 169       MantissaType sticky_mask = round_mask - 1;
 170       sticky = (mantissa & sticky_mask) != 0;
 171
 172       out_mantissa = static_cast<StorageType>(mantissa >> extra_fraction_len);
 173     }
 174
 175     bool lsb = (out_mantissa & 1) != 0;
 176
 177     StorageType result =
 178         FPBits::create_value(sign, out_biased_exp, out_mantissa).uintval();
 179
 180     switch (quick_get_round()) {
 181     case FE_TONEAREST:
 182       if (round && (lsb || sticky))
 183         ++result;
 184       break;
 185     case FE_DOWNWARD:
 186       if (sign.is_neg() && (round || sticky))
 187         ++result;
 188       break;
 189     case FE_UPWARD:
 190       if (sign.is_pos() && (round || sticky))
 191         ++result;
 192       break;
 193     default:
 194       break;
 195     }
 196
 197     if (ShouldSignalExceptions && (round || sticky)) {
 198       int excepts = FE_INEXACT;
 199       if (FPBits(result).is_inf()) {
 200         set_errno_if_required(ERANGE);
 201         excepts |= FE_OVERFLOW;
 202       } else if (underflow) {
 203         set_errno_if_required(ERANGE);
 204         excepts |= FE_UNDERFLOW;
 205       }
 206       raise_except_if_required(excepts);
 207     }
 208
 209     return FPBits(result).get_val();
 210   }
 211 #endif // LIBC_TYPES_HAS_FLOAT16
 212
 213   template <typename T, bool ShouldSignalExceptions,
 214             typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
 215                                             (FPBits<T>::FRACTION_LEN < Bits),
 216                                         void>>
 217   LIBC_INLINE constexpr T fast_as() const {
 218     if (LIBC_UNLIKELY(mantissa.is_zero()))
 219       return FPBits<T>::zero(sign).get_val();
 220
 221     // Assume that it is normalized, and output is also normal.
 222     constexpr uint32_t PRECISION = FPBits<T>::FRACTION_LEN + 1;
 223     using output_bits_t = typename FPBits<T>::StorageType;
 224     constexpr output_bits_t IMPLICIT_MASK =
 225         FPBits<T>::SIG_MASK - FPBits<T>::FRACTION_MASK;
 226
 227     int exp_hi = exponent + static_cast<int>((Bits - 1) + FPBits<T>::EXP_BIAS);
 228
 229     if (LIBC_UNLIKELY(exp_hi > 2 * FPBits<T>::EXP_BIAS)) {
 230       // Results overflow.
 231       T d_hi =
 232           FPBits<T>::create_value(sign, 2 * FPBits<T>::EXP_BIAS, IMPLICIT_MASK)
 233               .get_val();
 234       // volatile prevents constant propagation that would result in infinity
 235       // always being returned no matter the current rounding mode.
 236       volatile T two = static_cast<T>(2.0);
 237       T r = two * d_hi;
 238
 239       // TODO: Whether rounding down the absolute value to max_normal should
 240       // also raise FE_OVERFLOW and set ERANGE is debatable.
 241       if (ShouldSignalExceptions && FPBits<T>(r).is_inf())
 242         set_errno_if_required(ERANGE);
 243
 244       return r;
 245     }
 246
 247     bool denorm = false;
 248     uint32_t shift = Bits - PRECISION;
 249     if (LIBC_UNLIKELY(exp_hi <= 0)) {
 250       // Output is denormal.
 251       denorm = true;
 252       shift = (Bits - PRECISION) + static_cast<uint32_t>(1 - exp_hi);
 253
 254       exp_hi = FPBits<T>::EXP_BIAS;
 255     }
 256
 257     int exp_lo = exp_hi - static_cast<int>(PRECISION) - 1;
 258
 259     MantissaType m_hi =
 260         shift >= MantissaType::BITS ? MantissaType(0) : mantissa >> shift;
 261
 262     T d_hi = FPBits<T>::create_value(
 263                  sign, static_cast<output_bits_t>(exp_hi),
 264                  (static_cast<output_bits_t>(m_hi) & FPBits<T>::SIG_MASK) |
 265                      IMPLICIT_MASK)
 266                  .get_val();
 267
 268     MantissaType round_mask =
 269         shift > MantissaType::BITS ? 0 : MantissaType(1) << (shift - 1);
 270     MantissaType sticky_mask = round_mask - MantissaType(1);
 271
 272     bool round_bit = !(mantissa & round_mask).is_zero();
 273     bool sticky_bit = !(mantissa & sticky_mask).is_zero();
 274     int round_and_sticky = int(round_bit) * 2 + int(sticky_bit);
 275
 276     T d_lo;
 277
 278     if (LIBC_UNLIKELY(exp_lo <= 0)) {
 279       // d_lo is denormal, but the output is normal.
 280       int scale_up_exponent = 1 - exp_lo;
 281       T scale_up_factor =
 282           FPBits<T>::create_value(Sign::POS,
 283                                   static_cast<output_bits_t>(
 284                                       FPBits<T>::EXP_BIAS + scale_up_exponent),
 285                                   IMPLICIT_MASK)
 286               .get_val();
 287       T scale_down_factor =
 288           FPBits<T>::create_value(Sign::POS,
 289                                   static_cast<output_bits_t>(
 290                                       FPBits<T>::EXP_BIAS - scale_up_exponent),
 291                                   IMPLICIT_MASK)
 292               .get_val();
 293
 294       d_lo = FPBits<T>::create_value(
 295                  sign, static_cast<output_bits_t>(exp_lo + scale_up_exponent),
 296                  IMPLICIT_MASK)
 297                  .get_val();
 298
 299       return multiply_add(d_lo, T(round_and_sticky), d_hi * scale_up_factor) *
 300              scale_down_factor;
 301     }
 302
 303     d_lo = FPBits<T>::create_value(sign, static_cast<output_bits_t>(exp_lo),
 304                                    IMPLICIT_MASK)
 305                .get_val();
 306
 307     // Still correct without FMA instructions if `d_lo` is not underflow.
 308     T r = multiply_add(d_lo, T(round_and_sticky), d_hi);
 309
 310     if (LIBC_UNLIKELY(denorm)) {
 311       // Exponent before rounding is in denormal range, simply clear the
 312       // exponent field.
 313       output_bits_t clear_exp = static_cast<output_bits_t>(
 314           output_bits_t(exp_hi) << FPBits<T>::SIG_LEN);
 315       output_bits_t r_bits = FPBits<T>(r).uintval() - clear_exp;
 316
 317       if (!(r_bits & FPBits<T>::EXP_MASK)) {
 318         // Output is denormal after rounding, clear the implicit bit for 80-bit
 319         // long double.
 320         r_bits -= IMPLICIT_MASK;
 321
 322         // TODO: IEEE Std 754-2019 lets implementers choose whether to check for
 323         // "tininess" before or after rounding for base-2 formats, as long as
 324         // the same choice is made for all operations. Our choice to check after
 325         // rounding might not be the same as the hardware's.
 326         if (ShouldSignalExceptions && round_and_sticky) {
 327           set_errno_if_required(ERANGE);
 328           raise_except_if_required(FE_UNDERFLOW);
 329         }
 330       }
 331
 332       return FPBits<T>(r_bits).get_val();
 333     }
 334
 335     return r;
 336   }
 337
 338   // Assume that it is already normalized.
 339   // Output is rounded correctly with respect to the current rounding mode.
 340   template <typename T, bool ShouldSignalExceptions,
 341             typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
 342                                             (FPBits<T>::FRACTION_LEN < Bits),
 343                                         void>>
 344   LIBC_INLINE constexpr T as() const {
 345 #if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION)
 346     if constexpr (cpp::is_same_v<T, float16>)
 347       return generic_as<T, ShouldSignalExceptions>();
 348 #endif
 349     return fast_as<T, ShouldSignalExceptions>();
 350   }
 351
 352   template <typename T,
 353             typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
 354                                             (FPBits<T>::FRACTION_LEN < Bits),
 355                                         void>>
 356   LIBC_INLINE explicit constexpr operator T() const {
 357     return as<T, /*ShouldSignalExceptions=*/false>();
 358   }
 359
 360   LIBC_INLINE constexpr MantissaType as_mantissa_type() const {
 361     if (mantissa.is_zero())
 362       return 0;
 363
 364     MantissaType new_mant = mantissa;
 365     if (exponent > 0) {
 366       new_mant <<= exponent;
 367     } else {
 368       new_mant >>= (-exponent);
 369     }
 370
 371     if (sign.is_neg()) {
 372       new_mant = (~new_mant) + 1;
 373     }
 374
 375     return new_mant;
 376   }
 377 };
 378
 379 // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the
 380 // output:
 381 //   - Align the exponents so that:
 382 //     new a.exponent = new b.exponent = max(a.exponent, b.exponent)
 383 //   - Add or subtract the mantissas depending on the signs.
 384 //   - Normalize the result.
 385 // The absolute errors compared to the mathematical sum is bounded by:
 386 //   | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2),
 387 // i.e., errors are up to 2 ULPs.
 388 // Assume inputs are normalized (by constructors or other functions) so that we
 389 // don't need to normalize the inputs again in this function.  If the inputs are
 390 // not normalized, the results might lose precision significantly.
 391 template <size_t Bits>
 392 LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
 393                                                   DyadicFloat<Bits> b) {
 394   if (LIBC_UNLIKELY(a.mantissa.is_zero()))
 395     return b;
 396   if (LIBC_UNLIKELY(b.mantissa.is_zero()))
 397     return a;
 398
 399   // Align exponents
 400   if (a.exponent > b.exponent)
 401     b.shift_right(static_cast<unsigned>(a.exponent - b.exponent));
 402   else if (b.exponent > a.exponent)
 403     a.shift_right(static_cast<unsigned>(b.exponent - a.exponent));
 404
 405   DyadicFloat<Bits> result;
 406
 407   if (a.sign == b.sign) {
 408     // Addition
 409     result.sign = a.sign;
 410     result.exponent = a.exponent;
 411     result.mantissa = a.mantissa;
 412     if (result.mantissa.add_overflow(b.mantissa)) {
 413       // Mantissa addition overflow.
 414       result.shift_right(1);
 415       result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] |=
 416           (uint64_t(1) << 63);
 417     }
 418     // Result is already normalized.
 419     return result;
 420   }
 421
 422   // Subtraction
 423   if (a.mantissa >= b.mantissa) {
 424     result.sign = a.sign;
 425     result.exponent = a.exponent;
 426     result.mantissa = a.mantissa - b.mantissa;
 427   } else {
 428     result.sign = b.sign;
 429     result.exponent = b.exponent;
 430     result.mantissa = b.mantissa - a.mantissa;
 431   }
 432
 433   return result.normalize();
 434 }
 435
 436 // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
 437 // floats with rounding toward 0 and then normalize the output:
 438 //   result.exponent = a.exponent + b.exponent + Bits,
 439 //   result.mantissa = quick_mul_hi(a.mantissa + b.mantissa)
 440 //                   ~ (full product a.mantissa * b.mantissa) >> Bits.
 441 // The errors compared to the mathematical product is bounded by:
 442 //   2 * errors of quick_mul_hi = 2 * (UInt<Bits>::WORD_COUNT - 1) in ULPs.
 443 // Assume inputs are normalized (by constructors or other functions) so that we
 444 // don't need to normalize the inputs again in this function.  If the inputs are
 445 // not normalized, the results might lose precision significantly.
 446 template <size_t Bits>
 447 LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
 448                                                   const DyadicFloat<Bits> &b) {
 449   DyadicFloat<Bits> result;
 450   result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
 451   result.exponent = a.exponent + b.exponent + static_cast<int>(Bits);
 452
 453   if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) {
 454     result.mantissa = a.mantissa.quick_mul_hi(b.mantissa);
 455     // Check the leading bit directly, should be faster than using clz in
 456     // normalize().
 457     if (result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] >>
 458             63 ==
 459         0)
 460       result.shift_left(1);
 461   } else {
 462     result.mantissa = (typename DyadicFloat<Bits>::MantissaType)(0);
 463   }
 464   return result;
 465 }
 466
 467 // Simple polynomial approximation.
 468 template <size_t Bits>
 469 LIBC_INLINE constexpr DyadicFloat<Bits>
 470 multiply_add(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b,
 471              const DyadicFloat<Bits> &c) {
 472   return quick_add(c, quick_mul(a, b));
 473 }
 474
 475 // Simple exponentiation implementation for printf. Only handles positive
 476 // exponents, since division isn't implemented.
 477 template <size_t Bits>
 478 LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(const DyadicFloat<Bits> &a,
 479                                               uint32_t power) {
 480   DyadicFloat<Bits> result = 1.0;
 481   DyadicFloat<Bits> cur_power = a;
 482
 483   while (power > 0) {
 484     if ((power % 2) > 0) {
 485       result = quick_mul(result, cur_power);
 486     }
 487     power = power >> 1;
 488     cur_power = quick_mul(cur_power, cur_power);
 489   }
 490   return result;
 491 }
 492
 493 template <size_t Bits>
 494 LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(const DyadicFloat<Bits> &a,
 495                                                   int32_t pow_2) {
 496   DyadicFloat<Bits> result = a;
 497   result.exponent += pow_2;
 498   return result;
 499 }
 500
 501 } // namespace fputil
 502 } // namespace LIBC_NAMESPACE_DECL
 503
 504 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H