libc/src/__support/FPUtil/generic/FMA.h

   1 //===-- Common header for FMA implementations -------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMA_H
  10 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMA_H
  11
  12 #include "src/__support/CPP/bit.h"
  13 #include "src/__support/CPP/limits.h"
  14 #include "src/__support/CPP/type_traits.h"
  15 #include "src/__support/FPUtil/BasicOperations.h"
  16 #include "src/__support/FPUtil/FPBits.h"
  17 #include "src/__support/FPUtil/cast.h"
  18 #include "src/__support/FPUtil/dyadic_float.h"
  19 #include "src/__support/FPUtil/rounding_mode.h"
  20 #include "src/__support/big_int.h"
  21 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
  22 #include "src/__support/macros/config.h"
  23 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
  24
  25 #include "hdr/fenv_macros.h"
  26
  27 namespace LIBC_NAMESPACE_DECL {
  28 namespace fputil {
  29 namespace generic {
  30
  31 template <typename OutType, typename InType>
  32 LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
  33                                  cpp::is_floating_point_v<InType> &&
  34                                  sizeof(OutType) <= sizeof(InType),
  35                              OutType>
  36 fma(InType x, InType y, InType z);
  37
  38 // TODO(lntue): Implement fmaf that is correctly rounded to all rounding modes.
  39 // The implementation below only is only correct for the default rounding mode,
  40 // round-to-nearest tie-to-even.
  41 template <> LIBC_INLINE float fma<float>(float x, float y, float z) {
  42   // Product is exact.
  43   double prod = static_cast<double>(x) * static_cast<double>(y);
  44   double z_d = static_cast<double>(z);
  45   double sum = prod + z_d;
  46   fputil::FPBits<double> bit_prod(prod), bitz(z_d), bit_sum(sum);
  47
  48   if (!(bit_sum.is_inf_or_nan() || bit_sum.is_zero())) {
  49     // Since the sum is computed in double precision, rounding might happen
  50     // (for instance, when bitz.exponent > bit_prod.exponent + 5, or
  51     // bit_prod.exponent > bitz.exponent + 40).  In that case, when we round
  52     // the sum back to float, double rounding error might occur.
  53     // A concrete example of this phenomenon is as follows:
  54     //   x = y = 1 + 2^(-12), z = 2^(-53)
  55     // The exact value of x*y + z is 1 + 2^(-11) + 2^(-24) + 2^(-53)
  56     // So when rounding to float, fmaf(x, y, z) = 1 + 2^(-11) + 2^(-23)
  57     // On the other hand, with the default rounding mode,
  58     //   double(x*y + z) = 1 + 2^(-11) + 2^(-24)
  59     // and casting again to float gives us:
  60     //   float(double(x*y + z)) = 1 + 2^(-11).
  61     //
  62     // In order to correct this possible double rounding error, first we use
  63     // Dekker's 2Sum algorithm to find t such that sum - t = prod + z exactly,
  64     // assuming the (default) rounding mode is round-to-the-nearest,
  65     // tie-to-even.  Moreover, t satisfies the condition that t < eps(sum),
  66     // i.e., t.exponent < sum.exponent - 52. So if t is not 0, meaning rounding
  67     // occurs when computing the sum, we just need to use t to adjust (any) last
  68     // bit of sum, so that the sticky bits used when rounding sum to float are
  69     // correct (when it matters).
  70     fputil::FPBits<double> t(
  71         (bit_prod.get_biased_exponent() >= bitz.get_biased_exponent())
  72             ? ((bit_sum.get_val() - bit_prod.get_val()) - bitz.get_val())
  73             : ((bit_sum.get_val() - bitz.get_val()) - bit_prod.get_val()));
  74
  75     // Update sticky bits if t != 0.0 and the least (52 - 23 - 1 = 28) bits are
  76     // zero.
  77     if (!t.is_zero() && ((bit_sum.get_mantissa() & 0xfff'ffffULL) == 0)) {
  78       if (bit_sum.sign() != t.sign())
  79         bit_sum.set_mantissa(bit_sum.get_mantissa() + 1);
  80       else if (bit_sum.get_mantissa())
  81         bit_sum.set_mantissa(bit_sum.get_mantissa() - 1);
  82     }
  83   }
  84
  85   return static_cast<float>(bit_sum.get_val());
  86 }
  87
  88 namespace internal {
  89
  90 // Extract the sticky bits and shift the `mantissa` to the right by
  91 // `shift_length`.
  92 template <typename T>
  93 LIBC_INLINE cpp::enable_if_t<is_unsigned_integral_or_big_int_v<T>, bool>
  94 shift_mantissa(int shift_length, T &mant) {
  95   if (shift_length >= cpp::numeric_limits<T>::digits) {
  96     mant = 0;
  97     return true; // prod_mant is non-zero.
  98   }
  99   T mask = (T(1) << shift_length) - 1;
 100   bool sticky_bits = (mant & mask) != 0;
 101   mant >>= shift_length;
 102   return sticky_bits;
 103 }
 104
 105 } // namespace internal
 106
 107 template <typename OutType, typename InType>
 108 LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
 109                                  cpp::is_floating_point_v<InType> &&
 110                                  sizeof(OutType) <= sizeof(InType),
 111                              OutType>
 112 fma(InType x, InType y, InType z) {
 113   using OutFPBits = FPBits<OutType>;
 114   using OutStorageType = typename OutFPBits::StorageType;
 115   using InFPBits = FPBits<InType>;
 116   using InStorageType = typename InFPBits::StorageType;
 117
 118   constexpr int IN_EXPLICIT_MANT_LEN = InFPBits::FRACTION_LEN + 1;
 119   constexpr size_t PROD_LEN = 2 * IN_EXPLICIT_MANT_LEN;
 120   constexpr size_t TMP_RESULT_LEN = cpp::bit_ceil(PROD_LEN + 1);
 121   using TmpResultType = UInt<TMP_RESULT_LEN>;
 122   using DyadicFloat = DyadicFloat<TMP_RESULT_LEN>;
 123
 124   InFPBits x_bits(x), y_bits(y), z_bits(z);
 125
 126   if (LIBC_UNLIKELY(x_bits.is_nan() || y_bits.is_nan() || z_bits.is_nan())) {
 127     if (x_bits.is_nan() || y_bits.is_nan()) {
 128       if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan() ||
 129           z_bits.is_signaling_nan())
 130         raise_except_if_required(FE_INVALID);
 131
 132       if (x_bits.is_quiet_nan()) {
 133         InStorageType x_payload = x_bits.get_mantissa();
 134         x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
 135         return OutFPBits::quiet_nan(x_bits.sign(),
 136                                     static_cast<OutStorageType>(x_payload))
 137             .get_val();
 138       }
 139
 140       if (y_bits.is_quiet_nan()) {
 141         InStorageType y_payload = y_bits.get_mantissa();
 142         y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
 143         return OutFPBits::quiet_nan(y_bits.sign(),
 144                                     static_cast<OutStorageType>(y_payload))
 145             .get_val();
 146       }
 147
 148       if (z_bits.is_quiet_nan()) {
 149         InStorageType z_payload = z_bits.get_mantissa();
 150         z_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
 151         return OutFPBits::quiet_nan(z_bits.sign(),
 152                                     static_cast<OutStorageType>(z_payload))
 153             .get_val();
 154       }
 155
 156       return OutFPBits::quiet_nan().get_val();
 157     }
 158   }
 159
 160   if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0))
 161     return cast<OutType>(x * y + z);
 162
 163   int x_exp = 0;
 164   int y_exp = 0;
 165   int z_exp = 0;
 166
 167   // Denormal scaling = 2^(fraction length).
 168   constexpr InStorageType IMPLICIT_MASK =
 169       InFPBits::SIG_MASK - InFPBits::FRACTION_MASK;
 170
 171   constexpr InType DENORMAL_SCALING =
 172       InFPBits::create_value(
 173           Sign::POS, InFPBits::FRACTION_LEN + InFPBits::EXP_BIAS, IMPLICIT_MASK)
 174           .get_val();
 175
 176   // Normalize denormal inputs.
 177   if (LIBC_UNLIKELY(InFPBits(x).is_subnormal())) {
 178     x_exp -= InFPBits::FRACTION_LEN;
 179     x *= DENORMAL_SCALING;
 180   }
 181   if (LIBC_UNLIKELY(InFPBits(y).is_subnormal())) {
 182     y_exp -= InFPBits::FRACTION_LEN;
 183     y *= DENORMAL_SCALING;
 184   }
 185   if (LIBC_UNLIKELY(InFPBits(z).is_subnormal())) {
 186     z_exp -= InFPBits::FRACTION_LEN;
 187     z *= DENORMAL_SCALING;
 188   }
 189
 190   x_bits = InFPBits(x);
 191   y_bits = InFPBits(y);
 192   z_bits = InFPBits(z);
 193   const Sign z_sign = z_bits.sign();
 194   Sign prod_sign = (x_bits.sign() == y_bits.sign()) ? Sign::POS : Sign::NEG;
 195   x_exp += x_bits.get_biased_exponent();
 196   y_exp += y_bits.get_biased_exponent();
 197   z_exp += z_bits.get_biased_exponent();
 198
 199   if (LIBC_UNLIKELY(x_exp == InFPBits::MAX_BIASED_EXPONENT ||
 200                     y_exp == InFPBits::MAX_BIASED_EXPONENT ||
 201                     z_exp == InFPBits::MAX_BIASED_EXPONENT))
 202     return cast<OutType>(x * y + z);
 203
 204   // Extract mantissa and append hidden leading bits.
 205   InStorageType x_mant = x_bits.get_explicit_mantissa();
 206   InStorageType y_mant = y_bits.get_explicit_mantissa();
 207   TmpResultType z_mant = z_bits.get_explicit_mantissa();
 208
 209   // If the exponent of the product x*y > the exponent of z, then no extra
 210   // precision beside the entire product x*y is needed.  On the other hand, when
 211   // the exponent of z >= the exponent of the product x*y, the worst-case that
 212   // we need extra precision is when there is cancellation and the most
 213   // significant bit of the product is aligned exactly with the second most
 214   // significant bit of z:
 215   //      z :    10aa...a
 216   // - prod :     1bb...bb....b
 217   // In that case, in order to store the exact result, we need at least
 218   //     (Length of prod) - (Fraction length of z)
 219   //   = 2*(Length of input explicit mantissa) - (Fraction length of z) bits.
 220   // Overall, before aligning the mantissas and exponents, we can simply left-
 221   // shift the mantissa of z by that amount.  After that, it is enough to align
 222   // the least significant bit, given that we keep track of the round and sticky
 223   // bits after the least significant bit.
 224
 225   TmpResultType prod_mant = TmpResultType(x_mant) * y_mant;
 226   int prod_lsb_exp =
 227       x_exp + y_exp - (InFPBits::EXP_BIAS + 2 * InFPBits::FRACTION_LEN);
 228
 229   constexpr int RESULT_MIN_LEN = PROD_LEN - InFPBits::FRACTION_LEN;
 230   z_mant <<= RESULT_MIN_LEN;
 231   int z_lsb_exp = z_exp - (InFPBits::FRACTION_LEN + RESULT_MIN_LEN);
 232   bool sticky_bits = false;
 233   bool z_shifted = false;
 234
 235   // Align exponents.
 236   if (prod_lsb_exp < z_lsb_exp) {
 237     sticky_bits = internal::shift_mantissa(z_lsb_exp - prod_lsb_exp, prod_mant);
 238     prod_lsb_exp = z_lsb_exp;
 239   } else if (z_lsb_exp < prod_lsb_exp) {
 240     z_shifted = true;
 241     sticky_bits = internal::shift_mantissa(prod_lsb_exp - z_lsb_exp, z_mant);
 242   }
 243
 244   // Perform the addition:
 245   //   (-1)^prod_sign * prod_mant + (-1)^z_sign * z_mant.
 246   // The final result will be stored in prod_sign and prod_mant.
 247   if (prod_sign == z_sign) {
 248     // Effectively an addition.
 249     prod_mant += z_mant;
 250   } else {
 251     // Subtraction cases.
 252     if (prod_mant >= z_mant) {
 253       if (z_shifted && sticky_bits) {
 254         // Add 1 more to the subtrahend so that the sticky bits remain
 255         // positive. This would simplify the rounding logic.
 256         ++z_mant;
 257       }
 258       prod_mant -= z_mant;
 259     } else {
 260       if (!z_shifted && sticky_bits) {
 261         // Add 1 more to the subtrahend so that the sticky bits remain
 262         // positive. This would simplify the rounding logic.
 263         ++prod_mant;
 264       }
 265       prod_mant = z_mant - prod_mant;
 266       prod_sign = z_sign;
 267     }
 268   }
 269
 270   if (prod_mant == 0) {
 271     // When there is exact cancellation, i.e., x*y == -z exactly, return -0.0 if
 272     // rounding downward and +0.0 for other rounding modes.
 273     if (quick_get_round() == FE_DOWNWARD)
 274       prod_sign = Sign::NEG;
 275     else
 276       prod_sign = Sign::POS;
 277   }
 278
 279   DyadicFloat result(prod_sign, prod_lsb_exp - InFPBits::EXP_BIAS, prod_mant);
 280   result.mantissa |= static_cast<unsigned int>(sticky_bits);
 281   return result.template as<OutType, /*ShouldSignalExceptions=*/true>();
 282 }
 283
 284 } // namespace generic
 285 } // namespace fputil
 286 } // namespace LIBC_NAMESPACE_DECL
 287
 288 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMA_H