VEX/priv/host_generic_maddf.c

   1
   2 /*---------------------------------------------------------------*/
   3 /*--- begin                              host_generic_maddf.c ---*/
   4 /*---------------------------------------------------------------*/
   5
   6 /*
   7    Compute x * y + z as ternary operation.
   8    Copyright (C) 2010-2017 Free Software Foundation, Inc.
   9    This file is part of the GNU C Library.
  10    Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.
  11
  12    The GNU C Library is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU Lesser General Public
  14    License as published by the Free Software Foundation; either
  15    version 2.1 of the License, or (at your option) any later version.
  16
  17    The GNU C Library is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License along with the GNU C Library; if not, see
  24    <http://www.gnu.org/licenses/>.
  25 */
  26
  27 /* Generic helper functions for doing FMA, i.e. compute x * y + z
  28    as ternary operation.
  29    These are purely back-end entities and cannot be seen/referenced
  30    from IR. */
  31
  32 #include "libvex_basictypes.h"
  33 #include "host_generic_maddf.h"
  34 #include "main_util.h"
  35
  36 /* This implementation relies on Double being more than twice as
  37    precise as Float and uses rounding to odd in order to avoid problems
  38    with double rounding.
  39    See a paper by Boldo and Melquiond:
  40    http://www.lri.fr/~melquion/doc/08-tc.pdf  */
  41
  42 #define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
  43
  44 #if defined(__x86_64__) && defined(__SSE2_MATH__)
  45 # define ENV_TYPE unsigned int
  46 /* Save current rounding mode into ENV, hold exceptions, set rounding
  47    mode to rounding toward zero.  */
  48 # define ROUNDTOZERO(env) \
  49    do {                                                 \
  50       unsigned int mxcsr;                               \
  51       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
  52       (env) = mxcsr;                                    \
  53       mxcsr = (mxcsr | 0x7f80) & ~0x3f;                 \
  54       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
  55    } while (0)
  56 /* Restore exceptions from ENV, return if inexact exception has been raised
  57    since ROUNDTOZERO.  */
  58 # define RESET_TESTINEXACT(env) \
  59    ({                                                   \
  60       unsigned int mxcsr, ret;                          \
  61       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
  62       ret = (mxcsr >> 5) & 1;                           \
  63       mxcsr = (mxcsr & 0x3d) | (env);                   \
  64       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
  65       ret;                                              \
  66    })
  67 /* Return if inexact exception has been raised since ROUNDTOZERO.  */
  68 # define TESTINEXACT() \
  69    ({                                                   \
  70       unsigned int mxcsr;                               \
  71       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
  72       (mxcsr >> 5) & 1;                                 \
  73    })
  74 #endif
  75
  76 #define DBL_MANT_DIG 53
  77 #define IEEE754_DOUBLE_BIAS 0x3ff
  78
  79 union vg_ieee754_double {
  80    Double d;
  81
  82    /* This is the IEEE 754 double-precision format.  */
  83    struct {
  84 #ifdef VKI_BIG_ENDIAN
  85       unsigned int negative:1;
  86       unsigned int exponent:11;
  87       unsigned int mantissa0:20;
  88       unsigned int mantissa1:32;
  89 #else
  90       unsigned int mantissa1:32;
  91       unsigned int mantissa0:20;
  92       unsigned int exponent:11;
  93       unsigned int negative:1;
  94 #endif
  95    } ieee;
  96 };
  97
  98 void VEX_REGPARM(3)
  99      h_generic_calc_MAddF32 ( /*OUT*/Float* res,
 100                                Float* argX, Float* argY, Float* argZ )
 101 {
 102 #ifndef ENV_TYPE
 103    /* Lame fallback implementation.  */
 104    *res = *argX * *argY + *argZ;
 105 #else
 106    ENV_TYPE env;
 107    /* Multiplication is always exact.  */
 108    Double temp = (Double) *argX * (Double) *argY;
 109    union vg_ieee754_double u;
 110
 111    ROUNDTOZERO (env);
 112
 113    /* Perform addition with round to odd.  */
 114    u.d = temp + (Double) *argZ;
 115    /* Ensure the addition is not scheduled after fetestexcept call.  */
 116    FORCE_EVAL (u.d);
 117
 118    /* Reset rounding mode and test for inexact simultaneously.  */
 119    int j = RESET_TESTINEXACT (env);
 120
 121    if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
 122       u.ieee.mantissa1 |= j;
 123
 124    /* And finally truncation with round to nearest.  */
 125    *res = (Float) u.d;
 126 #endif
 127 }
 128
 129
 130 void VEX_REGPARM(3)
 131      h_generic_calc_MAddF64 ( /*OUT*/Double* res,
 132                                Double* argX, Double* argY, Double* argZ )
 133 {
 134 #ifndef ENV_TYPE
 135    /* Lame fallback implementation.  */
 136    *res = *argX * *argY + *argZ;
 137 #else
 138    Double x = *argX, y = *argY, z = *argZ;
 139    union vg_ieee754_double u, v, w;
 140    int adjust = 0;
 141    u.d = x;
 142    v.d = y;
 143    w.d = z;
 144    if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
 145                  >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
 146        || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
 147        || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
 148        || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
 149        || UNLIKELY (u.ieee.exponent + v.ieee.exponent
 150                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
 151       /* If z is Inf, but x and y are finite, the result should be
 152          z rather than NaN.  */
 153       if (w.ieee.exponent == 0x7ff
 154           && u.ieee.exponent != 0x7ff
 155           && v.ieee.exponent != 0x7ff) {
 156          *res = (z + x) + y;
 157          return;
 158       }
 159       /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
 160          or if x * y is less than half of DBL_DENORM_MIN,
 161          compute as x * y + z.  */
 162       if (u.ieee.exponent == 0x7ff
 163           || v.ieee.exponent == 0x7ff
 164           || w.ieee.exponent == 0x7ff
 165           || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
 166           || u.ieee.exponent + v.ieee.exponent
 167              < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
 168          *res = x * y + z;
 169          return;
 170       }
 171       if (u.ieee.exponent + v.ieee.exponent
 172           >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
 173          /* Compute 1p-53 times smaller result and multiply
 174             at the end.  */
 175          if (u.ieee.exponent > v.ieee.exponent)
 176             u.ieee.exponent -= DBL_MANT_DIG;
 177          else
 178             v.ieee.exponent -= DBL_MANT_DIG;
 179          /* If x + y exponent is very large and z exponent is very small,
 180             it doesn't matter if we don't adjust it.  */
 181          if (w.ieee.exponent > DBL_MANT_DIG)
 182             w.ieee.exponent -= DBL_MANT_DIG;
 183          adjust = 1;
 184       } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
 185          /* Similarly.
 186             If z exponent is very large and x and y exponents are
 187             very small, it doesn't matter if we don't adjust it.  */
 188          if (u.ieee.exponent > v.ieee.exponent) {
 189             if (u.ieee.exponent > DBL_MANT_DIG)
 190                u.ieee.exponent -= DBL_MANT_DIG;
 191          } else if (v.ieee.exponent > DBL_MANT_DIG)
 192             v.ieee.exponent -= DBL_MANT_DIG;
 193          w.ieee.exponent -= DBL_MANT_DIG;
 194          adjust = 1;
 195       } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
 196          u.ieee.exponent -= DBL_MANT_DIG;
 197          if (v.ieee.exponent)
 198             v.ieee.exponent += DBL_MANT_DIG;
 199          else
 200             v.d *= 0x1p53;
 201       } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
 202          v.ieee.exponent -= DBL_MANT_DIG;
 203          if (u.ieee.exponent)
 204             u.ieee.exponent += DBL_MANT_DIG;
 205          else
 206             u.d *= 0x1p53;
 207       } else /* if (u.ieee.exponent + v.ieee.exponent
 208                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
 209          if (u.ieee.exponent > v.ieee.exponent)
 210             u.ieee.exponent += 2 * DBL_MANT_DIG;
 211          else
 212             v.ieee.exponent += 2 * DBL_MANT_DIG;
 213          if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
 214             if (w.ieee.exponent)
 215                w.ieee.exponent += 2 * DBL_MANT_DIG;
 216             else
 217                w.d *= 0x1p106;
 218             adjust = -1;
 219          }
 220          /* Otherwise x * y should just affect inexact
 221             and nothing else.  */
 222       }
 223       x = u.d;
 224       y = v.d;
 225       z = w.d;
 226    }
 227    /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
 228 #  define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
 229    Double x1 = x * C;
 230    Double y1 = y * C;
 231    Double m1 = x * y;
 232    x1 = (x - x1) + x1;
 233    y1 = (y - y1) + y1;
 234    Double x2 = x - x1;
 235    Double y2 = y - y1;
 236    Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
 237 #  undef C
 238
 239    /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
 240    Double a1 = z + m1;
 241    Double t1 = a1 - z;
 242    Double t2 = a1 - t1;
 243    t1 = m1 - t1;
 244    t2 = z - t2;
 245    Double a2 = t1 + t2;
 246
 247    ENV_TYPE env;
 248    ROUNDTOZERO (env);
 249
 250    /* Perform m2 + a2 addition with round to odd.  */
 251    u.d = a2 + m2;
 252
 253    if (UNLIKELY (adjust < 0)) {
 254       if ((u.ieee.mantissa1 & 1) == 0)
 255          u.ieee.mantissa1 |= TESTINEXACT ();
 256       v.d = a1 + u.d;
 257       /* Ensure the addition is not scheduled after fetestexcept call.  */
 258       FORCE_EVAL (v.d);
 259    }
 260
 261    /* Reset rounding mode and test for inexact simultaneously.  */
 262    int j = RESET_TESTINEXACT (env) != 0;
 263
 264    if (LIKELY (adjust == 0)) {
 265       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
 266          u.ieee.mantissa1 |= j;
 267       /* Result is a1 + u.d.  */
 268       *res = a1 + u.d;
 269    } else if (LIKELY (adjust > 0)) {
 270       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
 271          u.ieee.mantissa1 |= j;
 272       /* Result is a1 + u.d, scaled up.  */
 273       *res = (a1 + u.d) * 0x1p53;
 274    } else {
 275       /* If a1 + u.d is exact, the only rounding happens during
 276          scaling down.  */
 277       if (j == 0) {
 278          *res = v.d * 0x1p-106;
 279          return;
 280       }
 281       /* If result rounded to zero is not subnormal, no double
 282          rounding will occur.  */
 283       if (v.ieee.exponent > 106) {
 284          *res = (a1 + u.d) * 0x1p-106;
 285          return;
 286       }
 287       /* If v.d * 0x1p-106 with round to zero is a subnormal above
 288          or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
 289          down just by 1 bit, which means v.ieee.mantissa1 |= j would
 290          change the round bit, not sticky or guard bit.
 291          v.d * 0x1p-106 never normalizes by shifting up,
 292          so round bit plus sticky bit should be already enough
 293          for proper rounding.  */
 294       if (v.ieee.exponent == 106) {
 295          /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
 296             v.ieee.mantissa1 & 1 is the round bit and j is our sticky
 297             bit.  In round-to-nearest 001 rounds down like 00,
 298             011 rounds up, even though 01 rounds down (thus we need
 299             to adjust), 101 rounds down like 10 and 111 rounds up
 300             like 11.  */
 301          if ((v.ieee.mantissa1 & 3) == 1) {
 302             v.d *= 0x1p-106;
 303             if (v.ieee.negative)
 304                *res = v.d - 0x1p-1074;
 305             else
 306                *res = v.d + 0x1p-1074;
 307          } else
 308             *res = v.d * 0x1p-106;
 309          return;
 310       }
 311       v.ieee.mantissa1 |= j;
 312       *res = v.d * 0x1p-106;
 313       return;
 314     }
 315 #endif
 316 }
 317
 318 /*---------------------------------------------------------------*/
 319 /*--- end                                 host_generic_maddf.c --*/
 320 /*---------------------------------------------------------------*/