fpu/softfloat.cc

   1 /*============================================================================
   2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   3 Package, Release 2b.
   4
   5 Written by John R. Hauser.  This work was made possible in part by the
   6 International Computer Science Institute, located at Suite 600, 1947 Center
   7 Street, Berkeley, California 94704.  Funding was partially provided by the
   8 National Science Foundation under grant MIP-9311980.  The original version
   9 of this code was written as part of a project to build a fixed-point vector
  10 processor in collaboration with the University of California at Berkeley,
  11 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  13 arithmetic/SoftFloat.html'.
  14
  15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  17 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  23
  24 Derivative works are acceptable, even for commercial purposes, so long as
  25 (1) the source code for the derivative work includes prominent notice that
  26 the work is derivative, and (2) the source code includes prominent notice with
  27 these four paragraphs for those parts of this code that are retained.
  28 =============================================================================*/
  29
  30 #define FLOAT128
  31
  32 /*============================================================================
  33  * Adapted for Bochs (x86 achitecture simulator) by
  34  *            Stanislav Shwartsman [sshwarts at sourceforge net]
  35  * ==========================================================================*/
  36
  37 #include "softfloat.h"
  38 #include "softfloat-round-pack.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations. (Can be specialized to target
  43 | if desired).
  44 *----------------------------------------------------------------------------*/
  45 #define USE_estimateDiv128To64
  46 #define USE_estimateSqrt32
  47 #include "softfloat-macros.h"
  48
  49 /*----------------------------------------------------------------------------
  50 | Functions and definitions to determine:  (1) whether tininess for underflow
  51 | is detected before or after rounding by default, (2) what (if anything)
  52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  54 | are propagated from function inputs to output.  These details are target-
  55 | specific.
  56 *----------------------------------------------------------------------------*/
  57 #include "softfloat-specialize.h"
  58
  59 /*----------------------------------------------------------------------------
  60 | Returns the result of converting the 32-bit two's complement integer `a'
  61 | to the single-precision floating-point format.  The conversion is performed
  62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  63 *----------------------------------------------------------------------------*/
  64
  65 float32 int32_to_float32(Bit32s a, float_status_t &status)
  66 {
  67     if (a == 0) return 0;
  68     if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
  69     int zSign = (a < 0);
  70     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
  71 }
  72
  73 /*----------------------------------------------------------------------------
  74 | Returns the result of converting the 32-bit two's complement integer `a'
  75 | to the double-precision floating-point format.  The conversion is performed
  76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  77 *----------------------------------------------------------------------------*/
  78
  79 float64 int32_to_float64(Bit32s a)
  80 {
  81     if (a == 0) return 0;
  82     int zSign = (a < 0);
  83     Bit32u absA = zSign ? -a : a;
  84     int shiftCount = countLeadingZeros32(absA) + 21;
  85     Bit64u zSig = absA;
  86     return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
  87 }
  88
  89 /*----------------------------------------------------------------------------
  90 | Returns the result of converting the 64-bit two's complement integer `a'
  91 | to the single-precision floating-point format.  The conversion is performed
  92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  93 *----------------------------------------------------------------------------*/
  94
  95 float32 int64_to_float32(Bit64s a, float_status_t &status)
  96 {
  97     if (a == 0) return 0;
  98     int zSign = (a < 0);
  99     Bit64u absA = zSign ? -a : a;
 100     int shiftCount = countLeadingZeros64(absA) - 40;
 101     if (0 <= shiftCount) {
 102         return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
 103     }
 104     else {
 105         shiftCount += 7;
 106         if (shiftCount < 0) {
 107             absA = shift64RightJamming(absA, -shiftCount);
 108         }
 109         else {
 110             absA <<= shiftCount;
 111         }
 112         return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
 113     }
 114 }
 115
 116 /*----------------------------------------------------------------------------
 117 | Returns the result of converting the 64-bit two's complement integer `a'
 118 | to the double-precision floating-point format.  The conversion is performed
 119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 120 *----------------------------------------------------------------------------*/
 121
 122 float64 int64_to_float64(Bit64s a, float_status_t &status)
 123 {
 124     if (a == 0) return 0;
 125     if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
 126         return packFloat64(1, 0x43E, 0);
 127     }
 128     int zSign = (a < 0);
 129     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
 130 }
 131
 132 /*----------------------------------------------------------------------------
 133 | Returns the result of converting the single-precision floating-point value
 134 | `a' to the 32-bit two's complement integer format.  The conversion is
 135 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 136 | Arithmetic - which means in particular that the conversion is rounded
 137 | according to the current rounding mode.  If `a' is a NaN or the
 138 | conversion overflows the integer indefinite value is returned.
 139 *----------------------------------------------------------------------------*/
 140
 141 Bit32s float32_to_int32(float32 a, float_status_t &status)
 142 {
 143     Bit32u aSig = extractFloat32Frac(a);
 144     Bit16s aExp = extractFloat32Exp(a);
 145     int aSign = extractFloat32Sign(a);
 146     if ((aExp == 0xFF) && aSig) aSign = 0;
 147     if (aExp) aSig |= 0x00800000;
 148     int shiftCount = 0xAF - aExp;
 149     Bit64u aSig64 = aSig;
 150     aSig64 <<= 32;
 151     if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
 152     return roundAndPackInt32(aSign, aSig64, status);
 153 }
 154
 155 /*----------------------------------------------------------------------------
 156 | Returns the result of converting the single-precision floating-point value
 157 | `a' to the 32-bit two's complement integer format.  The conversion is
 158 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 159 | Arithmetic, except that the conversion is always rounded toward zero.
 160 | If `a' is a NaN or the conversion overflows, the integer indefinite
 161 | value is returned.
 162 *----------------------------------------------------------------------------*/
 163
 164 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
 165 {
 166     int aSign;
 167     Bit16s aExp;
 168     Bit32u aSig;
 169     Bit32s z;
 170
 171     aSig = extractFloat32Frac(a);
 172     aExp = extractFloat32Exp(a);
 173     aSign = extractFloat32Sign(a);
 174     int shiftCount = aExp - 0x9E;
 175     if (0 <= shiftCount) {
 176         if (a != 0xCF000000) {
 177             float_raise(status, float_flag_invalid);
 178         }
 179         return (Bit32s)(int32_indefinite);
 180     }
 181     else if (aExp <= 0x7E) {
 182         if (aExp | aSig) float_raise(status, float_flag_inexact);
 183         return 0;
 184     }
 185     aSig = (aSig | 0x00800000)<<8;
 186     z = aSig>>(-shiftCount);
 187     if ((Bit32u) (aSig<<(shiftCount & 31))) {
 188         float_raise(status, float_flag_inexact);
 189     }
 190     if (aSign) z = -z;
 191     return z;
 192 }
 193
 194 /*----------------------------------------------------------------------------
 195 | Returns the result of converting the single-precision floating-point value
 196 | `a' to the 64-bit two's complement integer format.  The conversion is
 197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 198 | Arithmetic - which means in particular that the conversion is rounded
 199 | according to the current rounding mode. If `a' is a NaN or the
 200 | conversion overflows, the integer indefinite value is returned.
 201 *----------------------------------------------------------------------------*/
 202
 203 Bit64s float32_to_int64(float32 a, float_status_t &status)
 204 {
 205     Bit64u aSig64, aSigExtra;
 206
 207     Bit32u aSig = extractFloat32Frac(a);
 208     Bit16s aExp = extractFloat32Exp(a);
 209     int aSign = extractFloat32Sign(a);
 210
 211     int shiftCount = 0xBE - aExp;
 212     if (shiftCount < 0) {
 213         float_raise(status, float_flag_invalid);
 214         return (Bit64s)(int64_indefinite);
 215     }
 216     if (aExp) aSig |= 0x00800000;
 217     aSig64 = aSig;
 218     aSig64 <<= 40;
 219     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
 220     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
 221 }
 222
 223 /*----------------------------------------------------------------------------
 224 | Returns the result of converting the single-precision floating-point value
 225 | `a' to the 64-bit two's complement integer format.  The conversion is
 226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 227 | Arithmetic, except that the conversion is always rounded toward zero.
 228 | If `a' is a NaN or the conversion overflows, the integer indefinite
 229 | value is returned.
 230 *----------------------------------------------------------------------------*/
 231
 232 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
 233 {
 234     int aSign;
 235     Bit16s aExp;
 236     Bit32u aSig;
 237     Bit64u aSig64;
 238     Bit64s z;
 239
 240     aSig = extractFloat32Frac(a);
 241     aExp = extractFloat32Exp(a);
 242     aSign = extractFloat32Sign(a);
 243     int shiftCount = aExp - 0xBE;
 244     if (0 <= shiftCount) {
 245         if (a != 0xDF000000) {
 246             float_raise(status, float_flag_invalid);
 247         }
 248         return (Bit64s)(int64_indefinite);
 249     }
 250     else if (aExp <= 0x7E) {
 251         if (aExp | aSig) float_raise(status, float_flag_inexact);
 252         return 0;
 253     }
 254     aSig64 = aSig | 0x00800000;
 255     aSig64 <<= 40;
 256     z = aSig64>>(-shiftCount);
 257     if ((Bit64u) (aSig64<<(shiftCount & 63))) {
 258         float_raise(status, float_flag_inexact);
 259     }
 260     if (aSign) z = -z;
 261     return z;
 262 }
 263
 264 /*----------------------------------------------------------------------------
 265 | Returns the result of converting the single-precision floating-point value
 266 | `a' to the double-precision floating-point format.  The conversion is
 267 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 268 | Arithmetic.
 269 *----------------------------------------------------------------------------*/
 270
 271 float64 float32_to_float64(float32 a, float_status_t &status)
 272 {
 273     Bit32u aSig = extractFloat32Frac(a);
 274     Bit16s aExp = extractFloat32Exp(a);
 275     int  aSign = extractFloat32Sign(a);
 276
 277     if (aExp == 0xFF) {
 278         if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
 279         return packFloat64(aSign, 0x7FF, 0);
 280     }
 281     if (aExp == 0) {
 282         if (aSig == 0) return packFloat64(aSign, 0, 0);
 283         float_raise(status, float_flag_denormal);
 284         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
 285         --aExp;
 286     }
 287     return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
 288 }
 289
 290 /*----------------------------------------------------------------------------
 291 | Rounds the single-precision floating-point value `a' to an integer, and
 292 | returns the result as a single-precision floating-point value.  The
 293 | operation is performed according to the IEC/IEEE Standard for Binary
 294 | Floating-Point Arithmetic.
 295 *----------------------------------------------------------------------------*/
 296
 297 float32 float32_round_to_int(float32 a, float_status_t &status)
 298 {
 299     int aSign;
 300     Bit16s aExp;
 301     Bit32u lastBitMask, roundBitsMask;
 302     int roundingMode;
 303     float32 z;
 304
 305     aExp = extractFloat32Exp(a);
 306     if (0x96 <= aExp) {
 307         if ((aExp == 0xFF) && extractFloat32Frac(a)) {
 308             return propagateFloat32NaN(a, status);
 309         }
 310         return a;
 311     }
 312     if (aExp <= 0x7E) {
 313         if ((Bit32u) (a<<1) == 0) return a;
 314         float_raise(status, float_flag_inexact);
 315         aSign = extractFloat32Sign(a);
 316         switch (get_float_rounding_mode(status)) {
 317          case float_round_nearest_even:
 318             if ((aExp == 0x7E) && extractFloat32Frac(a)) {
 319                 return packFloat32(aSign, 0x7F, 0);
 320             }
 321             break;
 322          case float_round_down:
 323             return aSign ? 0xBF800000 : 0;
 324          case float_round_up:
 325             return aSign ? 0x80000000 : 0x3F800000;
 326         }
 327         return packFloat32(aSign, 0, 0);
 328     }
 329     lastBitMask = 1;
 330     lastBitMask <<= 0x96 - aExp;
 331     roundBitsMask = lastBitMask - 1;
 332     z = a;
 333     roundingMode = get_float_rounding_mode(status);
 334     if (roundingMode == float_round_nearest_even) {
 335         z += lastBitMask>>1;
 336         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
 337     }
 338     else if (roundingMode != float_round_to_zero) {
 339         if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
 340             z += roundBitsMask;
 341         }
 342     }
 343     z &= ~roundBitsMask;
 344     if (z != a) float_raise(status, float_flag_inexact);
 345     return z;
 346 }
 347
 348 /*----------------------------------------------------------------------------
 349 | Returns the result of adding the absolute values of the single-precision
 350 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 351 | before being returned.  `zSign' is ignored if the result is a NaN.
 352 | The addition is performed according to the IEC/IEEE Standard for Binary
 353 | Floating-Point Arithmetic.
 354 *----------------------------------------------------------------------------*/
 355
 356 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
 357 {
 358     Bit16s aExp, bExp, zExp;
 359     Bit32u aSig, bSig, zSig;
 360     Bit16s expDiff;
 361
 362     aSig = extractFloat32Frac(a);
 363     aExp = extractFloat32Exp(a);
 364     bSig = extractFloat32Frac(b);
 365     bExp = extractFloat32Exp(b);
 366
 367     expDiff = aExp - bExp;
 368     aSig <<= 6;
 369     bSig <<= 6;
 370
 371     if (0 < expDiff) {
 372         if (aExp == 0xFF) {
 373             if (aSig) return propagateFloat32NaN(a, b, status);
 374             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
 375             return a;
 376         }
 377         if ((aExp == 0) && aSig)
 378             float_raise(status, float_flag_denormal);
 379
 380         if (bExp == 0) {
 381             if (bSig) float_raise(status, float_flag_denormal);
 382             --expDiff;
 383         }
 384         else bSig |= 0x20000000;
 385
 386         bSig = shift32RightJamming(bSig, expDiff);
 387         zExp = aExp;
 388     }
 389     else if (expDiff < 0) {
 390         if (bExp == 0xFF) {
 391             if (bSig) return propagateFloat32NaN(a, b, status);
 392             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
 393             return packFloat32(zSign, 0xFF, 0);
 394         }
 395         if ((bExp == 0) && bSig)
 396             float_raise(status, float_flag_denormal);
 397
 398         if (aExp == 0) {
 399             if (aSig) float_raise(status, float_flag_denormal);
 400             ++expDiff;
 401         }
 402         else aSig |= 0x20000000;
 403
 404         aSig = shift32RightJamming(aSig, -expDiff);
 405         zExp = bExp;
 406     }
 407     else {
 408         if (aExp == 0xFF) {
 409             if (aSig | bSig) return propagateFloat32NaN(a, b, status);
 410             return a;
 411         }
 412         if (aExp == 0) {
 413             if (aSig | bSig) float_raise(status, float_flag_denormal);
 414             return packFloat32(zSign, 0, (aSig + bSig)>>6);
 415         }
 416         zSig = 0x40000000 + aSig + bSig;
 417         zExp = aExp;
 418         goto roundAndPack;
 419     }
 420     aSig |= 0x20000000;
 421     zSig = (aSig + bSig)<<1;
 422     --zExp;
 423     if ((Bit32s) zSig < 0) {
 424         zSig = aSig + bSig;
 425         ++zExp;
 426     }
 427  roundAndPack:
 428     return roundAndPackFloat32(zSign, zExp, zSig, status);
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the result of subtracting the absolute values of the single-
 433 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
 434 | difference is negated before being returned.  `zSign' is ignored if the
 435 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
 436 | Standard for Binary Floating-Point Arithmetic.
 437 *----------------------------------------------------------------------------*/
 438
 439 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
 440 {
 441     Bit16s aExp, bExp, zExp;
 442     Bit32u aSig, bSig, zSig;
 443     Bit16s expDiff;
 444
 445     aSig = extractFloat32Frac(a);
 446     aExp = extractFloat32Exp(a);
 447     bSig = extractFloat32Frac(b);
 448     bExp = extractFloat32Exp(b);
 449
 450     expDiff = aExp - bExp;
 451     aSig <<= 7;
 452     bSig <<= 7;
 453     if (0 < expDiff) goto aExpBigger;
 454     if (expDiff < 0) goto bExpBigger;
 455     if (aExp == 0xFF) {
 456         if (aSig | bSig) return propagateFloat32NaN(a, b, status);
 457         float_raise(status, float_flag_invalid);
 458         return float32_default_nan;
 459     }
 460     if (aExp == 0) {
 461         if (aSig | bSig) float_raise(status, float_flag_denormal);
 462         aExp = 1;
 463         bExp = 1;
 464     }
 465     if (bSig < aSig) goto aBigger;
 466     if (aSig < bSig) goto bBigger;
 467     return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
 468  bExpBigger:
 469     if (bExp == 0xFF) {
 470         if (bSig) return propagateFloat32NaN(a, b, status);
 471         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
 472         return packFloat32(zSign ^ 1, 0xFF, 0);
 473     }
 474     if ((bExp == 0) && bSig)
 475         float_raise(status, float_flag_denormal);
 476
 477     if (aExp == 0) {
 478         if (aSig) float_raise(status, float_flag_denormal);
 479         ++expDiff;
 480     }
 481     else aSig |= 0x40000000;
 482
 483     aSig = shift32RightJamming(aSig, -expDiff);
 484     bSig |= 0x40000000;
 485  bBigger:
 486     zSig = bSig - aSig;
 487     zExp = bExp;
 488     zSign ^= 1;
 489     goto normalizeRoundAndPack;
 490  aExpBigger:
 491     if (aExp == 0xFF) {
 492         if (aSig) return propagateFloat32NaN(a, b, status);
 493         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
 494         return a;
 495     }
 496     if ((aExp == 0) && aSig)
 497         float_raise(status, float_flag_denormal);
 498
 499     if (bExp == 0) {
 500         if (bSig) float_raise(status, float_flag_denormal);
 501         --expDiff;
 502     }
 503     else bSig |= 0x40000000;
 504
 505     bSig = shift32RightJamming(bSig, expDiff);
 506     aSig |= 0x40000000;
 507  aBigger:
 508     zSig = aSig - bSig;
 509     zExp = aExp;
 510  normalizeRoundAndPack:
 511     --zExp;
 512     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the result of adding the single-precision floating-point values `a'
 517 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
 518 | Binary Floating-Point Arithmetic.
 519 *----------------------------------------------------------------------------*/
 520
 521 float32 float32_add(float32 a, float32 b, float_status_t &status)
 522 {
 523     int aSign = extractFloat32Sign(a);
 524     int bSign = extractFloat32Sign(b);
 525
 526     if (aSign == bSign) {
 527         return addFloat32Sigs(a, b, aSign, status);
 528     }
 529     else {
 530         return subFloat32Sigs(a, b, aSign, status);
 531     }
 532 }
 533
 534 /*----------------------------------------------------------------------------
 535 | Returns the result of subtracting the single-precision floating-point values
 536 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 537 | for Binary Floating-Point Arithmetic.
 538 *----------------------------------------------------------------------------*/
 539
 540 float32 float32_sub(float32 a, float32 b, float_status_t &status)
 541 {
 542     int aSign = extractFloat32Sign(a);
 543     int bSign = extractFloat32Sign(b);
 544
 545     if (aSign == bSign) {
 546         return subFloat32Sigs(a, b, aSign, status);
 547     }
 548     else {
 549         return addFloat32Sigs(a, b, aSign, status);
 550     }
 551 }
 552
 553 /*----------------------------------------------------------------------------
 554 | Returns the result of multiplying the single-precision floating-point values
 555 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 556 | for Binary Floating-Point Arithmetic.
 557 *----------------------------------------------------------------------------*/
 558
 559 float32 float32_mul(float32 a, float32 b, float_status_t &status)
 560 {
 561     int aSign, bSign, zSign;
 562     Bit16s aExp, bExp, zExp;
 563     Bit32u aSig, bSig;
 564     Bit64u zSig64;
 565     Bit32u zSig;
 566
 567     aSig = extractFloat32Frac(a);
 568     aExp = extractFloat32Exp(a);
 569     aSign = extractFloat32Sign(a);
 570     bSig = extractFloat32Frac(b);
 571     bExp = extractFloat32Exp(b);
 572     bSign = extractFloat32Sign(b);
 573     zSign = aSign ^ bSign;
 574     if (aExp == 0xFF) {
 575         if (aSig || ((bExp == 0xFF) && bSig))
 576             return propagateFloat32NaN(a, b, status);
 577
 578         if ((bExp | bSig) == 0) {
 579             float_raise(status, float_flag_invalid);
 580             return float32_default_nan;
 581         }
 582         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
 583         return packFloat32(zSign, 0xFF, 0);
 584     }
 585     if (bExp == 0xFF) {
 586         if (bSig) return propagateFloat32NaN(a, b, status);
 587         if ((aExp | aSig) == 0) {
 588             float_raise(status, float_flag_invalid);
 589             return float32_default_nan;
 590         }
 591         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
 592         return packFloat32(zSign, 0xFF, 0);
 593     }
 594     if (aExp == 0) {
 595         if (aSig == 0) {
 596             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
 597             return packFloat32(zSign, 0, 0);
 598         }
 599         float_raise(status, float_flag_denormal);
 600         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
 601     }
 602     if (bExp == 0) {
 603         if (bSig == 0) return packFloat32(zSign, 0, 0);
 604         float_raise(status, float_flag_denormal);
 605         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
 606     }
 607     zExp = aExp + bExp - 0x7F;
 608     aSig = (aSig | 0x00800000)<<7;
 609     bSig = (bSig | 0x00800000)<<8;
 610     zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
 611     zSig = (Bit32u) zSig64;
 612     if (0 <= (Bit32s) (zSig<<1)) {
 613         zSig <<= 1;
 614         --zExp;
 615     }
 616     return roundAndPackFloat32(zSign, zExp, zSig, status);
 617 }
 618
 619 /*----------------------------------------------------------------------------
 620 | Returns the result of dividing the single-precision floating-point value `a'
 621 | by the corresponding value `b'.  The operation is performed according to the
 622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 623 *----------------------------------------------------------------------------*/
 624
 625 float32 float32_div(float32 a, float32 b, float_status_t &status)
 626 {
 627     int aSign, bSign, zSign;
 628     Bit16s aExp, bExp, zExp;
 629     Bit32u aSig, bSig, zSig;
 630
 631     aSig = extractFloat32Frac(a);
 632     aExp = extractFloat32Exp(a);
 633     aSign = extractFloat32Sign(a);
 634     bSig = extractFloat32Frac(b);
 635     bExp = extractFloat32Exp(b);
 636     bSign = extractFloat32Sign(b);
 637     zSign = aSign ^ bSign;
 638     if (aExp == 0xFF) {
 639         if (aSig) return propagateFloat32NaN(a, b, status);
 640         if (bExp == 0xFF) {
 641             if (bSig) return propagateFloat32NaN(a, b, status);
 642    invalid:
 643             float_raise(status, float_flag_invalid);
 644             return float32_default_nan;
 645         }
 646         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
 647         return packFloat32(zSign, 0xFF, 0);
 648     }
 649     if (bExp == 0xFF) {
 650         if (bSig) return propagateFloat32NaN(a, b, status);
 651         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
 652         return packFloat32(zSign, 0, 0);
 653     }
 654     if (bExp == 0) {
 655         if (bSig == 0) {
 656             if ((aExp | aSig) == 0) goto invalid;
 657             float_raise(status, float_flag_divbyzero);
 658             return packFloat32(zSign, 0xFF, 0);
 659         }
 660         float_raise(status, float_flag_denormal);
 661         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
 662     }
 663     if (aExp == 0) {
 664         if (aSig == 0) return packFloat32(zSign, 0, 0);
 665         float_raise(status, float_flag_denormal);
 666         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
 667     }
 668     zExp = aExp - bExp + 0x7D;
 669     aSig = (aSig | 0x00800000)<<7;
 670     bSig = (bSig | 0x00800000)<<8;
 671     if (bSig <= (aSig + aSig)) {
 672         aSig >>= 1;
 673         ++zExp;
 674     }
 675     zSig = (((Bit64u) aSig)<<32) / bSig;
 676     if ((zSig & 0x3F) == 0) {
 677         zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
 678     }
 679     return roundAndPackFloat32(zSign, zExp, zSig, status);
 680 }
 681
 682 /*----------------------------------------------------------------------------
 683 | Returns the square root of the single-precision floating-point value `a'.
 684 | The operation is performed according to the IEC/IEEE Standard for Binary
 685 | Floating-Point Arithmetic.
 686 *----------------------------------------------------------------------------*/
 687
 688 float32 float32_sqrt(float32 a, float_status_t &status)
 689 {
 690     int aSign;
 691     Bit16s aExp, zExp;
 692     Bit32u aSig, zSig;
 693     Bit64u rem, term;
 694
 695     aSig = extractFloat32Frac(a);
 696     aExp = extractFloat32Exp(a);
 697     aSign = extractFloat32Sign(a);
 698     if (aExp == 0xFF) {
 699         if (aSig) return propagateFloat32NaN(a, status);
 700         if (! aSign) return a;
 701         float_raise(status, float_flag_invalid);
 702         return float32_default_nan;
 703     }
 704     if (aSign) {
 705         if ((aExp | aSig) == 0) return a;
 706         float_raise(status, float_flag_invalid);
 707         return float32_default_nan;
 708     }
 709     if (aExp == 0) {
 710         if (aSig == 0) return 0;
 711         float_raise(status, float_flag_denormal);
 712         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
 713     }
 714     zExp = ((aExp - 0x7F)>>1) + 0x7E;
 715     aSig = (aSig | 0x00800000)<<8;
 716     zSig = estimateSqrt32(aExp, aSig) + 2;
 717     if ((zSig & 0x7F) <= 5) {
 718         if (zSig < 2) {
 719             zSig = 0x7FFFFFFF;
 720             goto roundAndPack;
 721         }
 722         aSig >>= aExp & 1;
 723         term = ((Bit64u) zSig) * zSig;
 724         rem = (((Bit64u) aSig)<<32) - term;
 725         while ((Bit64s) rem < 0) {
 726             --zSig;
 727             rem += (((Bit64u) zSig)<<1) | 1;
 728         }
 729         zSig |= (rem != 0);
 730     }
 731     zSig = shift32RightJamming(zSig, 1);
 732  roundAndPack:
 733     return roundAndPackFloat32(0, zExp, zSig, status);
 734 }
 735
 736 /*----------------------------------------------------------------------------
 737 | Determine single-precision floating-point number class
 738 *----------------------------------------------------------------------------*/
 739
 740 float_class_t float32_class(float32 a)
 741 {
 742    Bit16s aExp = extractFloat32Exp(a);
 743    Bit32u aSig = extractFloat32Frac(a);
 744    int  aSign = extractFloat32Sign(a);
 745
 746    if(aExp == 0xFF) {
 747        if (aSig == 0)
 748            return (aSign) ? float_negative_inf : float_positive_inf;
 749
 750        return float_NaN;
 751    }
 752
 753    if(aExp == 0) {
 754        if (aSig == 0) return float_zero;
 755        return float_denormal;
 756    }
 757
 758    return float_normalized;
 759 }
 760
 761 /*----------------------------------------------------------------------------
 762 | Returns 1 if the single-precision floating-point value `a' is equal to
 763 | the corresponding value `b', and 0 otherwise.  The comparison is performed
 764 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 765 *----------------------------------------------------------------------------*/
 766
 767 int float32_eq(float32 a, float32 b, float_status_t &status)
 768 {
 769     float_class_t aClass = float32_class(a);
 770     float_class_t bClass = float32_class(b);
 771
 772     if (aClass == float_NaN || bClass == float_NaN)
 773     {
 774         if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
 775         {
 776             float_raise(status, float_flag_invalid);
 777         }
 778         return 0;
 779     }
 780
 781     if (aClass == float_denormal || bClass == float_denormal)
 782     {
 783         float_raise(status, float_flag_denormal);
 784     }
 785
 786     return (a == b) || ((Bit32u) ((a | b)<<1) == 0);
 787 }
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns 1 if the single-precision floating-point value `a' is less than
 791 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
 792 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 793 | Arithmetic.
 794 *----------------------------------------------------------------------------*/
 795
 796 int float32_le(float32 a, float32 b, float_status_t &status)
 797 {
 798     float_class_t aClass = float32_class(a);
 799     float_class_t bClass = float32_class(b);
 800
 801     if (aClass == float_NaN || bClass == float_NaN) {
 802         float_raise(status, float_flag_invalid);
 803         return 0;
 804     }
 805
 806     if (aClass == float_denormal || bClass == float_denormal)
 807     {
 808         float_raise(status, float_flag_denormal);
 809     }
 810
 811     int aSign = extractFloat32Sign(a);
 812     int bSign = extractFloat32Sign(b);
 813     if (aSign != bSign) return aSign || ((Bit32u) ((a | b)<<1) == 0);
 814     return (a == b) || (aSign ^ (a < b));
 815 }
 816
 817 /*----------------------------------------------------------------------------
 818 | Returns 1 if the single-precision floating-point value `a' is less than
 819 | the corresponding value `b', and 0 otherwise.  The comparison is performed
 820 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 821 *----------------------------------------------------------------------------*/
 822
 823 int float32_lt(float32 a, float32 b, float_status_t &status)
 824 {
 825     float_class_t aClass = float32_class(a);
 826     float_class_t bClass = float32_class(b);
 827
 828     if (aClass == float_NaN || bClass == float_NaN) {
 829         float_raise(status, float_flag_invalid);
 830         return 0;
 831     }
 832
 833     if (aClass == float_denormal || bClass == float_denormal)
 834     {
 835         float_raise(status, float_flag_denormal);
 836     }
 837
 838     int aSign = extractFloat32Sign(a);
 839     int bSign = extractFloat32Sign(b);
 840     if (aSign != bSign) return aSign && ((Bit32u) ((a | b)<<1) != 0);
 841     return (a != b) && (aSign ^ (a < b));
 842 }
 843
 844 /*----------------------------------------------------------------------------
 845 | Returns 1 if the single-precision floating-point value `a' is equal to
 846 | the corresponding value `b', and 0 otherwise.  The invalid exception is
 847 | raised if either operand is a NaN.  Otherwise, the comparison is performed
 848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 849 *----------------------------------------------------------------------------*/
 850
 851 int float32_eq_signaling(float32 a, float32 b, float_status_t &status)
 852 {
 853     float_class_t aClass = float32_class(a);
 854     float_class_t bClass = float32_class(b);
 855
 856     if (aClass == float_NaN || bClass == float_NaN) {
 857         float_raise(status, float_flag_invalid);
 858         return 0;
 859     }
 860
 861     if (aClass == float_denormal || bClass == float_denormal)
 862     {
 863         float_raise(status, float_flag_denormal);
 864     }
 865
 866     return (a == b) || ((Bit32u) ((a | b)<<1) == 0);
 867 }
 868
 869 /*----------------------------------------------------------------------------
 870 | Returns 1 if the single-precision floating-point value `a' is less than or
 871 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 872 | cause an exception.  Otherwise, the comparison is performed according to the
 873 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 874 *----------------------------------------------------------------------------*/
 875
 876 int float32_le_quiet(float32 a, float32 b, float_status_t &status)
 877 {
 878     float_class_t aClass = float32_class(a);
 879     float_class_t bClass = float32_class(b);
 880
 881     if (aClass == float_NaN || bClass == float_NaN)
 882     {
 883         if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
 884         {
 885             float_raise(status, float_flag_invalid);
 886         }
 887         return 0;
 888     }
 889
 890     if (aClass == float_denormal || bClass == float_denormal)
 891     {
 892         float_raise(status, float_flag_denormal);
 893     }
 894
 895     int aSign = extractFloat32Sign(a);
 896     int bSign = extractFloat32Sign(b);
 897     if (aSign != bSign) return aSign || ((Bit32u) ((a | b)<<1) == 0);
 898     return (a == b) || (aSign ^ (a < b));
 899 }
 900
 901 /*----------------------------------------------------------------------------
 902 | Returns 1 if the single-precision floating-point value `a' is less than
 903 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 904 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 905 | Standard for Binary Floating-Point Arithmetic.
 906 *----------------------------------------------------------------------------*/
 907
 908 int float32_lt_quiet(float32 a, float32 b, float_status_t &status)
 909 {
 910     float_class_t aClass = float32_class(a);
 911     float_class_t bClass = float32_class(b);
 912
 913     if (aClass == float_NaN || bClass == float_NaN)
 914     {
 915         if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
 916         {
 917             float_raise(status, float_flag_invalid);
 918         }
 919         return 0;
 920     }
 921
 922     if (aClass == float_denormal || bClass == float_denormal)
 923     {
 924         float_raise(status, float_flag_denormal);
 925     }
 926
 927     int aSign = extractFloat32Sign(a);
 928     int bSign = extractFloat32Sign(b);
 929     if (aSign != bSign) return aSign && ((Bit32u) ((a | b)<<1) != 0);
 930     return (a != b) && (aSign ^ (a < b));
 931 }
 932
 933 /*----------------------------------------------------------------------------
 934 | The unordered relationship is true when at least one of two source operands
 935 | being compared is a NaN. Quiet NaNs do not cause an exception.
 936 *----------------------------------------------------------------------------*/
 937
 938 int float32_unordered(float32 a, float32 b, float_status_t &status)
 939 {
 940     float_class_t aClass = float32_class(a);
 941     float_class_t bClass = float32_class(b);
 942
 943     if (aClass == float_NaN || bClass == float_NaN)
 944     {
 945         if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
 946         {
 947             float_raise(status, float_flag_invalid);
 948         }
 949         return 1;
 950     }
 951
 952     if (aClass == float_denormal || bClass == float_denormal)
 953     {
 954         float_raise(status, float_flag_denormal);
 955     }
 956
 957     return 0;
 958 }
 959
 960 /*----------------------------------------------------------------------------
 961 | Compare  between  two  single  precision  floating  point  numbers. Returns
 962 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
 963 | the    value    'a'   is   less   than   the   corresponding   value   `b',
 964 | 'float_relation_greater' if the value 'a' is greater than the corresponding
 965 | value `b', or 'float_relation_unordered' otherwise.
 966 *----------------------------------------------------------------------------*/
 967
 968 int float32_compare(float32 a, float32 b, float_status_t &status)
 969 {
 970     float_class_t aClass = float32_class(a);
 971     float_class_t bClass = float32_class(b);
 972
 973     if (aClass == float_NaN || bClass == float_NaN) {
 974         float_raise(status, float_flag_invalid);
 975         return float_relation_unordered;
 976     }
 977
 978     if (aClass == float_denormal || bClass == float_denormal)
 979     {
 980         float_raise(status, float_flag_denormal);
 981     }
 982
 983     if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
 984
 985     int aSign = extractFloat32Sign(a);
 986     int bSign = extractFloat32Sign(b);
 987     if (aSign != bSign)
 988         return (aSign) ? float_relation_less : float_relation_greater;
 989
 990     if (aSign ^ (a < b)) return float_relation_less;
 991     return float_relation_greater;
 992 }
 993
 994 /*----------------------------------------------------------------------------
 995 | Compare  between  two  double  precision  floating  point  numbers. Returns
 996 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
 997 | the    value    'a'   is   less   than   the   corresponding   value   `b',
 998 | 'float_relation_greater' if the value 'a' is greater than the corresponding
 999 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1000 | an exception.
1001 *----------------------------------------------------------------------------*/
1002
1003 int float32_compare_quiet(float32 a, float32 b, float_status_t &status)
1004 {
1005     float_class_t aClass = float32_class(a);
1006     float_class_t bClass = float32_class(b);
1007
1008     if (aClass == float_NaN || bClass == float_NaN)
1009     {
1010         if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
1011         {
1012             float_raise(status, float_flag_invalid);
1013         }
1014         return float_relation_unordered;
1015     }
1016
1017     if (aClass == float_denormal || bClass == float_denormal)
1018     {
1019         float_raise(status, float_flag_denormal);
1020     }
1021
1022     if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1023
1024     int aSign = extractFloat32Sign(a);
1025     int bSign = extractFloat32Sign(b);
1026     if (aSign != bSign)
1027         return (aSign) ? float_relation_less : float_relation_greater;
1028
1029     if (aSign ^ (a < b)) return float_relation_less;
1030     return float_relation_greater;
1031 }
1032
1033 /*----------------------------------------------------------------------------
1034 | Returns the result of converting the double-precision floating-point value
1035 | `a' to the 32-bit two's complement integer format.  The conversion is
1036 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1037 | Arithmetic - which means in particular that the conversion is rounded
1038 | according to the current rounding mode. If `a' is a NaN or the
1039 | conversion overflows, the integer indefinite value is returned.
1040 *----------------------------------------------------------------------------*/
1041
1042 Bit32s float64_to_int32(float64 a, float_status_t &status)
1043 {
1044     Bit64u aSig = extractFloat64Frac(a);
1045     Bit16s aExp = extractFloat64Exp(a);
1046     int aSign = extractFloat64Sign(a);
1047     if ((aExp == 0x7FF) && aSig) aSign = 0;
1048     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1049     int shiftCount = 0x42C - aExp;
1050     if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1051     return roundAndPackInt32(aSign, aSig, status);
1052 }
1053
1054 /*----------------------------------------------------------------------------
1055 | Returns the result of converting the double-precision floating-point value
1056 | `a' to the 32-bit two's complement integer format.  The conversion is
1057 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1058 | Arithmetic, except that the conversion is always rounded toward zero.
1059 | If `a' is a NaN or the conversion overflows, the integer indefinite
1060 | value  is returned.
1061 *----------------------------------------------------------------------------*/
1062
1063 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1064 {
1065     int aSign;
1066     Bit16s aExp;
1067     Bit64u aSig, savedASig;
1068     Bit32s z;
1069     int shiftCount;
1070
1071     aSig = extractFloat64Frac(a);
1072     aExp = extractFloat64Exp(a);
1073     aSign = extractFloat64Sign(a);
1074     if (0x41E < aExp) {
1075         if ((aExp == 0x7FF) && aSig) aSign = 0;
1076         goto invalid;
1077     }
1078     else if (aExp < 0x3FF) {
1079         if (aExp || aSig) float_raise(status, float_flag_inexact);
1080         return 0;
1081     }
1082     aSig |= BX_CONST64(0x0010000000000000);
1083     shiftCount = 0x433 - aExp;
1084     savedASig = aSig;
1085     aSig >>= shiftCount;
1086     z = (Bit32s) aSig;
1087     if (aSign) z = -z;
1088     if ((z < 0) ^ aSign) {
1089  invalid:
1090         float_raise(status, float_flag_invalid);
1091         return (Bit32s)(int32_indefinite);
1092     }
1093     if ((aSig<<shiftCount) != savedASig) {
1094         float_raise(status, float_flag_inexact);
1095     }
1096     return z;
1097 }
1098
1099 /*----------------------------------------------------------------------------
1100 | Returns the result of converting the double-precision floating-point value
1101 | `a' to the 64-bit two's complement integer format.  The conversion is
1102 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1103 | Arithmetic - which means in particular that the conversion is rounded
1104 | according to the current rounding mode.  If `a' is a NaN, the largest
1105 | positive integer is returned.  Otherwise, if the conversion overflows, the
1106 | largest integer with the same sign as `a' is returned.
1107 *----------------------------------------------------------------------------*/
1108
1109 Bit64s float64_to_int64(float64 a, float_status_t &status)
1110 {
1111     int aSign;
1112     Bit16s aExp;
1113     Bit64u aSig, aSigExtra;
1114
1115     aSig = extractFloat64Frac(a);
1116     aExp = extractFloat64Exp(a);
1117     aSign = extractFloat64Sign(a);
1118     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1119     int shiftCount = 0x433 - aExp;
1120     if (shiftCount <= 0) {
1121         if (0x43E < aExp) {
1122             float_raise(status, float_flag_invalid);
1123             return (Bit64s)(int64_indefinite);
1124         }
1125         aSigExtra = 0;
1126         aSig <<= -shiftCount;
1127     }
1128     else {
1129         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1130     }
1131     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1132 }
1133
1134 /*----------------------------------------------------------------------------
1135 | Returns the result of converting the double-precision floating-point value
1136 | `a' to the 64-bit two's complement integer format.  The conversion is
1137 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1138 | Arithmetic, except that the conversion is always rounded toward zero.
1139 | If `a' is a NaN or the conversion overflows, the integer indefinite
1140 | value  is returned.
1141 *----------------------------------------------------------------------------*/
1142
1143 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1144 {
1145     int aSign;
1146     Bit16s aExp;
1147     Bit64u aSig;
1148     Bit64s z;
1149
1150     aSig = extractFloat64Frac(a);
1151     aExp = extractFloat64Exp(a);
1152     aSign = extractFloat64Sign(a);
1153     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1154     int shiftCount = aExp - 0x433;
1155     if (0 <= shiftCount) {
1156         if (0x43E <= aExp) {
1157             if (a != BX_CONST64(0xC3E0000000000000)) {
1158                 float_raise(status, float_flag_invalid);
1159             }
1160             return (Bit64s)(int64_indefinite);
1161         }
1162         z = aSig<<shiftCount;
1163     }
1164     else {
1165         if (aExp < 0x3FE) {
1166             if (aExp | aSig) float_raise(status, float_flag_inexact);
1167             return 0;
1168         }
1169         z = aSig>>(-shiftCount);
1170         if ((Bit64u) (aSig<<(shiftCount & 63))) {
1171              float_raise(status, float_flag_inexact);
1172         }
1173     }
1174     if (aSign) z = -z;
1175     return z;
1176 }
1177
1178 /*----------------------------------------------------------------------------
1179 | Returns the result of converting the double-precision floating-point value
1180 | `a' to the single-precision floating-point format.  The conversion is
1181 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1182 | Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float32 float64_to_float32(float64 a, float_status_t &status)
1186 {
1187     int aSign;
1188     Bit16s aExp;
1189     Bit64u aSig;
1190     Bit32u zSig;
1191
1192     aSig = extractFloat64Frac(a);
1193     aExp = extractFloat64Exp(a);
1194     aSign = extractFloat64Sign(a);
1195     if (aExp == 0x7FF) {
1196         if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1197         return packFloat32(aSign, 0xFF, 0);
1198     }
1199     if (aExp == 0) {
1200         if (aSig == 0) return packFloat32(aSign, 0, 0);
1201         float_raise(status, float_flag_denormal);
1202     }
1203     aSig = shift64RightJamming(aSig, 22);
1204     zSig = (Bit32u) aSig;
1205     if (aExp || zSig) {
1206         zSig |= 0x40000000;
1207         aExp -= 0x381;
1208     }
1209     return roundAndPackFloat32(aSign, aExp, zSig, status);
1210 }
1211
1212 /*----------------------------------------------------------------------------
1213 | Rounds the double-precision floating-point value `a' to an integer, and
1214 | returns the result as a double-precision floating-point value.  The
1215 | operation is performed according to the IEC/IEEE Standard for Binary
1216 | Floating-Point Arithmetic.
1217 *----------------------------------------------------------------------------*/
1218
1219 float64 float64_round_to_int(float64 a, float_status_t &status)
1220 {
1221     int aSign;
1222     Bit16s aExp;
1223     Bit64u lastBitMask, roundBitsMask;
1224     int roundingMode;
1225     float64 z;
1226
1227     aExp = extractFloat64Exp(a);
1228     if (0x433 <= aExp) {
1229         if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1230             return propagateFloat64NaN(a, status);
1231         }
1232         return a;
1233     }
1234     if (aExp < 0x3FF) {
1235         if ((Bit64u) (a<<1) == 0) return a;
1236         float_raise(status, float_flag_inexact);
1237         aSign = extractFloat64Sign(a);
1238         switch (get_float_rounding_mode(status)) {
1239          case float_round_nearest_even:
1240             if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1241               return packFloat64(aSign, 0x3FF, 0);
1242             }
1243             break;
1244          case float_round_down:
1245             return aSign ? BX_CONST64(0xBFF0000000000000) : 0;
1246          case float_round_up:
1247             return
1248               aSign ? BX_CONST64(0x8000000000000000) : BX_CONST64(0x3FF0000000000000);
1249         }
1250         return packFloat64(aSign, 0, 0);
1251     }
1252     lastBitMask = 1;
1253     lastBitMask <<= 0x433 - aExp;
1254     roundBitsMask = lastBitMask - 1;
1255     z = a;
1256     roundingMode = get_float_rounding_mode(status);
1257     if (roundingMode == float_round_nearest_even) {
1258         z += lastBitMask>>1;
1259         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1260     }
1261     else if (roundingMode != float_round_to_zero) {
1262         if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1263             z += roundBitsMask;
1264         }
1265     }
1266     z &= ~roundBitsMask;
1267     if (z != a) float_raise(status, float_flag_inexact);
1268     return z;
1269 }
1270
1271 /*----------------------------------------------------------------------------
1272 | Returns the result of adding the absolute values of the double-precision
1273 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1274 | before being returned.  `zSign' is ignored if the result is a NaN.
1275 | The addition is performed according to the IEC/IEEE Standard for Binary
1276 | Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1278
1279 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
1280 {
1281     Bit16s aExp, bExp, zExp;
1282     Bit64u aSig, bSig, zSig;
1283     Bit16s expDiff;
1284
1285     aSig = extractFloat64Frac(a);
1286     aExp = extractFloat64Exp(a);
1287     bSig = extractFloat64Frac(b);
1288     bExp = extractFloat64Exp(b);
1289
1290     expDiff = aExp - bExp;
1291     aSig <<= 9;
1292     bSig <<= 9;
1293     if (0 < expDiff) {
1294         if (aExp == 0x7FF) {
1295             if (aSig) return propagateFloat64NaN(a, b, status);
1296             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1297             return a;
1298         }
1299         if ((aExp == 0) && aSig)
1300             float_raise(status, float_flag_denormal);
1301
1302         if (bExp == 0) {
1303             if (bSig) float_raise(status, float_flag_denormal);
1304             --expDiff;
1305         }
1306         else bSig |= BX_CONST64(0x2000000000000000);
1307
1308         bSig = shift64RightJamming(bSig, expDiff);
1309         zExp = aExp;
1310     }
1311     else if (expDiff < 0) {
1312         if (bExp == 0x7FF) {
1313             if (bSig) return propagateFloat64NaN(a, b, status);
1314             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1315             return packFloat64(zSign, 0x7FF, 0);
1316         }
1317         if ((bExp == 0) && bSig)
1318             float_raise(status, float_flag_denormal);
1319
1320         if (aExp == 0) {
1321             if (aSig) float_raise(status, float_flag_denormal);
1322             ++expDiff;
1323         }
1324         else aSig |= BX_CONST64(0x2000000000000000);
1325
1326         aSig = shift64RightJamming(aSig, -expDiff);
1327         zExp = bExp;
1328     }
1329     else {
1330         if (aExp == 0x7FF) {
1331             if (aSig | bSig) return propagateFloat64NaN(a, b, status);
1332             return a;
1333         }
1334         if (aExp == 0) {
1335             if (aSig | bSig) float_raise(status, float_flag_denormal);
1336             return packFloat64(zSign, 0, (aSig + bSig)>>9);
1337         }
1338         zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
1339         zExp = aExp;
1340         goto roundAndPack;
1341     }
1342     aSig |= BX_CONST64(0x2000000000000000);
1343     zSig = (aSig + bSig)<<1;
1344     --zExp;
1345     if ((Bit64s) zSig < 0) {
1346         zSig = aSig + bSig;
1347         ++zExp;
1348     }
1349  roundAndPack:
1350     return roundAndPackFloat64(zSign, zExp, zSig, status);
1351 }
1352
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of subtracting the absolute values of the double-
1355 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1356 | difference is negated before being returned.  `zSign' is ignored if the
1357 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1358 | Standard for Binary Floating-Point Arithmetic.
1359 *----------------------------------------------------------------------------*/
1360
1361 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
1362 {
1363     Bit16s aExp, bExp, zExp;
1364     Bit64u aSig, bSig, zSig;
1365     Bit16s expDiff;
1366
1367     aSig = extractFloat64Frac(a);
1368     aExp = extractFloat64Exp(a);
1369     bSig = extractFloat64Frac(b);
1370     bExp = extractFloat64Exp(b);
1371
1372     expDiff = aExp - bExp;
1373     aSig <<= 10;
1374     bSig <<= 10;
1375     if (0 < expDiff) goto aExpBigger;
1376     if (expDiff < 0) goto bExpBigger;
1377     if (aExp == 0x7FF) {
1378         if (aSig | bSig) return propagateFloat64NaN(a, b, status);
1379         float_raise(status, float_flag_invalid);
1380         return float64_default_nan;
1381     }
1382     if (aExp == 0) {
1383         if (aSig | bSig) float_raise(status, float_flag_denormal);
1384         aExp = 1;
1385         bExp = 1;
1386     }
1387     if (bSig < aSig) goto aBigger;
1388     if (aSig < bSig) goto bBigger;
1389     return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
1390  bExpBigger:
1391     if (bExp == 0x7FF) {
1392         if (bSig) return propagateFloat64NaN(a, b, status);
1393         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1394         return packFloat64(zSign ^ 1, 0x7FF, 0);
1395     }
1396     if ((bExp == 0) && bSig)
1397         float_raise(status, float_flag_denormal);
1398
1399     if (aExp == 0) {
1400         if (aSig) float_raise(status, float_flag_denormal);
1401         ++expDiff;
1402     }
1403     else aSig |= BX_CONST64(0x4000000000000000);
1404
1405     aSig = shift64RightJamming(aSig, -expDiff);
1406     bSig |= BX_CONST64(0x4000000000000000);
1407  bBigger:
1408     zSig = bSig - aSig;
1409     zExp = bExp;
1410     zSign ^= 1;
1411     goto normalizeRoundAndPack;
1412  aExpBigger:
1413     if (aExp == 0x7FF) {
1414         if (aSig) return propagateFloat64NaN(a, b, status);
1415         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1416         return a;
1417     }
1418     if ((aExp == 0) && aSig)
1419         float_raise(status, float_flag_denormal);
1420
1421     if (bExp == 0) {
1422         if (bSig) float_raise(status, float_flag_denormal);
1423         --expDiff;
1424     }
1425     else bSig |= BX_CONST64(0x4000000000000000);
1426
1427     bSig = shift64RightJamming(bSig, expDiff);
1428     aSig |= BX_CONST64(0x4000000000000000);
1429  aBigger:
1430     zSig = aSig - bSig;
1431     zExp = aExp;
1432  normalizeRoundAndPack:
1433     --zExp;
1434     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
1435 }
1436
1437 /*----------------------------------------------------------------------------
1438 | Returns the result of adding the double-precision floating-point values `a'
1439 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1440 | Binary Floating-Point Arithmetic.
1441 *----------------------------------------------------------------------------*/
1442
1443 float64 float64_add(float64 a, float64 b, float_status_t &status)
1444 {
1445     int aSign = extractFloat64Sign(a);
1446     int bSign = extractFloat64Sign(b);
1447
1448     if (aSign == bSign) {
1449         return addFloat64Sigs(a, b, aSign, status);
1450     }
1451     else {
1452         return subFloat64Sigs(a, b, aSign, status);
1453     }
1454 }
1455
1456 /*----------------------------------------------------------------------------
1457 | Returns the result of subtracting the double-precision floating-point values
1458 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1459 | for Binary Floating-Point Arithmetic.
1460 *----------------------------------------------------------------------------*/
1461
1462 float64 float64_sub(float64 a, float64 b, float_status_t &status)
1463 {
1464     int aSign = extractFloat64Sign(a);
1465     int bSign = extractFloat64Sign(b);
1466
1467     if (aSign == bSign) {
1468         return subFloat64Sigs(a, b, aSign, status);
1469     }
1470     else {
1471         return addFloat64Sigs(a, b, aSign, status);
1472     }
1473 }
1474
1475 /*----------------------------------------------------------------------------
1476 | Returns the result of multiplying the double-precision floating-point values
1477 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1478 | for Binary Floating-Point Arithmetic.
1479 *----------------------------------------------------------------------------*/
1480
1481 float64 float64_mul(float64 a, float64 b, float_status_t &status)
1482 {
1483     int aSign, bSign, zSign;
1484     Bit16s aExp, bExp, zExp;
1485     Bit64u aSig, bSig, zSig0, zSig1;
1486
1487     aSig = extractFloat64Frac(a);
1488     aExp = extractFloat64Exp(a);
1489     aSign = extractFloat64Sign(a);
1490     bSig = extractFloat64Frac(b);
1491     bExp = extractFloat64Exp(b);
1492     bSign = extractFloat64Sign(b);
1493     zSign = aSign ^ bSign;
1494     if (aExp == 0x7FF) {
1495         if (aSig || ((bExp == 0x7FF) && bSig)) {
1496             return propagateFloat64NaN(a, b, status);
1497         }
1498         if ((bExp | bSig) == 0) {
1499             float_raise(status, float_flag_invalid);
1500             return float64_default_nan;
1501         }
1502         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1503         return packFloat64(zSign, 0x7FF, 0);
1504     }
1505     if (bExp == 0x7FF) {
1506         if (bSig) return propagateFloat64NaN(a, b, status);
1507         if ((aExp | aSig) == 0) {
1508             float_raise(status, float_flag_invalid);
1509             return float64_default_nan;
1510         }
1511         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1512         return packFloat64(zSign, 0x7FF, 0);
1513     }
1514     if (aExp == 0) {
1515         if (aSig == 0) {
1516             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1517             return packFloat64(zSign, 0, 0);
1518         }
1519         float_raise(status, float_flag_denormal);
1520         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1521     }
1522     if (bExp == 0) {
1523         if (bSig == 0) return packFloat64(zSign, 0, 0);
1524         float_raise(status, float_flag_denormal);
1525         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
1526     }
1527     zExp = aExp + bExp - 0x3FF;
1528     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
1529     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
1530     mul64To128(aSig, bSig, &zSig0, &zSig1);
1531     zSig0 |= (zSig1 != 0);
1532     if (0 <= (Bit64s) (zSig0<<1)) {
1533         zSig0 <<= 1;
1534         --zExp;
1535     }
1536     return roundAndPackFloat64(zSign, zExp, zSig0, status);
1537 }
1538
1539 /*----------------------------------------------------------------------------
1540 | Returns the result of dividing the double-precision floating-point value `a'
1541 | by the corresponding value `b'.  The operation is performed according to
1542 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1543 *----------------------------------------------------------------------------*/
1544
1545 float64 float64_div(float64 a, float64 b, float_status_t &status)
1546 {
1547     int aSign, bSign, zSign;
1548     Bit16s aExp, bExp, zExp;
1549     Bit64u aSig, bSig, zSig;
1550     Bit64u rem0, rem1;
1551     Bit64u term0, term1;
1552
1553     aSig = extractFloat64Frac(a);
1554     aExp = extractFloat64Exp(a);
1555     aSign = extractFloat64Sign(a);
1556     bSig = extractFloat64Frac(b);
1557     bExp = extractFloat64Exp(b);
1558     bSign = extractFloat64Sign(b);
1559     zSign = aSign ^ bSign;
1560     if (aExp == 0x7FF) {
1561         if (aSig) return propagateFloat64NaN(a, b, status);
1562         if (bExp == 0x7FF) {
1563             if (bSig) return propagateFloat64NaN(a, b, status);
1564    invalid:
1565             float_raise(status, float_flag_invalid);
1566             return float64_default_nan;
1567         }
1568         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1569         return packFloat64(zSign, 0x7FF, 0);
1570     }
1571     if (bExp == 0x7FF) {
1572         if (bSig) return propagateFloat64NaN(a, b, status);
1573         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1574         return packFloat64(zSign, 0, 0);
1575     }
1576     if (bExp == 0) {
1577         if (bSig == 0) {
1578             if ((aExp | aSig) == 0) goto invalid;
1579             float_raise(status, float_flag_divbyzero);
1580             return packFloat64(zSign, 0x7FF, 0);
1581         }
1582         float_raise(status, float_flag_denormal);
1583         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
1584     }
1585     if (aExp == 0) {
1586         if (aSig == 0) return packFloat64(zSign, 0, 0);
1587         float_raise(status, float_flag_denormal);
1588         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1589     }
1590     zExp = aExp - bExp + 0x3FD;
1591     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
1592     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
1593     if (bSig <= (aSig + aSig)) {
1594         aSig >>= 1;
1595         ++zExp;
1596     }
1597     zSig = estimateDiv128To64(aSig, 0, bSig);
1598     if ((zSig & 0x1FF) <= 2) {
1599         mul64To128(bSig, zSig, &term0, &term1);
1600         sub128(aSig, 0, term0, term1, &rem0, &rem1);
1601         while ((Bit64s) rem0 < 0) {
1602             --zSig;
1603             add128(rem0, rem1, 0, bSig, &rem0, &rem1);
1604         }
1605         zSig |= (rem1 != 0);
1606     }
1607     return roundAndPackFloat64(zSign, zExp, zSig, status);
1608 }
1609
1610 /*----------------------------------------------------------------------------
1611 | Returns the square root of the double-precision floating-point value `a'.
1612 | The operation is performed according to the IEC/IEEE Standard for Binary
1613 | Floating-Point Arithmetic.
1614 *----------------------------------------------------------------------------*/
1615
1616 float64 float64_sqrt(float64 a, float_status_t &status)
1617 {
1618     int aSign;
1619     Bit16s aExp, zExp;
1620     Bit64u aSig, zSig, doubleZSig;
1621     Bit64u rem0, rem1, term0, term1;
1622
1623     aSig = extractFloat64Frac(a);
1624     aExp = extractFloat64Exp(a);
1625     aSign = extractFloat64Sign(a);
1626     if (aExp == 0x7FF) {
1627         if (aSig) return propagateFloat64NaN(a, status);
1628         if (! aSign) return a;
1629         float_raise(status, float_flag_invalid);
1630         return float64_default_nan;
1631     }
1632     if (aSign) {
1633         if ((aExp | aSig) == 0) return a;
1634         float_raise(status, float_flag_invalid);
1635         return float64_default_nan;
1636     }
1637     if (aExp == 0) {
1638         if (aSig == 0) return 0;
1639         float_raise(status, float_flag_denormal);
1640         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1641     }
1642     zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
1643     aSig |= BX_CONST64(0x0010000000000000);
1644     zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
1645     aSig <<= 9 - (aExp & 1);
1646     zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
1647     if ((zSig & 0x1FF) <= 5) {
1648         doubleZSig = zSig<<1;
1649         mul64To128(zSig, zSig, &term0, &term1);
1650         sub128(aSig, 0, term0, term1, &rem0, &rem1);
1651         while ((Bit64s) rem0 < 0) {
1652             --zSig;
1653             doubleZSig -= 2;
1654             add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
1655         }
1656         zSig |= ((rem0 | rem1) != 0);
1657     }
1658     return roundAndPackFloat64(0, zExp, zSig, status);
1659 }
1660
1661 /*----------------------------------------------------------------------------
1662 | Determine double-precision floating-point number class
1663 *----------------------------------------------------------------------------*/
1664
1665 float_class_t float64_class(float64 a)
1666 {
1667    Bit16s aExp = extractFloat64Exp(a);
1668    Bit64u aSig = extractFloat64Frac(a);
1669    int  aSign = extractFloat64Sign(a);
1670
1671    if(aExp == 0x7FF) {
1672        if (aSig == 0)
1673            return (aSign) ? float_negative_inf : float_positive_inf;
1674
1675        return float_NaN;
1676    }
1677
1678    if(aExp == 0) {
1679        if (aSig == 0)
1680            return float_zero;
1681        return float_denormal;
1682    }
1683
1684    return float_normalized;
1685 }
1686
1687 /*----------------------------------------------------------------------------
1688 | Returns 1 if the double-precision floating-point value `a' is equal to the
1689 | corresponding value `b', and 0 otherwise.  The comparison is performed
1690 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1691 *----------------------------------------------------------------------------*/
1692
1693 int float64_eq(float64 a, float64 b, float_status_t &status)
1694 {
1695     float_class_t aClass = float64_class(a);
1696     float_class_t bClass = float64_class(b);
1697
1698     if (aClass == float_NaN || bClass == float_NaN)
1699     {
1700         if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1701         {
1702             float_raise(status, float_flag_invalid);
1703         }
1704         return 0;
1705     }
1706
1707     if (aClass == float_denormal || bClass == float_denormal)
1708     {
1709         float_raise(status, float_flag_denormal);
1710     }
1711
1712     return (a == b) || ((Bit64u) ((a | b)<<1) == 0);
1713 }
1714
1715 /*----------------------------------------------------------------------------
1716 | Returns 1 if the double-precision floating-point value `a' is less than or
1717 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
1718 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1719 | Arithmetic.
1720 *----------------------------------------------------------------------------*/
1721
1722 int float64_le(float64 a, float64 b, float_status_t &status)
1723 {
1724     float_class_t aClass = float64_class(a);
1725     float_class_t bClass = float64_class(b);
1726
1727     if (aClass == float_NaN || bClass == float_NaN) {
1728         float_raise(status, float_flag_invalid);
1729         return 0;
1730     }
1731
1732     if (aClass == float_denormal || bClass == float_denormal)
1733     {
1734         float_raise(status, float_flag_denormal);
1735     }
1736
1737     int aSign = extractFloat64Sign(a);
1738     int bSign = extractFloat64Sign(b);
1739     if (aSign != bSign) return aSign || ((Bit64u) ((a | b)<<1) == 0);
1740     return (a == b) || (aSign ^ (a < b));
1741 }
1742
1743 /*----------------------------------------------------------------------------
1744 | Returns 1 if the double-precision floating-point value `a' is less than
1745 | the corresponding value `b', and 0 otherwise.  The comparison is performed
1746 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1747 *----------------------------------------------------------------------------*/
1748
1749 int float64_lt(float64 a, float64 b, float_status_t &status)
1750 {
1751     float_class_t aClass = float64_class(a);
1752     float_class_t bClass = float64_class(b);
1753
1754     if (aClass == float_NaN || bClass == float_NaN) {
1755         float_raise(status, float_flag_invalid);
1756         return 0;
1757     }
1758
1759     if (aClass == float_denormal || bClass == float_denormal)
1760     {
1761         float_raise(status, float_flag_denormal);
1762     }
1763
1764     int aSign = extractFloat64Sign(a);
1765     int bSign = extractFloat64Sign(b);
1766     if (aSign != bSign) return aSign && ((Bit64u) ((a | b)<<1) != 0);
1767     return (a != b) && (aSign ^ (a < b));
1768 }
1769
1770 /*----------------------------------------------------------------------------
1771 | Returns 1 if the double-precision floating-point value `a' is equal to the
1772 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
1773 | if either operand is a NaN.  Otherwise, the comparison is performed
1774 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1775 *----------------------------------------------------------------------------*/
1776
1777 int float64_eq_signaling(float64 a, float64 b, float_status_t &status)
1778 {
1779     float_class_t aClass = float64_class(a);
1780     float_class_t bClass = float64_class(b);
1781
1782     if (aClass == float_NaN || bClass == float_NaN) {
1783         float_raise(status, float_flag_invalid);
1784         return 0;
1785     }
1786
1787     if (aClass == float_denormal || bClass == float_denormal)
1788     {
1789         float_raise(status, float_flag_denormal);
1790     }
1791
1792     return (a == b) || ((Bit64u) ((a | b)<<1) == 0);
1793 }
1794
1795 /*----------------------------------------------------------------------------
1796 | Returns 1 if the double-precision floating-point value `a' is less than or
1797 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1798 | cause an exception.  Otherwise, the comparison is performed according to the
1799 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1800 *----------------------------------------------------------------------------*/
1801
1802 int float64_le_quiet(float64 a, float64 b, float_status_t &status)
1803 {
1804     float_class_t aClass = float64_class(a);
1805     float_class_t bClass = float64_class(b);
1806
1807     if (aClass == float_NaN || bClass == float_NaN)
1808     {
1809         if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1810         {
1811             float_raise(status, float_flag_invalid);
1812         }
1813         return 0;
1814     }
1815
1816     if (aClass == float_denormal || bClass == float_denormal)
1817     {
1818         float_raise(status, float_flag_denormal);
1819     }
1820
1821     int aSign = extractFloat64Sign(a);
1822     int bSign = extractFloat64Sign(b);
1823     if (aSign != bSign) return aSign || ((Bit64u) ((a | b)<<1) == 0);
1824     return (a == b) || (aSign ^ (a < b));
1825 }
1826
1827 /*----------------------------------------------------------------------------
1828 | Returns 1 if the double-precision floating-point value `a' is less than
1829 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1830 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1831 | Standard for Binary Floating-Point Arithmetic.
1832 *----------------------------------------------------------------------------*/
1833
1834 int float64_lt_quiet(float64 a, float64 b, float_status_t &status)
1835 {
1836     float_class_t aClass = float64_class(a);
1837     float_class_t bClass = float64_class(b);
1838
1839     if (aClass == float_NaN || bClass == float_NaN)
1840     {
1841         if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1842         {
1843             float_raise(status, float_flag_invalid);
1844         }
1845         return 0;
1846     }
1847
1848     if (aClass == float_denormal || bClass == float_denormal)
1849     {
1850         float_raise(status, float_flag_denormal);
1851     }
1852
1853     int aSign = extractFloat64Sign(a);
1854     int bSign = extractFloat64Sign(b);
1855     if (aSign != bSign) return aSign && ((Bit64u) ((a | b)<<1) != 0);
1856     return (a != b) && (aSign ^ (a < b));
1857 }
1858
1859 /*----------------------------------------------------------------------------
1860 | The unordered relationship is true when at least one of two source operands
1861 | being compared is a NaN. Quiet NaNs do not cause an exception.
1862 *----------------------------------------------------------------------------*/
1863
1864 int float64_unordered(float64 a, float64 b, float_status_t &status)
1865 {
1866     float_class_t aClass = float64_class(a);
1867     float_class_t bClass = float64_class(b);
1868
1869     if (aClass == float_NaN || bClass == float_NaN)
1870     {
1871         if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1872         {
1873             float_raise(status, float_flag_invalid);
1874         }
1875         return 1;
1876     }
1877
1878     if (aClass == float_denormal || bClass == float_denormal)
1879     {
1880         float_raise(status, float_flag_denormal);
1881     }
1882
1883     return 0;
1884 }
1885
1886 /*----------------------------------------------------------------------------
1887 | Compare  between  two  double  precision  floating  point  numbers. Returns
1888 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
1889 | the    value    'a'   is   less   than   the   corresponding   value   `b',
1890 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1891 | value `b', or 'float_relation_unordered' otherwise.
1892 *----------------------------------------------------------------------------*/
1893
1894 int float64_compare(float64 a, float64 b, float_status_t &status)
1895 {
1896     float_class_t aClass = float64_class(a);
1897     float_class_t bClass = float64_class(b);
1898
1899     if (aClass == float_NaN || bClass == float_NaN) {
1900         float_raise(status, float_flag_invalid);
1901         return float_relation_unordered;
1902     }
1903
1904     if (aClass == float_denormal || bClass == float_denormal)
1905     {
1906         float_raise(status, float_flag_denormal);
1907     }
1908
1909     if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
1910
1911     int aSign = extractFloat64Sign(a);
1912     int bSign = extractFloat64Sign(b);
1913     if (aSign != bSign)
1914         return (aSign) ? float_relation_less : float_relation_greater;
1915
1916     if (aSign ^ (a < b)) return float_relation_less;
1917     return float_relation_greater;
1918 }
1919
1920 /*----------------------------------------------------------------------------
1921 | Compare  between  two  double  precision  floating  point  numbers. Returns
1922 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
1923 | the    value    'a'   is   less   than   the   corresponding   value   `b',
1924 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1925 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1926 | an exception.
1927 *----------------------------------------------------------------------------*/
1928
1929 int float64_compare_quiet(float64 a, float64 b, float_status_t &status)
1930 {
1931     float_class_t aClass = float64_class(a);
1932     float_class_t bClass = float64_class(b);
1933
1934     if (aClass == float_NaN || bClass == float_NaN)
1935     {
1936         if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1937         {
1938             float_raise(status, float_flag_invalid);
1939         }
1940         return float_relation_unordered;
1941     }
1942
1943     if (aClass == float_denormal || bClass == float_denormal)
1944     {
1945         float_raise(status, float_flag_denormal);
1946     }
1947
1948     if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
1949
1950     int aSign = extractFloat64Sign(a);
1951     int bSign = extractFloat64Sign(b);
1952     if (aSign != bSign)
1953         return (aSign) ? float_relation_less : float_relation_greater;
1954
1955     if (aSign ^ (a < b)) return float_relation_less;
1956     return float_relation_greater;
1957 }
1958
1959 #ifdef FLOATX80
1960
1961 /*----------------------------------------------------------------------------
1962 | Returns the result of converting the 32-bit two's complement integer `a'
1963 | to the extended double-precision floating-point format.  The conversion
1964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1965 | Arithmetic.
1966 *----------------------------------------------------------------------------*/
1967
1968 floatx80 int32_to_floatx80(Bit32s a)
1969 {
1970     if (a == 0) return packFloatx80(0, 0, 0);
1971     int   zSign = (a < 0);
1972     Bit32u absA = zSign ? -a : a;
1973     int    shiftCount = countLeadingZeros32(absA) + 32;
1974     Bit64u zSig = absA;
1975     return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
1976 }
1977
1978 /*----------------------------------------------------------------------------
1979 | Returns the result of converting the 64-bit two's complement integer `a'
1980 | to the extended double-precision floating-point format.  The conversion
1981 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1982 | Arithmetic.
1983 *----------------------------------------------------------------------------*/
1984
1985 floatx80 int64_to_floatx80(Bit64s a)
1986 {
1987     if (a == 0) return packFloatx80(0, 0, 0);
1988     int   zSign = (a < 0);
1989     Bit64u absA = zSign ? -a : a;
1990     int    shiftCount = countLeadingZeros64(absA);
1991     return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
1992 }
1993
1994 /*----------------------------------------------------------------------------
1995 | Returns the result of converting the single-precision floating-point value
1996 | `a' to the extended double-precision floating-point format.  The conversion
1997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1998 | Arithmetic.
1999 *----------------------------------------------------------------------------*/
2000
2001 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2002 {
2003     Bit32u aSig = extractFloat32Frac(a);
2004     Bit16s aExp = extractFloat32Exp(a);
2005     int aSign = extractFloat32Sign(a);
2006     if (aExp == 0xFF) {
2007         if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2008         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2009     }
2010     if (aExp == 0) {
2011         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2012         float_raise(status, float_flag_denormal);
2013         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2014     }
2015     aSig |= 0x00800000;
2016     return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 | Returns the result of converting the double-precision floating-point value
2021 | `a' to the extended double-precision floating-point format.  The conversion
2022 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2023 | Arithmetic.
2024 *----------------------------------------------------------------------------*/
2025
2026 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2027 {
2028     Bit64u aSig = extractFloat64Frac(a);
2029     Bit16s aExp = extractFloat64Exp(a);
2030     int aSign = extractFloat64Sign(a);
2031
2032     if (aExp == 0x7FF) {
2033         if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2034         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2035     }
2036     if (aExp == 0) {
2037         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2038         float_raise(status, float_flag_denormal);
2039         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2040     }
2041     return
2042         packFloatx80(
2043             aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2044 }
2045
2046 /*----------------------------------------------------------------------------
2047 | Returns the result of converting the extended double-precision floating-
2048 | point value `a' to the 32-bit two's complement integer format.  The
2049 | conversion is performed according to the IEC/IEEE Standard for Binary
2050 | Floating-Point Arithmetic - which means in particular that the conversion
2051 | is rounded according to the current rounding mode. If `a' is a NaN or the
2052 | conversion overflows, the integer indefinite value is returned.
2053 *----------------------------------------------------------------------------*/
2054
2055 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2056 {
2057     Bit64u aSig = extractFloatx80Frac(a);
2058     Bit32s aExp = extractFloatx80Exp(a);
2059     int aSign = extractFloatx80Sign(a);
2060
2061     // handle unsupported extended double-precision floating encodings
2062     if (floatx80_is_unsupported(a))
2063     {
2064         float_raise(status, float_flag_invalid);
2065         return int32_indefinite;
2066     }
2067
2068     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2069     int shiftCount = 0x4037 - aExp;
2070     if (shiftCount <= 0) shiftCount = 1;
2071     aSig = shift64RightJamming(aSig, shiftCount);
2072     return roundAndPackInt32(aSign, aSig, status);
2073 }
2074
2075 /*----------------------------------------------------------------------------
2076 | Returns the result of converting the extended double-precision floating-
2077 | point value `a' to the 32-bit two's complement integer format.  The
2078 | conversion is performed according to the IEC/IEEE Standard for Binary
2079 | Floating-Point Arithmetic, except that the conversion is always rounded
2080 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2081 | indefinite value is returned.
2082 *----------------------------------------------------------------------------*/
2083
2084 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2085 {
2086     Bit32s aExp;
2087     Bit64u aSig, savedASig;
2088     Bit32s z;
2089     int shiftCount;
2090
2091     // handle unsupported extended double-precision floating encodings
2092     if (floatx80_is_unsupported(a))
2093     {
2094         float_raise(status, float_flag_invalid);
2095         return int32_indefinite;
2096     }
2097
2098     aSig = extractFloatx80Frac(a);
2099     aExp = extractFloatx80Exp(a);
2100     int aSign = extractFloatx80Sign(a);
2101
2102     if (aExp > 0x401E) goto invalid;
2103     if (aExp < 0x3FFF) {
2104         if (aExp || aSig) float_raise(status, float_flag_inexact);
2105         return 0;
2106     }
2107     shiftCount = 0x403E - aExp;
2108     savedASig = aSig;
2109     aSig >>= shiftCount;
2110     z = (Bit32s) aSig;
2111     if (aSign) z = -z;
2112     if ((z < 0) ^ aSign) {
2113  invalid:
2114         float_raise(status, float_flag_invalid);
2115         return (Bit32s)(int32_indefinite);
2116     }
2117     if ((aSig<<shiftCount) != savedASig)
2118     {
2119         float_raise(status, float_flag_inexact);
2120     }
2121     return z;
2122 }
2123
2124 /*----------------------------------------------------------------------------
2125 | Returns the result of converting the extended double-precision floating-
2126 | point value `a' to the 64-bit two's complement integer format.  The
2127 | conversion is performed according to the IEC/IEEE Standard for Binary
2128 | Floating-Point Arithmetic - which means in particular that the conversion
2129 | is rounded according to the current rounding mode. If `a' is a NaN or the
2130 | conversion overflows, the integer indefinite value is returned.
2131 *----------------------------------------------------------------------------*/
2132
2133 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2134 {
2135     Bit32s aExp;
2136     Bit64u aSig, aSigExtra;
2137
2138     // handle unsupported extended double-precision floating encodings
2139     if (floatx80_is_unsupported(a))
2140     {
2141         float_raise(status, float_flag_invalid);
2142         return int64_indefinite;
2143     }
2144
2145     aSig = extractFloatx80Frac(a);
2146     aExp = extractFloatx80Exp(a);
2147     int aSign = extractFloatx80Sign(a);
2148
2149     int shiftCount = 0x403E - aExp;
2150     if (shiftCount <= 0)
2151     {
2152         if (shiftCount)
2153         {
2154             float_raise(status, float_flag_invalid);
2155             return (Bit64s)(int64_indefinite);
2156         }
2157         aSigExtra = 0;
2158     }
2159     else {
2160         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2161     }
2162
2163     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2164 }
2165
2166 /*----------------------------------------------------------------------------
2167 | Returns the result of converting the extended double-precision floating-
2168 | point value `a' to the 64-bit two's complement integer format.  The
2169 | conversion is performed according to the IEC/IEEE Standard for Binary
2170 | Floating-Point Arithmetic, except that the conversion is always rounded
2171 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2172 | indefinite value is returned.
2173 *----------------------------------------------------------------------------*/
2174
2175 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2176 {
2177     int aSign;
2178     Bit32s aExp;
2179     Bit64u aSig;
2180     Bit64s z;
2181
2182     // handle unsupported extended double-precision floating encodings
2183     if (floatx80_is_unsupported(a))
2184     {
2185         float_raise(status, float_flag_invalid);
2186         return int64_indefinite;
2187     }
2188
2189     aSig = extractFloatx80Frac(a);
2190     aExp = extractFloatx80Exp(a);
2191     aSign = extractFloatx80Sign(a);
2192     int shiftCount = aExp - 0x403E;
2193     if (0 <= shiftCount) {
2194         aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2195         if ((a.exp != 0xC03E) || aSig) {
2196             float_raise(status, float_flag_invalid);
2197         }
2198         return (Bit64s)(int64_indefinite);
2199     }
2200     else if (aExp < 0x3FFF) {
2201         if (aExp | aSig) float_raise(status, float_flag_inexact);
2202         return 0;
2203     }
2204     z = aSig>>(-shiftCount);
2205     if ((Bit64u) (aSig<<(shiftCount & 63))) {
2206         float_raise(status, float_flag_inexact);
2207     }
2208     if (aSign) z = -z;
2209     return z;
2210 }
2211
2212 /*----------------------------------------------------------------------------
2213 | Returns the result of converting the extended double-precision floating-
2214 | point value `a' to the single-precision floating-point format.  The
2215 | conversion is performed according to the IEC/IEEE Standard for Binary
2216 | Floating-Point Arithmetic.
2217 *----------------------------------------------------------------------------*/
2218
2219 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2220 {
2221     Bit64u aSig = extractFloatx80Frac(a);
2222     Bit32s aExp = extractFloatx80Exp(a);
2223     int aSign = extractFloatx80Sign(a);
2224
2225     // handle unsupported extended double-precision floating encodings
2226     if (floatx80_is_unsupported(a))
2227     {
2228         float_raise(status, float_flag_invalid);
2229         return float32_default_nan;
2230     }
2231
2232     if (aExp == 0x7FFF) {
2233         if ((Bit64u) (aSig<<1))
2234             return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2235
2236         return packFloat32(aSign, 0xFF, 0);
2237     }
2238     aSig = shift64RightJamming(aSig, 33);
2239     if (aExp || aSig) aExp -= 0x3F81;
2240     return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2241 }
2242
2243 /*----------------------------------------------------------------------------
2244 | Returns the result of converting the extended double-precision floating-
2245 | point value `a' to the double-precision floating-point format.  The
2246 | conversion is performed according to the IEC/IEEE Standard for Binary
2247 | Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249
2250 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2251 {
2252     Bit32s aExp;
2253     Bit64u aSig, zSig;
2254
2255     // handle unsupported extended double-precision floating encodings
2256     if (floatx80_is_unsupported(a))
2257     {
2258         float_raise(status, float_flag_invalid);
2259         return float64_default_nan;
2260     }
2261
2262     aSig = extractFloatx80Frac(a);
2263     aExp = extractFloatx80Exp(a);
2264     int aSign = extractFloatx80Sign(a);
2265
2266     if (aExp == 0x7FFF) {
2267         if ((Bit64u) (aSig<<1)) {
2268             return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2269         }
2270         return packFloat64(aSign, 0x7FF, 0);
2271     }
2272     zSig = shift64RightJamming(aSig, 1);
2273     if (aExp || aSig) aExp -= 0x3C01;
2274     return roundAndPackFloat64(aSign, aExp, zSig, status);
2275 }
2276
2277 /*----------------------------------------------------------------------------
2278 | Rounds the extended double-precision floating-point value `a' to an integer,
2279 | and returns the result as an extended double-precision floating-point
2280 | value.  The operation is performed according to the IEC/IEEE Standard for
2281 | Binary Floating-Point Arithmetic.
2282 *----------------------------------------------------------------------------*/
2283
2284 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2285 {
2286     int aSign;
2287     Bit64u lastBitMask, roundBitsMask;
2288     Bit8u roundingMode;
2289     floatx80 z;
2290
2291     // handle unsupported extended double-precision floating encodings
2292     if (floatx80_is_unsupported(a))
2293     {
2294         float_raise(status, float_flag_invalid);
2295         return floatx80_default_nan;
2296     }
2297
2298     Bit32s aExp = extractFloatx80Exp(a);
2299     Bit64u aSig = extractFloatx80Frac(a);
2300     if (0x403E <= aExp) {
2301         if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2302             return propagateFloatx80NaN(a, status);
2303         }
2304         return a;
2305     }
2306     if (aExp < 0x3FFF) {
2307         if ((aExp == 0) && ((Bit64u) (aSig<<1) == 0)) {
2308             return a;
2309         }
2310         float_raise(status, float_flag_inexact);
2311         aSign = extractFloatx80Sign(a);
2312         switch (get_float_rounding_mode(status)) {
2313          case float_round_nearest_even:
2314             if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1))
2315                 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2316             break;
2317          case float_round_down:
2318             return aSign ?
2319                       packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000))
2320                                 : packFloatx80(0, 0, 0);
2321          case float_round_up:
2322             return aSign ?
2323                       packFloatx80(1, 0, 0)
2324                                 : packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
2325         }
2326         return packFloatx80(aSign, 0, 0);
2327     }
2328     lastBitMask = 1;
2329     lastBitMask <<= 0x403E - aExp;
2330     roundBitsMask = lastBitMask - 1;
2331     z = a;
2332     roundingMode = get_float_rounding_mode(status);
2333     if (roundingMode == float_round_nearest_even) {
2334         z.fraction += lastBitMask>>1;
2335         if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
2336     }
2337     else if (roundingMode != float_round_to_zero) {
2338         if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
2339             z.fraction += roundBitsMask;
2340     }
2341     z.fraction &= ~roundBitsMask;
2342     if (z.fraction == 0) {
2343         z.exp++;
2344         z.fraction = BX_CONST64(0x8000000000000000);
2345     }
2346     if (z.fraction != a.fraction) float_raise(status, float_flag_inexact);
2347     return z;
2348 }
2349
2350 /*----------------------------------------------------------------------------
2351 | Returns the result of adding the absolute values of the extended double-
2352 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
2353 | negated before being returned.  `zSign' is ignored if the result is a NaN.
2354 | The addition is performed according to the IEC/IEEE Standard for Binary
2355 | Floating-Point Arithmetic.
2356 *----------------------------------------------------------------------------*/
2357
2358 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2359 {
2360     Bit32s aExp, bExp, zExp;
2361     Bit64u aSig, bSig, zSig0, zSig1;
2362     Bit32s expDiff;
2363
2364     // handle unsupported extended double-precision floating encodings
2365     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2366     {
2367         float_raise(status, float_flag_invalid);
2368         return floatx80_default_nan;
2369     }
2370
2371     aSig = extractFloatx80Frac(a);
2372     aExp = extractFloatx80Exp(a);
2373     bSig = extractFloatx80Frac(b);
2374     bExp = extractFloatx80Exp(b);
2375
2376     if (aExp == 0x7FFF) {
2377         if ((Bit64u) (aSig<<1)
2378              || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
2379         {
2380             return propagateFloatx80NaN(a, b, status);
2381         }
2382         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2383         return a;
2384     }
2385     if (bExp == 0x7FFF) {
2386         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2387         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2388         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2389     }
2390     if (aExp == 0) {
2391         if (aSig == 0) {
2392             if ((bExp == 0) && bSig) {
2393                 float_raise(status, float_flag_denormal);
2394                 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2395             }
2396             return roundAndPackFloatx80(get_float_rounding_precision(status),
2397                     zSign, bExp, bSig, 0, status);
2398         }
2399         float_raise(status, float_flag_denormal);
2400         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2401     }
2402     if (bExp == 0) {
2403         if (bSig == 0)
2404             return roundAndPackFloatx80(get_float_rounding_precision(status),
2405                     zSign, aExp, aSig, 0, status);
2406
2407         float_raise(status, float_flag_denormal);
2408         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2409     }
2410     expDiff = aExp - bExp;
2411     zExp = aExp;
2412     if (0 < expDiff) {
2413         shift64ExtraRightJamming(bSig, 0,  expDiff, &bSig, &zSig1);
2414     }
2415     else if (expDiff < 0) {
2416         shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
2417         zExp = bExp;
2418     }
2419     else {
2420         zSig0 = aSig + bSig;
2421         zSig1 = 0;
2422         goto shiftRight1;
2423     }
2424     zSig0 = aSig + bSig;
2425     if ((Bit64s) zSig0 < 0) goto roundAndPack;
2426  shiftRight1:
2427     shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
2428     zSig0 |= BX_CONST64(0x8000000000000000);
2429     zExp++;
2430  roundAndPack:
2431     return
2432         roundAndPackFloatx80(get_float_rounding_precision(status),
2433             zSign, zExp, zSig0, zSig1, status);
2434 }
2435
2436 /*----------------------------------------------------------------------------
2437 | Returns the result of subtracting the absolute values of the extended
2438 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
2439 | difference is negated before being returned.  `zSign' is ignored if the
2440 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2441 | Standard for Binary Floating-Point Arithmetic.
2442 *----------------------------------------------------------------------------*/
2443
2444 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2445 {
2446     Bit32s aExp, bExp, zExp;
2447     Bit64u aSig, bSig, zSig0, zSig1;
2448     Bit32s expDiff;
2449
2450     // handle unsupported extended double-precision floating encodings
2451     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2452     {
2453         float_raise(status, float_flag_invalid);
2454         return floatx80_default_nan;
2455     }
2456
2457     aSig = extractFloatx80Frac(a);
2458     aExp = extractFloatx80Exp(a);
2459     bSig = extractFloatx80Frac(b);
2460     bExp = extractFloatx80Exp(b);
2461
2462     expDiff = aExp - bExp;
2463     if (0 < expDiff) goto aExpBigger;
2464     if (expDiff < 0) goto bExpBigger;
2465     if (aExp == 0x7FFF) {
2466         if ((Bit64u) ((aSig | bSig)<<1))
2467             return propagateFloatx80NaN(a, b, status);
2468         float_raise(status, float_flag_invalid);
2469         return floatx80_default_nan;
2470     }
2471     if (aExp == 0) {
2472         if (aSig | bSig) float_raise(status, float_flag_denormal);
2473         aExp = 1;
2474         bExp = 1;
2475     }
2476     zSig1 = 0;
2477     if (bSig < aSig) goto aBigger;
2478     if (aSig < bSig) goto bBigger;
2479     return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
2480  bExpBigger:
2481     if (bExp == 0x7FFF) {
2482         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2483         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2484         return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
2485     }
2486     if ((bExp == 0) && bSig)
2487         float_raise(status, float_flag_denormal);
2488     if (aExp == 0) {
2489         if (aSig) float_raise(status, float_flag_denormal);
2490         ++expDiff;
2491     }
2492     shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
2493  bBigger:
2494     sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
2495     zExp = bExp;
2496     zSign ^= 1;
2497     goto normalizeRoundAndPack;
2498  aExpBigger:
2499     if (aExp == 0x7FFF) {
2500         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
2501         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2502         return a;
2503     }
2504     if ((aExp == 0) && aSig)
2505         float_raise(status, float_flag_denormal);
2506
2507     if (bExp == 0) {
2508         if (bSig) float_raise(status, float_flag_denormal);
2509         --expDiff;
2510     }
2511     shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
2512  aBigger:
2513     sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
2514     zExp = aExp;
2515  normalizeRoundAndPack:
2516     return
2517         normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
2518             zSign, zExp, zSig0, zSig1, status);
2519 }
2520
2521 /*----------------------------------------------------------------------------
2522 | Returns the result of adding the extended double-precision floating-point
2523 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
2524 | Standard for Binary Floating-Point Arithmetic.
2525 *----------------------------------------------------------------------------*/
2526
2527 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
2528 {
2529     int aSign = extractFloatx80Sign(a);
2530     int bSign = extractFloatx80Sign(b);
2531
2532     if (aSign == bSign)
2533         return addFloatx80Sigs(a, b, aSign, status);
2534     else
2535         return subFloatx80Sigs(a, b, aSign, status);
2536 }
2537
2538 /*----------------------------------------------------------------------------
2539 | Returns the result of subtracting the extended double-precision floating-
2540 | point values `a' and `b'.  The operation is performed according to the
2541 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2542 *----------------------------------------------------------------------------*/
2543
2544 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
2545 {
2546     int aSign = extractFloatx80Sign(a);
2547     int bSign = extractFloatx80Sign(b);
2548
2549     if (aSign == bSign)
2550         return subFloatx80Sigs(a, b, aSign, status);
2551     else
2552         return addFloatx80Sigs(a, b, aSign, status);
2553 }
2554
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of multiplying the extended double-precision floating-
2557 | point values `a' and `b'.  The operation is performed according to the
2558 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2560
2561 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
2562 {
2563     int aSign, bSign, zSign;
2564     Bit32s aExp, bExp, zExp;
2565     Bit64u aSig, bSig, zSig0, zSig1;
2566
2567     // handle unsupported extended double-precision floating encodings
2568     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2569     {
2570  invalid:
2571         float_raise(status, float_flag_invalid);
2572         return floatx80_default_nan;
2573     }
2574
2575     aSig = extractFloatx80Frac(a);
2576     aExp = extractFloatx80Exp(a);
2577     aSign = extractFloatx80Sign(a);
2578     bSig = extractFloatx80Frac(b);
2579     bExp = extractFloatx80Exp(b);
2580     bSign = extractFloatx80Sign(b);
2581     zSign = aSign ^ bSign;
2582
2583     if (aExp == 0x7FFF) {
2584         if ((Bit64u) (aSig<<1)
2585              || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
2586         {
2587             return propagateFloatx80NaN(a, b, status);
2588         }
2589         if (bExp == 0) {
2590             if (bSig == 0) goto invalid;
2591             float_raise(status, float_flag_denormal);
2592         }
2593         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2594     }
2595     if (bExp == 0x7FFF) {
2596         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2597         if (aExp == 0) {
2598             if (aSig == 0) goto invalid;
2599             float_raise(status, float_flag_denormal);
2600         }
2601         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2602     }
2603     if (aExp == 0) {
2604         if (aSig == 0) {
2605             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2606             return packFloatx80(zSign, 0, 0);
2607         }
2608         float_raise(status, float_flag_denormal);
2609         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2610     }
2611     if (bExp == 0) {
2612         if (bSig == 0) return packFloatx80(zSign, 0, 0);
2613         float_raise(status, float_flag_denormal);
2614         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2615     }
2616     zExp = aExp + bExp - 0x3FFE;
2617     mul64To128(aSig, bSig, &zSig0, &zSig1);
2618     if (0 < (Bit64s) zSig0) {
2619         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
2620         --zExp;
2621     }
2622     return
2623         roundAndPackFloatx80(get_float_rounding_precision(status),
2624              zSign, zExp, zSig0, zSig1, status);
2625 }
2626
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of dividing the extended double-precision floating-point
2629 | value `a' by the corresponding value `b'.  The operation is performed
2630 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2631 *----------------------------------------------------------------------------*/
2632
2633 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
2634 {
2635     int aSign, bSign, zSign;
2636     Bit32s aExp, bExp, zExp;
2637     Bit64u aSig, bSig, zSig0, zSig1;
2638     Bit64u rem0, rem1, rem2, term0, term1, term2;
2639
2640     // handle unsupported extended double-precision floating encodings
2641     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2642     {
2643         float_raise(status, float_flag_invalid);
2644         return floatx80_default_nan;
2645     }
2646
2647     aSig = extractFloatx80Frac(a);
2648     aExp = extractFloatx80Exp(a);
2649     aSign = extractFloatx80Sign(a);
2650     bSig = extractFloatx80Frac(b);
2651     bExp = extractFloatx80Exp(b);
2652     bSign = extractFloatx80Sign(b);
2653
2654     zSign = aSign ^ bSign;
2655     if (aExp == 0x7FFF) {
2656         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
2657         if (bExp == 0x7FFF) {
2658             if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2659             goto invalid;
2660         }
2661         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2662         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2663     }
2664     if (bExp == 0x7FFF) {
2665         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2666         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2667         return packFloatx80(zSign, 0, 0);
2668     }
2669     if (bExp == 0) {
2670         if (bSig == 0) {
2671             if ((aExp | aSig) == 0) {
2672  invalid:
2673                 float_raise(status, float_flag_invalid);
2674                 return floatx80_default_nan;
2675             }
2676             float_raise(status, float_flag_divbyzero);
2677             return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2678         }
2679         float_raise(status, float_flag_denormal);
2680         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2681     }
2682     if (aExp == 0) {
2683         if (aSig == 0) return packFloatx80(zSign, 0, 0);
2684         float_raise(status, float_flag_denormal);
2685         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2686     }
2687     zExp = aExp - bExp + 0x3FFE;
2688     rem1 = 0;
2689     if (bSig <= aSig) {
2690         shift128Right(aSig, 0, 1, &aSig, &rem1);
2691         ++zExp;
2692     }
2693     zSig0 = estimateDiv128To64(aSig, rem1, bSig);
2694     mul64To128(bSig, zSig0, &term0, &term1);
2695     sub128(aSig, rem1, term0, term1, &rem0, &rem1);
2696     while ((Bit64s) rem0 < 0) {
2697         --zSig0;
2698         add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2699     }
2700     zSig1 = estimateDiv128To64(rem1, 0, bSig);
2701     if ((Bit64u) (zSig1<<1) <= 8) {
2702         mul64To128(bSig, zSig1, &term1, &term2);
2703         sub128(rem1, 0, term1, term2, &rem1, &rem2);
2704         while ((Bit64s) rem1 < 0) {
2705             --zSig1;
2706             add128(rem1, rem2, 0, bSig, &rem1, &rem2);
2707         }
2708         zSig1 |= ((rem1 | rem2) != 0);
2709     }
2710     return
2711         roundAndPackFloatx80(get_float_rounding_precision(status),
2712             zSign, zExp, zSig0, zSig1, status);
2713 }
2714
2715 /*----------------------------------------------------------------------------
2716 | Returns the square root of the extended double-precision floating-point
2717 | value `a'.  The operation is performed according to the IEC/IEEE Standard
2718 | for Binary Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2720
2721 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
2722 {
2723     int aSign;
2724     Bit32s aExp, zExp;
2725     Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
2726     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2727
2728     // handle unsupported extended double-precision floating encodings
2729     if (floatx80_is_unsupported(a))
2730     {
2731         float_raise(status, float_flag_invalid);
2732         return floatx80_default_nan;
2733     }
2734
2735     aSig0 = extractFloatx80Frac(a);
2736     aExp = extractFloatx80Exp(a);
2737     aSign = extractFloatx80Sign(a);
2738     if (aExp == 0x7FFF) {
2739         if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
2740         if (! aSign) return a;
2741         goto invalid;
2742     }
2743     if (aSign) {
2744         if ((aExp | aSig0) == 0) return a;
2745  invalid:
2746         float_raise(status, float_flag_invalid);
2747         return floatx80_default_nan;
2748     }
2749     if (aExp == 0) {
2750         if (aSig0 == 0) return packFloatx80(0, 0, 0);
2751         float_raise(status, float_flag_denormal);
2752         normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
2753     }
2754     zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
2755     zSig0 = estimateSqrt32(aExp, aSig0>>32);
2756     shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
2757     zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
2758     doubleZSig0 = zSig0<<1;
2759     mul64To128(zSig0, zSig0, &term0, &term1);
2760     sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
2761     while ((Bit64s) rem0 < 0) {
2762         --zSig0;
2763         doubleZSig0 -= 2;
2764         add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
2765     }
2766     zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
2767     if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
2768         if (zSig1 == 0) zSig1 = 1;
2769         mul64To128(doubleZSig0, zSig1, &term1, &term2);
2770         sub128(rem1, 0, term1, term2, &rem1, &rem2);
2771         mul64To128(zSig1, zSig1, &term2, &term3);
2772         sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
2773         while ((Bit64s) rem1 < 0) {
2774             --zSig1;
2775             shortShift128Left(0, zSig1, 1, &term2, &term3);
2776             term3 |= 1;
2777             term2 |= doubleZSig0;
2778             add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
2779         }
2780         zSig1 |= ((rem1 | rem2 | rem3) != 0);
2781     }
2782     shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
2783     zSig0 |= doubleZSig0;
2784     return
2785         roundAndPackFloatx80(get_float_rounding_precision(status),
2786             0, zExp, zSig0, zSig1, status);
2787 }
2788
2789 #endif
2790
2791 #ifdef FLOAT128
2792
2793 /*----------------------------------------------------------------------------
2794 | Returns the result of converting the extended double-precision floating-
2795 | point value `a' to the quadruple-precision floating-point format. The
2796 | conversion is performed according to the IEC/IEEE Standard for Binary
2797 | Floating-Point Arithmetic.
2798 *----------------------------------------------------------------------------*/
2799
2800 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
2801 {
2802     Bit64u zSig0, zSig1;
2803
2804     Bit64u aSig = extractFloatx80Frac(a);
2805     Bit32s aExp = extractFloatx80Exp(a);
2806     int   aSign = extractFloatx80Sign(a);
2807
2808     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
2809         return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
2810
2811     shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
2812     return packFloat128(aSign, aExp, zSig0, zSig1);
2813 }
2814
2815 /*----------------------------------------------------------------------------
2816 | Returns the result of converting the quadruple-precision floating-point
2817 | value `a' to the extended double-precision floating-point format.  The
2818 | conversion is performed according to the IEC/IEEE Standard for Binary
2819 | Floating-Point Arithmetic.
2820 *----------------------------------------------------------------------------*/
2821
2822 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
2823 {
2824     Bit32s aExp;
2825     Bit64u aSig0, aSig1;
2826
2827     aSig1 = extractFloat128Frac1(a);
2828     aSig0 = extractFloat128Frac0(a);
2829     aExp = extractFloat128Exp(a);
2830     int aSign = extractFloat128Sign(a);
2831
2832     if (aExp == 0x7FFF) {
2833         if (aSig0 | aSig1)
2834             return commonNaNToFloatx80(float128ToCommonNaN(a, status));
2835
2836         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2837     }
2838
2839     if (aExp == 0) {
2840         if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
2841         float_raise(status, float_flag_denormal);
2842         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
2843     }
2844     else aSig0 |= BX_CONST64(0x0001000000000000);
2845
2846     shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
2847     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
2848 }
2849
2850 /*----------------------------------------------------------------------------
2851 | Returns the result of multiplying the extended double-precision floating-
2852 | point value `a' and quadruple-precision floating point value `b'. The
2853 | operation is performed according to the IEC/IEEE Standard for Binary
2854 | Floating-Point Arithmetic.
2855 *----------------------------------------------------------------------------*/
2856
2857 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
2858 {
2859     Bit32s aExp, bExp, zExp;
2860     Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
2861     int aSign, bSign, zSign;
2862
2863     // handle unsupported extended double-precision floating encodings
2864     if (floatx80_is_unsupported(a))
2865     {
2866  invalid:
2867         float_raise(status, float_flag_invalid);
2868         return floatx80_default_nan;
2869     }
2870
2871     aSig = extractFloatx80Frac(a);
2872     aExp = extractFloatx80Exp(a);
2873     aSign = extractFloatx80Sign(a);
2874     bSig0 = extractFloat128Frac0(b);
2875     bSig1 = extractFloat128Frac1(b);
2876     bExp = extractFloat128Exp(b);
2877     bSign = extractFloat128Sign(b);
2878
2879     zSign = aSign ^ bSign;
2880
2881     if (aExp == 0x7FFF) {
2882         if ((Bit64u) (aSig<<1)
2883              || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
2884         {
2885             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
2886             return propagateFloatx80NaN(a, r, status);
2887         }
2888         if (bExp == 0) {
2889             if ((bSig0 | bSig1) == 0) goto invalid;
2890             float_raise(status, float_flag_denormal);
2891         }
2892         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2893     }
2894     if (bExp == 0x7FFF) {
2895         if (bSig0 | bSig1) {
2896             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
2897             return propagateFloatx80NaN(a, r, status);
2898         }
2899         if (aExp == 0) {
2900             if (aSig == 0) goto invalid;
2901             float_raise(status, float_flag_denormal);
2902         }
2903         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2904     }
2905     if (aExp == 0) {
2906         if (aSig == 0) {
2907             if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
2908             return packFloatx80(zSign, 0, 0);
2909         }
2910         float_raise(status, float_flag_denormal);
2911         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2912     }
2913     if (bExp == 0) {
2914         if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
2915         float_raise(status, float_flag_denormal);
2916         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
2917     }
2918     else bSig0 |= BX_CONST64(0x0001000000000000);
2919
2920     zExp = aExp + bExp - 0x3FFE;
2921     shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
2922     mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
2923     if (0 < (Bit64s) zSig0) {
2924         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
2925         --zExp;
2926     }
2927     return
2928         roundAndPackFloatx80(get_float_rounding_precision(status),
2929              zSign, zExp, zSig0, zSig1, status);
2930 }
2931
2932 /*----------------------------------------------------------------------------
2933 | Returns the result of adding the absolute values of the quadruple-precision
2934 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2935 | before being returned. `zSign' is ignored if the result is a NaN.
2936 | The addition is performed according to the IEC/IEEE Standard for Binary
2937 | Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2939
2940 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
2941 {
2942     Bit32s aExp, bExp, zExp;
2943     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2944     Bit32s expDiff;
2945
2946     aSig1 = extractFloat128Frac1(a);
2947     aSig0 = extractFloat128Frac0(a);
2948     aExp = extractFloat128Exp(a);
2949     bSig1 = extractFloat128Frac1(b);
2950     bSig0 = extractFloat128Frac0(b);
2951     bExp = extractFloat128Exp(b);
2952     expDiff = aExp - bExp;
2953
2954     if (0 < expDiff) {
2955         if (aExp == 0x7FFF) {
2956             if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
2957             return a;
2958         }
2959         if (bExp == 0) --expDiff;
2960         else bSig0 |= BX_CONST64(0x0001000000000000);
2961         shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
2962         zExp = aExp;
2963     }
2964     else if (expDiff < 0) {
2965         if (bExp == 0x7FFF) {
2966             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
2967             return packFloat128(zSign, 0x7FFF, 0, 0);
2968         }
2969         if (aExp == 0) ++expDiff;
2970         else aSig0 |= BX_CONST64(0x0001000000000000);
2971         shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
2972         zExp = bExp;
2973     }
2974     else {
2975         if (aExp == 0x7FFF) {
2976             if (aSig0 | aSig1 | bSig0 | bSig1)
2977                 return propagateFloat128NaN(a, b, status);
2978
2979             return a;
2980         }
2981         add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
2982         if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
2983         zSig2 = 0;
2984         zSig0 |= BX_CONST64(0x0002000000000000);
2985         zExp = aExp;
2986         goto shiftRight1;
2987     }
2988     aSig0 |= BX_CONST64(0x0001000000000000);
2989     add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
2990     --zExp;
2991     if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
2992     ++zExp;
2993  shiftRight1:
2994     shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
2995  roundAndPack:
2996     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2997 }
2998
2999 /*----------------------------------------------------------------------------
3000 | Returns the result of subtracting the absolute values of the quadruple-
3001 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3002 | difference is negated before being returned.  `zSign' is ignored if the
3003 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3004 | Standard for Binary Floating-Point Arithmetic.
3005 *----------------------------------------------------------------------------*/
3006
3007 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3008 {
3009     Bit32s aExp, bExp, zExp;
3010     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3011     Bit32s expDiff;
3012
3013     aSig1 = extractFloat128Frac1(a);
3014     aSig0 = extractFloat128Frac0(a);
3015     aExp = extractFloat128Exp(a);
3016     bSig1 = extractFloat128Frac1(b);
3017     bSig0 = extractFloat128Frac0(b);
3018     bExp = extractFloat128Exp(b);
3019
3020     expDiff = aExp - bExp;
3021     shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3022     shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3023     if (0 < expDiff) goto aExpBigger;
3024     if (expDiff < 0) goto bExpBigger;
3025     if (aExp == 0x7FFF) {
3026         if (aSig0 | aSig1 | bSig0 | bSig1)
3027             return propagateFloat128NaN(a, b, status);
3028
3029         float_raise(status, float_flag_invalid);
3030         return float128_default_nan;
3031     }
3032     if (aExp == 0) {
3033         aExp = 1;
3034         bExp = 1;
3035     }
3036     if (bSig0 < aSig0) goto aBigger;
3037     if (aSig0 < bSig0) goto bBigger;
3038     if (bSig1 < aSig1) goto aBigger;
3039     if (aSig1 < bSig1) goto bBigger;
3040     return packFloat128(0, 0);
3041
3042  bExpBigger:
3043     if (bExp == 0x7FFF) {
3044         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3045         return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3046     }
3047     if (aExp == 0) ++expDiff;
3048     else {
3049         aSig0 |= BX_CONST64(0x4000000000000000);
3050     }
3051     shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3052     bSig0 |= BX_CONST64(0x4000000000000000);
3053  bBigger:
3054     sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3055     zExp = bExp;
3056     zSign ^= 1;
3057     goto normalizeRoundAndPack;
3058  aExpBigger:
3059     if (aExp == 0x7FFF) {
3060         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3061         return a;
3062     }
3063     if (bExp == 0) --expDiff;
3064     else {
3065         bSig0 |= BX_CONST64(0x4000000000000000);
3066     }
3067     shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3068     aSig0 |= BX_CONST64(0x4000000000000000);
3069  aBigger:
3070     sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3071     zExp = aExp;
3072  normalizeRoundAndPack:
3073     --zExp;
3074     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3075 }
3076
3077 /*----------------------------------------------------------------------------
3078 | Returns the result of adding the quadruple-precision floating-point values
3079 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3080 | for Binary Floating-Point Arithmetic.
3081 *----------------------------------------------------------------------------*/
3082
3083 float128 float128_add(float128 a, float128 b, float_status_t &status)
3084 {
3085     int aSign = extractFloat128Sign(a);
3086     int bSign = extractFloat128Sign(b);
3087
3088     if (aSign == bSign) {
3089         return addFloat128Sigs(a, b, aSign, status);
3090     }
3091     else {
3092         return subFloat128Sigs(a, b, aSign, status);
3093     }
3094 }
3095
3096 /*----------------------------------------------------------------------------
3097 | Returns the result of subtracting the quadruple-precision floating-point
3098 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3099 | Standard for Binary Floating-Point Arithmetic.
3100 *----------------------------------------------------------------------------*/
3101
3102 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3103 {
3104     int aSign = extractFloat128Sign(a);
3105     int bSign = extractFloat128Sign(b);
3106
3107     if (aSign == bSign) {
3108         return subFloat128Sigs(a, b, aSign, status);
3109     }
3110     else {
3111         return addFloat128Sigs(a, b, aSign, status);
3112     }
3113 }
3114
3115 /*----------------------------------------------------------------------------
3116 | Returns the result of multiplying the quadruple-precision floating-point
3117 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3118 | Standard for Binary Floating-Point Arithmetic.
3119 *----------------------------------------------------------------------------*/
3120
3121 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3122 {
3123     int aSign, bSign, zSign;
3124     Bit32s aExp, bExp, zExp;
3125     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3126
3127     aSig1 = extractFloat128Frac1(a);
3128     aSig0 = extractFloat128Frac0(a);
3129     aExp = extractFloat128Exp(a);
3130     aSign = extractFloat128Sign(a);
3131     bSig1 = extractFloat128Frac1(b);
3132     bSig0 = extractFloat128Frac0(b);
3133     bExp = extractFloat128Exp(b);
3134     bSign = extractFloat128Sign(b);
3135
3136     zSign = aSign ^ bSign;
3137     if (aExp == 0x7FFF) {
3138         if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3139         {
3140             return propagateFloat128NaN(a, b, status);
3141         }
3142         if ((bExp | bSig0 | bSig1) == 0) goto invalid;
3143         return packFloat128(zSign, 0x7FFF, 0, 0);
3144     }
3145     if (bExp == 0x7FFF) {
3146         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3147         if ((aExp | aSig0 | aSig1) == 0) {
3148  invalid:
3149             float_raise(status, float_flag_invalid);
3150             return float128_default_nan;
3151         }
3152         return packFloat128(zSign, 0x7FFF, 0, 0);
3153     }
3154     if (aExp == 0) {
3155         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3156         float_raise(status, float_flag_denormal);
3157         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3158     }
3159     if (bExp == 0) {
3160         if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3161         float_raise(status, float_flag_denormal);
3162         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3163     }
3164     zExp = aExp + bExp - 0x4000;
3165     aSig0 |= BX_CONST64(0x0001000000000000);
3166     shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3167     mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3168     add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3169     zSig2 |= (zSig3 != 0);
3170     if (BX_CONST64(0x0002000000000000) <= zSig0) {
3171         shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3172         ++zExp;
3173     }
3174     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3175 }
3176
3177 /*----------------------------------------------------------------------------
3178 | Returns the result of dividing the quadruple-precision floating-point value
3179 | `a' by the corresponding value `b'.  The operation is performed according to
3180 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3181 *----------------------------------------------------------------------------*/
3182
3183 float128 float128_div(float128 a, float128 b, float_status_t &status)
3184 {
3185     int aSign, bSign, zSign;
3186     Bit32s aExp, bExp, zExp;
3187     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3188     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3189
3190     aSig1 = extractFloat128Frac1(a);
3191     aSig0 = extractFloat128Frac0(a);
3192     aExp = extractFloat128Exp(a);
3193     aSign = extractFloat128Sign(a);
3194     bSig1 = extractFloat128Frac1(b);
3195     bSig0 = extractFloat128Frac0(b);
3196     bExp = extractFloat128Exp(b);
3197     bSign = extractFloat128Sign(b);
3198
3199     zSign = aSign ^ bSign;
3200     if (aExp == 0x7FFF) {
3201         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3202         if (bExp == 0x7FFF) {
3203             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3204             goto invalid;
3205         }
3206         return packFloat128(zSign, 0x7FFF, 0, 0);
3207     }
3208     if (bExp == 0x7FFF) {
3209         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3210         return packFloat128(zSign, 0, 0, 0);
3211     }
3212     if (bExp == 0) {
3213         if ((bSig0 | bSig1) == 0) {
3214             if ((aExp | aSig0 | aSig1) == 0) {
3215  invalid:
3216                 float_raise(status, float_flag_invalid);
3217                 return float128_default_nan;
3218             }
3219             float_raise(status, float_flag_divbyzero);
3220             return packFloat128(zSign, 0x7FFF, 0, 0);
3221         }
3222         float_raise(status, float_flag_denormal);
3223         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3224     }
3225     if (aExp == 0) {
3226         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3227         float_raise(status, float_flag_denormal);
3228         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3229     }
3230     zExp = aExp - bExp + 0x3FFD;
3231     shortShift128Left(
3232         aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3233     shortShift128Left(
3234         bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3235     if (le128(bSig0, bSig1, aSig0, aSig1)) {
3236         shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3237         ++zExp;
3238     }
3239     zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3240     mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3241     sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3242     while ((Bit64s) rem0 < 0) {
3243         --zSig0;
3244         add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3245     }
3246     zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3247     if ((zSig1 & 0x3FFF) <= 4) {
3248         mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3249         sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3250         while ((Bit64s) rem1 < 0) {
3251             --zSig1;
3252             add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3253         }
3254         zSig1 |= ((rem1 | rem2 | rem3) != 0);
3255     }
3256     shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3257     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3258 }
3259
3260 #endif