external/bsd/pcc/dist/pcc-libs/libsoftfloat/bits64/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.5 2007/11/08 21:31:04 martin Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 #ifdef SOFTFLOAT_FOR_GCC
  48 #include "softfloat-for-gcc.h"
  49 #endif
  50
  51 #include "milieu.h"
  52 #include "softfloat.h"
  53
  54 /*
  55  * Conversions between floats as stored in memory and floats as
  56  * SoftFloat uses them
  57  */
  58 #ifndef FLOAT64_DEMANGLE
  59 #define FLOAT64_DEMANGLE(a)     (a)
  60 #endif
  61 #ifndef FLOAT64_MANGLE
  62 #define FLOAT64_MANGLE(a)       (a)
  63 #endif
  64
  65 /*
  66 -------------------------------------------------------------------------------
  67 Floating-point rounding mode, extended double-precision rounding precision,
  68 and exception flags.
  69 -------------------------------------------------------------------------------
  70 */
  71 fp_rnd float_rounding_mode = float_round_nearest_even;
  72 fp_except float_exception_flags = 0;
  73 #ifdef FLOATX80
  74 int8 floatx80_rounding_precision = 80;
  75 #endif
  76
  77 /*
  78 -------------------------------------------------------------------------------
  79 Primitive arithmetic functions, including multi-word arithmetic, and
  80 division and square root approximations.  (Can be specialized to target if
  81 desired.)
  82 -------------------------------------------------------------------------------
  83 */
  84 #include "softfloat-macros"
  85
  86 /*
  87 -------------------------------------------------------------------------------
  88 Functions and definitions to determine:  (1) whether tininess for underflow
  89 is detected before or after rounding by default, (2) what (if anything)
  90 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  92 are propagated from function inputs to output.  These details are target-
  93 specific.
  94 -------------------------------------------------------------------------------
  95 */
  96 #include "softfloat-specialize"
  97
  98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
  99 /*
 100 -------------------------------------------------------------------------------
 101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 102 and 7, and returns the properly rounded 32-bit integer corresponding to the
 103 input.  If `zSign' is 1, the input is negated before being converted to an
 104 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 105 is simply rounded to an integer, with the inexact exception raised if the
 106 input cannot be represented exactly as an integer.  However, if the fixed-
 107 point input is too large, the invalid exception is raised and the largest
 108 positive or negative integer is returned.
 109 -------------------------------------------------------------------------------
 110 */
 111 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 112 {
 113     int8 roundingMode;
 114     flag roundNearestEven;
 115     int8 roundIncrement, roundBits;
 116     int32 z;
 117
 118     roundingMode = float_rounding_mode;
 119     roundNearestEven = ( roundingMode == float_round_nearest_even );
 120     roundIncrement = 0x40;
 121     if ( ! roundNearestEven ) {
 122         if ( roundingMode == float_round_to_zero ) {
 123             roundIncrement = 0;
 124         }
 125         else {
 126             roundIncrement = 0x7F;
 127             if ( zSign ) {
 128                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 129             }
 130             else {
 131                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 132             }
 133         }
 134     }
 135     roundBits = absZ & 0x7F;
 136     absZ = ( absZ + roundIncrement )>>7;
 137     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 138     z = absZ;
 139     if ( zSign ) z = - z;
 140     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 141         float_raise( float_flag_invalid );
 142         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 143     }
 144     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 145     return z;
 146
 147 }
 148
 149 /*
 150 -------------------------------------------------------------------------------
 151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 and returns the properly rounded 64-bit integer corresponding to the input.
 154 If `zSign' is 1, the input is negated before being converted to an integer.
 155 Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 the inexact exception raised if the input cannot be represented exactly as
 157 an integer.  However, if the fixed-point input is too large, the invalid
 158 exception is raised and the largest positive or negative integer is
 159 returned.
 160 -------------------------------------------------------------------------------
 161 */
 162 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = float_rounding_mode;
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (sbits64) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid );
 194         return
 195               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 199     return z;
 200
 201 }
 202 #endif
 203
 204 /*
 205 -------------------------------------------------------------------------------
 206 Returns the fraction bits of the single-precision floating-point value `a'.
 207 -------------------------------------------------------------------------------
 208 */
 209 INLINE bits32 extractFloat32Frac( float32 a )
 210 {
 211
 212     return a & 0x007FFFFF;
 213
 214 }
 215
 216 /*
 217 -------------------------------------------------------------------------------
 218 Returns the exponent bits of the single-precision floating-point value `a'.
 219 -------------------------------------------------------------------------------
 220 */
 221 INLINE int16 extractFloat32Exp( float32 a )
 222 {
 223
 224     return ( a>>23 ) & 0xFF;
 225
 226 }
 227
 228 /*
 229 -------------------------------------------------------------------------------
 230 Returns the sign bit of the single-precision floating-point value `a'.
 231 -------------------------------------------------------------------------------
 232 */
 233 INLINE flag extractFloat32Sign( float32 a )
 234 {
 235
 236     return a>>31;
 237
 238 }
 239
 240 /*
 241 -------------------------------------------------------------------------------
 242 Normalizes the subnormal single-precision floating-point value represented
 243 by the denormalized significand `aSig'.  The normalized exponent and
 244 significand are stored at the locations pointed to by `zExpPtr' and
 245 `zSigPtr', respectively.
 246 -------------------------------------------------------------------------------
 247 */
 248 static void
 249  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 250 {
 251     int8 shiftCount;
 252
 253     shiftCount = countLeadingZeros32( aSig ) - 8;
 254     *zSigPtr = aSig<<shiftCount;
 255     *zExpPtr = 1 - shiftCount;
 256
 257 }
 258
 259 /*
 260 -------------------------------------------------------------------------------
 261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 262 single-precision floating-point value, returning the result.  After being
 263 shifted into the proper positions, the three fields are simply added
 264 together to form the result.  This means that any integer portion of `zSig'
 265 will be added into the exponent.  Since a properly normalized significand
 266 will have an integer portion equal to 1, the `zExp' input should be 1 less
 267 than the desired result exponent whenever `zSig' is a complete, normalized
 268 significand.
 269 -------------------------------------------------------------------------------
 270 */
 271 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 272 {
 273
 274     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 275
 276 }
 277
 278 /*
 279 -------------------------------------------------------------------------------
 280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 281 and significand `zSig', and returns the proper single-precision floating-
 282 point value corresponding to the abstract input.  Ordinarily, the abstract
 283 value is simply rounded and packed into the single-precision format, with
 284 the inexact exception raised if the abstract input cannot be represented
 285 exactly.  However, if the abstract value is too large, the overflow and
 286 inexact exceptions are raised and an infinity or maximal finite value is
 287 returned.  If the abstract value is too small, the input value is rounded to
 288 a subnormal number, and the underflow and inexact exceptions are raised if
 289 the abstract input cannot be represented exactly as a subnormal single-
 290 precision floating-point number.
 291     The input significand `zSig' has its binary point between bits 30
 292 and 29, which is 7 bits to the left of the usual location.  This shifted
 293 significand must be normalized or smaller.  If `zSig' is not normalized,
 294 `zExp' must be 0; in that case, the result returned is a subnormal number,
 295 and it must not require rounding.  In the usual case that `zSig' is
 296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 297 The handling of underflow and overflow follows the IEC/IEEE Standard for
 298 Binary Floating-Point Arithmetic.
 299 -------------------------------------------------------------------------------
 300 */
 301 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 302 {
 303     int8 roundingMode;
 304     flag roundNearestEven;
 305     int8 roundIncrement, roundBits;
 306     flag isTiny;
 307
 308     roundingMode = float_rounding_mode;
 309     roundNearestEven = ( roundingMode == float_round_nearest_even );
 310     roundIncrement = 0x40;
 311     if ( ! roundNearestEven ) {
 312         if ( roundingMode == float_round_to_zero ) {
 313             roundIncrement = 0;
 314         }
 315         else {
 316             roundIncrement = 0x7F;
 317             if ( zSign ) {
 318                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 319             }
 320             else {
 321                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 322             }
 323         }
 324     }
 325     roundBits = zSig & 0x7F;
 326     if ( 0xFD <= (bits16) zExp ) {
 327         if (    ( 0xFD < zExp )
 328              || (    ( zExp == 0xFD )
 329                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 330            ) {
 331             float_raise( float_flag_overflow | float_flag_inexact );
 332             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 333         }
 334         if ( zExp < 0 ) {
 335             isTiny =
 336                    ( float_detect_tininess == float_tininess_before_rounding )
 337                 || ( zExp < -1 )
 338                 || ( zSig + roundIncrement < 0x80000000 );
 339             shift32RightJamming( zSig, - zExp, &zSig );
 340             zExp = 0;
 341             roundBits = zSig & 0x7F;
 342             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 343         }
 344     }
 345     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 346     zSig = ( zSig + roundIncrement )>>7;
 347     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 348     if ( zSig == 0 ) zExp = 0;
 349     return packFloat32( zSign, zExp, zSig );
 350
 351 }
 352
 353 /*
 354 -------------------------------------------------------------------------------
 355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 356 and significand `zSig', and returns the proper single-precision floating-
 357 point value corresponding to the abstract input.  This routine is just like
 358 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 360 floating-point exponent.
 361 -------------------------------------------------------------------------------
 362 */
 363 static float32
 364  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 365 {
 366     int8 shiftCount;
 367
 368     shiftCount = countLeadingZeros32( zSig ) - 1;
 369     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 370
 371 }
 372
 373 /*
 374 -------------------------------------------------------------------------------
 375 Returns the fraction bits of the double-precision floating-point value `a'.
 376 -------------------------------------------------------------------------------
 377 */
 378 INLINE bits64 extractFloat64Frac( float64 a )
 379 {
 380
 381     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 382
 383 }
 384
 385 /*
 386 -------------------------------------------------------------------------------
 387 Returns the exponent bits of the double-precision floating-point value `a'.
 388 -------------------------------------------------------------------------------
 389 */
 390 INLINE int16 extractFloat64Exp( float64 a )
 391 {
 392
 393     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
 394
 395 }
 396
 397 /*
 398 -------------------------------------------------------------------------------
 399 Returns the sign bit of the double-precision floating-point value `a'.
 400 -------------------------------------------------------------------------------
 401 */
 402 INLINE flag extractFloat64Sign( float64 a )
 403 {
 404
 405     return FLOAT64_DEMANGLE(a)>>63;
 406
 407 }
 408
 409 /*
 410 -------------------------------------------------------------------------------
 411 Normalizes the subnormal double-precision floating-point value represented
 412 by the denormalized significand `aSig'.  The normalized exponent and
 413 significand are stored at the locations pointed to by `zExpPtr' and
 414 `zSigPtr', respectively.
 415 -------------------------------------------------------------------------------
 416 */
 417 static void
 418  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 419 {
 420     int8 shiftCount;
 421
 422     shiftCount = countLeadingZeros64( aSig ) - 11;
 423     *zSigPtr = aSig<<shiftCount;
 424     *zExpPtr = 1 - shiftCount;
 425
 426 }
 427
 428 /*
 429 -------------------------------------------------------------------------------
 430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 431 double-precision floating-point value, returning the result.  After being
 432 shifted into the proper positions, the three fields are simply added
 433 together to form the result.  This means that any integer portion of `zSig'
 434 will be added into the exponent.  Since a properly normalized significand
 435 will have an integer portion equal to 1, the `zExp' input should be 1 less
 436 than the desired result exponent whenever `zSig' is a complete, normalized
 437 significand.
 438 -------------------------------------------------------------------------------
 439 */
 440 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 441 {
 442
 443     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 444                            ( ( (bits64) zExp )<<52 ) + zSig );
 445
 446 }
 447
 448 /*
 449 -------------------------------------------------------------------------------
 450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 451 and significand `zSig', and returns the proper double-precision floating-
 452 point value corresponding to the abstract input.  Ordinarily, the abstract
 453 value is simply rounded and packed into the double-precision format, with
 454 the inexact exception raised if the abstract input cannot be represented
 455 exactly.  However, if the abstract value is too large, the overflow and
 456 inexact exceptions are raised and an infinity or maximal finite value is
 457 returned.  If the abstract value is too small, the input value is rounded to
 458 a subnormal number, and the underflow and inexact exceptions are raised if
 459 the abstract input cannot be represented exactly as a subnormal double-
 460 precision floating-point number.
 461     The input significand `zSig' has its binary point between bits 62
 462 and 61, which is 10 bits to the left of the usual location.  This shifted
 463 significand must be normalized or smaller.  If `zSig' is not normalized,
 464 `zExp' must be 0; in that case, the result returned is a subnormal number,
 465 and it must not require rounding.  In the usual case that `zSig' is
 466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 467 The handling of underflow and overflow follows the IEC/IEEE Standard for
 468 Binary Floating-Point Arithmetic.
 469 -------------------------------------------------------------------------------
 470 */
 471 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 472 {
 473     int8 roundingMode;
 474     flag roundNearestEven;
 475     int16 roundIncrement, roundBits;
 476     flag isTiny;
 477
 478     roundingMode = float_rounding_mode;
 479     roundNearestEven = ( roundingMode == float_round_nearest_even );
 480     roundIncrement = 0x200;
 481     if ( ! roundNearestEven ) {
 482         if ( roundingMode == float_round_to_zero ) {
 483             roundIncrement = 0;
 484         }
 485         else {
 486             roundIncrement = 0x3FF;
 487             if ( zSign ) {
 488                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 489             }
 490             else {
 491                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 492             }
 493         }
 494     }
 495     roundBits = zSig & 0x3FF;
 496     if ( 0x7FD <= (bits16) zExp ) {
 497         if (    ( 0x7FD < zExp )
 498              || (    ( zExp == 0x7FD )
 499                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 500            ) {
 501             float_raise( float_flag_overflow | float_flag_inexact );
 502             return FLOAT64_MANGLE(
 503                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 504                 ( roundIncrement == 0 ));
 505         }
 506         if ( zExp < 0 ) {
 507             isTiny =
 508                    ( float_detect_tininess == float_tininess_before_rounding )
 509                 || ( zExp < -1 )
 510                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 511             shift64RightJamming( zSig, - zExp, &zSig );
 512             zExp = 0;
 513             roundBits = zSig & 0x3FF;
 514             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 515         }
 516     }
 517     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 518     zSig = ( zSig + roundIncrement )>>10;
 519     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 520     if ( zSig == 0 ) zExp = 0;
 521     return packFloat64( zSign, zExp, zSig );
 522
 523 }
 524
 525 /*
 526 -------------------------------------------------------------------------------
 527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 528 and significand `zSig', and returns the proper double-precision floating-
 529 point value corresponding to the abstract input.  This routine is just like
 530 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 532 floating-point exponent.
 533 -------------------------------------------------------------------------------
 534 */
 535 static float64
 536  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 537 {
 538     int8 shiftCount;
 539
 540     shiftCount = countLeadingZeros64( zSig ) - 1;
 541     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 542
 543 }
 544
 545 #ifdef FLOATX80
 546
 547 /*
 548 -------------------------------------------------------------------------------
 549 Returns the fraction bits of the extended double-precision floating-point
 550 value `a'.
 551 -------------------------------------------------------------------------------
 552 */
 553 INLINE bits64 extractFloatx80Frac( floatx80 a )
 554 {
 555
 556     return a.low;
 557
 558 }
 559
 560 /*
 561 -------------------------------------------------------------------------------
 562 Returns the exponent bits of the extended double-precision floating-point
 563 value `a'.
 564 -------------------------------------------------------------------------------
 565 */
 566 INLINE int32 extractFloatx80Exp( floatx80 a )
 567 {
 568
 569     return a.high & 0x7FFF;
 570
 571 }
 572
 573 /*
 574 -------------------------------------------------------------------------------
 575 Returns the sign bit of the extended double-precision floating-point value
 576 `a'.
 577 -------------------------------------------------------------------------------
 578 */
 579 INLINE flag extractFloatx80Sign( floatx80 a )
 580 {
 581
 582     return a.high>>15;
 583
 584 }
 585
 586 /*
 587 -------------------------------------------------------------------------------
 588 Normalizes the subnormal extended double-precision floating-point value
 589 represented by the denormalized significand `aSig'.  The normalized exponent
 590 and significand are stored at the locations pointed to by `zExpPtr' and
 591 `zSigPtr', respectively.
 592 -------------------------------------------------------------------------------
 593 */
 594 static void
 595  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 596 {
 597     int8 shiftCount;
 598
 599     shiftCount = countLeadingZeros64( aSig );
 600     *zSigPtr = aSig<<shiftCount;
 601     *zExpPtr = 1 - shiftCount;
 602
 603 }
 604
 605 /*
 606 -------------------------------------------------------------------------------
 607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 608 extended double-precision floating-point value, returning the result.
 609 -------------------------------------------------------------------------------
 610 */
 611 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 612 {
 613     floatx80 z;
 614
 615     z.low = zSig;
 616     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 617     return z;
 618
 619 }
 620
 621 /*
 622 -------------------------------------------------------------------------------
 623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 624 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 625 and returns the proper extended double-precision floating-point value
 626 corresponding to the abstract input.  Ordinarily, the abstract value is
 627 rounded and packed into the extended double-precision format, with the
 628 inexact exception raised if the abstract input cannot be represented
 629 exactly.  However, if the abstract value is too large, the overflow and
 630 inexact exceptions are raised and an infinity or maximal finite value is
 631 returned.  If the abstract value is too small, the input value is rounded to
 632 a subnormal number, and the underflow and inexact exceptions are raised if
 633 the abstract input cannot be represented exactly as a subnormal extended
 634 double-precision floating-point number.
 635     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 636 number of bits as single or double precision, respectively.  Otherwise, the
 637 result is rounded to the full precision of the extended double-precision
 638 format.
 639     The input significand must be normalized or smaller.  If the input
 640 significand is not normalized, `zExp' must be 0; in that case, the result
 641 returned is a subnormal number, and it must not require rounding.  The
 642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 643 Floating-Point Arithmetic.
 644 -------------------------------------------------------------------------------
 645 */
 646 static floatx80
 647  roundAndPackFloatx80(
 648      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 649  )
 650 {
 651     int8 roundingMode;
 652     flag roundNearestEven, increment, isTiny;
 653     int64 roundIncrement, roundMask, roundBits;
 654
 655     roundingMode = float_rounding_mode;
 656     roundNearestEven = ( roundingMode == float_round_nearest_even );
 657     if ( roundingPrecision == 80 ) goto precision80;
 658     if ( roundingPrecision == 64 ) {
 659         roundIncrement = LIT64( 0x0000000000000400 );
 660         roundMask = LIT64( 0x00000000000007FF );
 661     }
 662     else if ( roundingPrecision == 32 ) {
 663         roundIncrement = LIT64( 0x0000008000000000 );
 664         roundMask = LIT64( 0x000000FFFFFFFFFF );
 665     }
 666     else {
 667         goto precision80;
 668     }
 669     zSig0 |= ( zSig1 != 0 );
 670     if ( ! roundNearestEven ) {
 671         if ( roundingMode == float_round_to_zero ) {
 672             roundIncrement = 0;
 673         }
 674         else {
 675             roundIncrement = roundMask;
 676             if ( zSign ) {
 677                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 678             }
 679             else {
 680                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 681             }
 682         }
 683     }
 684     roundBits = zSig0 & roundMask;
 685     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 686         if (    ( 0x7FFE < zExp )
 687              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 688            ) {
 689             goto overflow;
 690         }
 691         if ( zExp <= 0 ) {
 692             isTiny =
 693                    ( float_detect_tininess == float_tininess_before_rounding )
 694                 || ( zExp < 0 )
 695                 || ( zSig0 <= zSig0 + roundIncrement );
 696             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 697             zExp = 0;
 698             roundBits = zSig0 & roundMask;
 699             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 700             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 701             zSig0 += roundIncrement;
 702             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 703             roundIncrement = roundMask + 1;
 704             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 705                 roundMask |= roundIncrement;
 706             }
 707             zSig0 &= ~ roundMask;
 708             return packFloatx80( zSign, zExp, zSig0 );
 709         }
 710     }
 711     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 712     zSig0 += roundIncrement;
 713     if ( zSig0 < roundIncrement ) {
 714         ++zExp;
 715         zSig0 = LIT64( 0x8000000000000000 );
 716     }
 717     roundIncrement = roundMask + 1;
 718     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 719         roundMask |= roundIncrement;
 720     }
 721     zSig0 &= ~ roundMask;
 722     if ( zSig0 == 0 ) zExp = 0;
 723     return packFloatx80( zSign, zExp, zSig0 );
 724  precision80:
 725     increment = ( (sbits64) zSig1 < 0 );
 726     if ( ! roundNearestEven ) {
 727         if ( roundingMode == float_round_to_zero ) {
 728             increment = 0;
 729         }
 730         else {
 731             if ( zSign ) {
 732                 increment = ( roundingMode == float_round_down ) && zSig1;
 733             }
 734             else {
 735                 increment = ( roundingMode == float_round_up ) && zSig1;
 736             }
 737         }
 738     }
 739     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 740         if (    ( 0x7FFE < zExp )
 741              || (    ( zExp == 0x7FFE )
 742                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 743                   && increment
 744                 )
 745            ) {
 746             roundMask = 0;
 747  overflow:
 748             float_raise( float_flag_overflow | float_flag_inexact );
 749             if (    ( roundingMode == float_round_to_zero )
 750                  || ( zSign && ( roundingMode == float_round_up ) )
 751                  || ( ! zSign && ( roundingMode == float_round_down ) )
 752                ) {
 753                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 754             }
 755             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 756         }
 757         if ( zExp <= 0 ) {
 758             isTiny =
 759                    ( float_detect_tininess == float_tininess_before_rounding )
 760                 || ( zExp < 0 )
 761                 || ! increment
 762                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 763             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 764             zExp = 0;
 765             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 766             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 767             if ( roundNearestEven ) {
 768                 increment = ( (sbits64) zSig1 < 0 );
 769             }
 770             else {
 771                 if ( zSign ) {
 772                     increment = ( roundingMode == float_round_down ) && zSig1;
 773                 }
 774                 else {
 775                     increment = ( roundingMode == float_round_up ) && zSig1;
 776                 }
 777             }
 778             if ( increment ) {
 779                 ++zSig0;
 780                 zSig0 &=
 781                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 782                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 783             }
 784             return packFloatx80( zSign, zExp, zSig0 );
 785         }
 786     }
 787     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 788     if ( increment ) {
 789         ++zSig0;
 790         if ( zSig0 == 0 ) {
 791             ++zExp;
 792             zSig0 = LIT64( 0x8000000000000000 );
 793         }
 794         else {
 795             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 796         }
 797     }
 798     else {
 799         if ( zSig0 == 0 ) zExp = 0;
 800     }
 801     return packFloatx80( zSign, zExp, zSig0 );
 802
 803 }
 804
 805 /*
 806 -------------------------------------------------------------------------------
 807 Takes an abstract floating-point value having sign `zSign', exponent
 808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 809 and returns the proper extended double-precision floating-point value
 810 corresponding to the abstract input.  This routine is just like
 811 `roundAndPackFloatx80' except that the input significand does not have to be
 812 normalized.
 813 -------------------------------------------------------------------------------
 814 */
 815 static floatx80
 816  normalizeRoundAndPackFloatx80(
 817      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 818  )
 819 {
 820     int8 shiftCount;
 821
 822     if ( zSig0 == 0 ) {
 823         zSig0 = zSig1;
 824         zSig1 = 0;
 825         zExp -= 64;
 826     }
 827     shiftCount = countLeadingZeros64( zSig0 );
 828     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 829     zExp -= shiftCount;
 830     return
 831         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 832
 833 }
 834
 835 #endif
 836
 837 #ifdef FLOAT128
 838
 839 /*
 840 -------------------------------------------------------------------------------
 841 Returns the least-significant 64 fraction bits of the quadruple-precision
 842 floating-point value `a'.
 843 -------------------------------------------------------------------------------
 844 */
 845 INLINE bits64 extractFloat128Frac1( float128 a )
 846 {
 847
 848     return a.low;
 849
 850 }
 851
 852 /*
 853 -------------------------------------------------------------------------------
 854 Returns the most-significant 48 fraction bits of the quadruple-precision
 855 floating-point value `a'.
 856 -------------------------------------------------------------------------------
 857 */
 858 INLINE bits64 extractFloat128Frac0( float128 a )
 859 {
 860
 861     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 862
 863 }
 864
 865 /*
 866 -------------------------------------------------------------------------------
 867 Returns the exponent bits of the quadruple-precision floating-point value
 868 `a'.
 869 -------------------------------------------------------------------------------
 870 */
 871 INLINE int32 extractFloat128Exp( float128 a )
 872 {
 873
 874     return ( a.high>>48 ) & 0x7FFF;
 875
 876 }
 877
 878 /*
 879 -------------------------------------------------------------------------------
 880 Returns the sign bit of the quadruple-precision floating-point value `a'.
 881 -------------------------------------------------------------------------------
 882 */
 883 INLINE flag extractFloat128Sign( float128 a )
 884 {
 885
 886     return a.high>>63;
 887
 888 }
 889
 890 /*
 891 -------------------------------------------------------------------------------
 892 Normalizes the subnormal quadruple-precision floating-point value
 893 represented by the denormalized significand formed by the concatenation of
 894 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 895 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 896 significand are stored at the location pointed to by `zSig0Ptr', and the
 897 least significant 64 bits of the normalized significand are stored at the
 898 location pointed to by `zSig1Ptr'.
 899 -------------------------------------------------------------------------------
 900 */
 901 static void
 902  normalizeFloat128Subnormal(
 903      bits64 aSig0,
 904      bits64 aSig1,
 905      int32 *zExpPtr,
 906      bits64 *zSig0Ptr,
 907      bits64 *zSig1Ptr
 908  )
 909 {
 910     int8 shiftCount;
 911
 912     if ( aSig0 == 0 ) {
 913         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 914         if ( shiftCount < 0 ) {
 915             *zSig0Ptr = aSig1>>( - shiftCount );
 916             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 917         }
 918         else {
 919             *zSig0Ptr = aSig1<<shiftCount;
 920             *zSig1Ptr = 0;
 921         }
 922         *zExpPtr = - shiftCount - 63;
 923     }
 924     else {
 925         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 926         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 927         *zExpPtr = 1 - shiftCount;
 928     }
 929
 930 }
 931
 932 /*
 933 -------------------------------------------------------------------------------
 934 Packs the sign `zSign', the exponent `zExp', and the significand formed
 935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 936 floating-point value, returning the result.  After being shifted into the
 937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 938 added together to form the most significant 32 bits of the result.  This
 939 means that any integer portion of `zSig0' will be added into the exponent.
 940 Since a properly normalized significand will have an integer portion equal
 941 to 1, the `zExp' input should be 1 less than the desired result exponent
 942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 943 significand.
 944 -------------------------------------------------------------------------------
 945 */
 946 INLINE float128
 947  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 948 {
 949     float128 z;
 950
 951     z.low = zSig1;
 952     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 953     return z;
 954
 955 }
 956
 957 /*
 958 -------------------------------------------------------------------------------
 959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 960 and extended significand formed by the concatenation of `zSig0', `zSig1',
 961 and `zSig2', and returns the proper quadruple-precision floating-point value
 962 corresponding to the abstract input.  Ordinarily, the abstract value is
 963 simply rounded and packed into the quadruple-precision format, with the
 964 inexact exception raised if the abstract input cannot be represented
 965 exactly.  However, if the abstract value is too large, the overflow and
 966 inexact exceptions are raised and an infinity or maximal finite value is
 967 returned.  If the abstract value is too small, the input value is rounded to
 968 a subnormal number, and the underflow and inexact exceptions are raised if
 969 the abstract input cannot be represented exactly as a subnormal quadruple-
 970 precision floating-point number.
 971     The input significand must be normalized or smaller.  If the input
 972 significand is not normalized, `zExp' must be 0; in that case, the result
 973 returned is a subnormal number, and it must not require rounding.  In the
 974 usual case that the input significand is normalized, `zExp' must be 1 less
 975 than the ``true'' floating-point exponent.  The handling of underflow and
 976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 977 -------------------------------------------------------------------------------
 978 */
 979 static float128
 980  roundAndPackFloat128(
 981      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 982 {
 983     int8 roundingMode;
 984     flag roundNearestEven, increment, isTiny;
 985
 986     roundingMode = float_rounding_mode;
 987     roundNearestEven = ( roundingMode == float_round_nearest_even );
 988     increment = ( (sbits64) zSig2 < 0 );
 989     if ( ! roundNearestEven ) {
 990         if ( roundingMode == float_round_to_zero ) {
 991             increment = 0;
 992         }
 993         else {
 994             if ( zSign ) {
 995                 increment = ( roundingMode == float_round_down ) && zSig2;
 996             }
 997             else {
 998                 increment = ( roundingMode == float_round_up ) && zSig2;
 999             }
1000         }
1001     }
1002     if ( 0x7FFD <= (bits32) zExp ) {
1003         if (    ( 0x7FFD < zExp )
1004              || (    ( zExp == 0x7FFD )
1005                   && eq128(
1006                          LIT64( 0x0001FFFFFFFFFFFF ),
1007                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1008                          zSig0,
1009                          zSig1
1010                      )
1011                   && increment
1012                 )
1013            ) {
1014             float_raise( float_flag_overflow | float_flag_inexact );
1015             if (    ( roundingMode == float_round_to_zero )
1016                  || ( zSign && ( roundingMode == float_round_up ) )
1017                  || ( ! zSign && ( roundingMode == float_round_down ) )
1018                ) {
1019                 return
1020                     packFloat128(
1021                         zSign,
1022                         0x7FFE,
1023                         LIT64( 0x0000FFFFFFFFFFFF ),
1024                         LIT64( 0xFFFFFFFFFFFFFFFF )
1025                     );
1026             }
1027             return packFloat128( zSign, 0x7FFF, 0, 0 );
1028         }
1029         if ( zExp < 0 ) {
1030             isTiny =
1031                    ( float_detect_tininess == float_tininess_before_rounding )
1032                 || ( zExp < -1 )
1033                 || ! increment
1034                 || lt128(
1035                        zSig0,
1036                        zSig1,
1037                        LIT64( 0x0001FFFFFFFFFFFF ),
1038                        LIT64( 0xFFFFFFFFFFFFFFFF )
1039                    );
1040             shift128ExtraRightJamming(
1041                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042             zExp = 0;
1043             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044             if ( roundNearestEven ) {
1045                 increment = ( (sbits64) zSig2 < 0 );
1046             }
1047             else {
1048                 if ( zSign ) {
1049                     increment = ( roundingMode == float_round_down ) && zSig2;
1050                 }
1051                 else {
1052                     increment = ( roundingMode == float_round_up ) && zSig2;
1053                 }
1054             }
1055         }
1056     }
1057     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058     if ( increment ) {
1059         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061     }
1062     else {
1063         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064     }
1065     return packFloat128( zSign, zExp, zSig0, zSig1 );
1066
1067 }
1068
1069 /*
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072 and significand formed by the concatenation of `zSig0' and `zSig1', and
1073 returns the proper quadruple-precision floating-point value corresponding
1074 to the abstract input.  This routine is just like `roundAndPackFloat128'
1075 except that the input significand has fewer bits and does not have to be
1076 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1077 point exponent.
1078 -------------------------------------------------------------------------------
1079 */
1080 static float128
1081  normalizeRoundAndPackFloat128(
1082      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1083 {
1084     int8 shiftCount;
1085     bits64 zSig2;
1086
1087     if ( zSig0 == 0 ) {
1088         zSig0 = zSig1;
1089         zSig1 = 0;
1090         zExp -= 64;
1091     }
1092     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093     if ( 0 <= shiftCount ) {
1094         zSig2 = 0;
1095         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1096     }
1097     else {
1098         shift128ExtraRightJamming(
1099             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1100     }
1101     zExp -= shiftCount;
1102     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1103
1104 }
1105
1106 #endif
1107
1108 /*
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format.  The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1114 */
1115 float32 int32_to_float32( int32 a )
1116 {
1117     flag zSign;
1118
1119     if ( a == 0 ) return 0;
1120     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121     zSign = ( a < 0 );
1122     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1123
1124 }
1125
1126 /*
1127 -------------------------------------------------------------------------------
1128 Returns the result of converting the 32-bit two's complement integer `a'
1129 to the double-precision floating-point format.  The conversion is performed
1130 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1131 -------------------------------------------------------------------------------
1132 */
1133 float64 int32_to_float64( int32 a )
1134 {
1135     flag zSign;
1136     uint32 absA;
1137     int8 shiftCount;
1138     bits64 zSig;
1139
1140     if ( a == 0 ) return 0;
1141     zSign = ( a < 0 );
1142     absA = zSign ? - a : a;
1143     shiftCount = countLeadingZeros32( absA ) + 21;
1144     zSig = absA;
1145     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1146
1147 }
1148
1149 #ifdef FLOATX80
1150
1151 /*
1152 -------------------------------------------------------------------------------
1153 Returns the result of converting the 32-bit two's complement integer `a'
1154 to the extended double-precision floating-point format.  The conversion
1155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 Arithmetic.
1157 -------------------------------------------------------------------------------
1158 */
1159 floatx80 int32_to_floatx80( int32 a )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     bits64 zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*
1180 -------------------------------------------------------------------------------
1181 Returns the result of converting the 32-bit two's complement integer `a' to
1182 the quadruple-precision floating-point format.  The conversion is performed
1183 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 -------------------------------------------------------------------------------
1185 */
1186 float128 int32_to_float128( int32 a )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     bits64 zSig0;
1192
1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 17;
1197     zSig0 = absA;
1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200 }
1201
1202 #endif
1203
1204 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1205 /*
1206 -------------------------------------------------------------------------------
1207 Returns the result of converting the 64-bit two's complement integer `a'
1208 to the single-precision floating-point format.  The conversion is performed
1209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1210 -------------------------------------------------------------------------------
1211 */
1212 float32 int64_to_float32( int64 a )
1213 {
1214     flag zSign;
1215     uint64 absA;
1216     int8 shiftCount;
1217
1218     if ( a == 0 ) return 0;
1219     zSign = ( a < 0 );
1220     absA = zSign ? - a : a;
1221     shiftCount = countLeadingZeros64( absA ) - 40;
1222     if ( 0 <= shiftCount ) {
1223         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1224     }
1225     else {
1226         shiftCount += 7;
1227         if ( shiftCount < 0 ) {
1228             shift64RightJamming( absA, - shiftCount, &absA );
1229         }
1230         else {
1231             absA <<= shiftCount;
1232         }
1233         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1234     }
1235
1236 }
1237
1238 /*
1239 -------------------------------------------------------------------------------
1240 Returns the result of converting the 64-bit two's complement integer `a'
1241 to the double-precision floating-point format.  The conversion is performed
1242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 -------------------------------------------------------------------------------
1244 */
1245 float64 int64_to_float64( int64 a )
1246 {
1247     flag zSign;
1248
1249     if ( a == 0 ) return 0;
1250     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1251         return packFloat64( 1, 0x43E, 0 );
1252     }
1253     zSign = ( a < 0 );
1254     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1255
1256 }
1257
1258 #ifdef FLOATX80
1259
1260 /*
1261 -------------------------------------------------------------------------------
1262 Returns the result of converting the 64-bit two's complement integer `a'
1263 to the extended double-precision floating-point format.  The conversion
1264 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1265 Arithmetic.
1266 -------------------------------------------------------------------------------
1267 */
1268 floatx80 int64_to_floatx80( int64 a )
1269 {
1270     flag zSign;
1271     uint64 absA;
1272     int8 shiftCount;
1273
1274     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1275     zSign = ( a < 0 );
1276     absA = zSign ? - a : a;
1277     shiftCount = countLeadingZeros64( absA );
1278     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1279
1280 }
1281
1282 #endif
1283
1284 #endif /* !SOFTFLOAT_FOR_GCC */
1285
1286 #ifdef FLOAT128
1287
1288 /*
1289 -------------------------------------------------------------------------------
1290 Returns the result of converting the 64-bit two's complement integer `a' to
1291 the quadruple-precision floating-point format.  The conversion is performed
1292 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 -------------------------------------------------------------------------------
1294 */
1295 float128 int64_to_float128( int64 a )
1296 {
1297     flag zSign;
1298     uint64 absA;
1299     int8 shiftCount;
1300     int32 zExp;
1301     bits64 zSig0, zSig1;
1302
1303     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1304     zSign = ( a < 0 );
1305     absA = zSign ? - a : a;
1306     shiftCount = countLeadingZeros64( absA ) + 49;
1307     zExp = 0x406E - shiftCount;
1308     if ( 64 <= shiftCount ) {
1309         zSig1 = 0;
1310         zSig0 = absA;
1311         shiftCount -= 64;
1312     }
1313     else {
1314         zSig1 = absA;
1315         zSig0 = 0;
1316     }
1317     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1318     return packFloat128( zSign, zExp, zSig0, zSig1 );
1319
1320 }
1321
1322 #endif
1323
1324 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1325 /*
1326 -------------------------------------------------------------------------------
1327 Returns the result of converting the single-precision floating-point value
1328 `a' to the 32-bit two's complement integer format.  The conversion is
1329 performed according to the IEC/IEEE Standard for Binary Floating-Point
1330 Arithmetic---which means in particular that the conversion is rounded
1331 according to the current rounding mode.  If `a' is a NaN, the largest
1332 positive integer is returned.  Otherwise, if the conversion overflows, the
1333 largest integer with the same sign as `a' is returned.
1334 -------------------------------------------------------------------------------
1335 */
1336 int32 float32_to_int32( float32 a )
1337 {
1338     flag aSign;
1339     int16 aExp, shiftCount;
1340     bits32 aSig;
1341     bits64 aSig64;
1342
1343     aSig = extractFloat32Frac( a );
1344     aExp = extractFloat32Exp( a );
1345     aSign = extractFloat32Sign( a );
1346     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1347     if ( aExp ) aSig |= 0x00800000;
1348     shiftCount = 0xAF - aExp;
1349     aSig64 = aSig;
1350     aSig64 <<= 32;
1351     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1352     return roundAndPackInt32( aSign, aSig64 );
1353
1354 }
1355 #endif /* !SOFTFLOAT_FOR_GCC */
1356
1357 /*
1358 -------------------------------------------------------------------------------
1359 Returns the result of converting the single-precision floating-point value
1360 `a' to the 32-bit two's complement integer format.  The conversion is
1361 performed according to the IEC/IEEE Standard for Binary Floating-Point
1362 Arithmetic, except that the conversion is always rounded toward zero.
1363 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1364 the conversion overflows, the largest integer with the same sign as `a' is
1365 returned.
1366 -------------------------------------------------------------------------------
1367 */
1368 int32 float32_to_int32_round_to_zero( float32 a )
1369 {
1370     flag aSign;
1371     int16 aExp, shiftCount;
1372     bits32 aSig;
1373     int32 z;
1374
1375     aSig = extractFloat32Frac( a );
1376     aExp = extractFloat32Exp( a );
1377     aSign = extractFloat32Sign( a );
1378     shiftCount = aExp - 0x9E;
1379     if ( 0 <= shiftCount ) {
1380         if ( a != 0xCF000000 ) {
1381             float_raise( float_flag_invalid );
1382             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1383         }
1384         return (sbits32) 0x80000000;
1385     }
1386     else if ( aExp <= 0x7E ) {
1387         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1388         return 0;
1389     }
1390     aSig = ( aSig | 0x00800000 )<<8;
1391     z = aSig>>( - shiftCount );
1392     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1393         float_exception_flags |= float_flag_inexact;
1394     }
1395     if ( aSign ) z = - z;
1396     return z;
1397
1398 }
1399
1400 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1401 /*
1402 -------------------------------------------------------------------------------
1403 Returns the result of converting the single-precision floating-point value
1404 `a' to the 64-bit two's complement integer format.  The conversion is
1405 performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 Arithmetic---which means in particular that the conversion is rounded
1407 according to the current rounding mode.  If `a' is a NaN, the largest
1408 positive integer is returned.  Otherwise, if the conversion overflows, the
1409 largest integer with the same sign as `a' is returned.
1410 -------------------------------------------------------------------------------
1411 */
1412 int64 float32_to_int64( float32 a )
1413 {
1414     flag aSign;
1415     int16 aExp, shiftCount;
1416     bits32 aSig;
1417     bits64 aSig64, aSigExtra;
1418
1419     aSig = extractFloat32Frac( a );
1420     aExp = extractFloat32Exp( a );
1421     aSign = extractFloat32Sign( a );
1422     shiftCount = 0xBE - aExp;
1423     if ( shiftCount < 0 ) {
1424         float_raise( float_flag_invalid );
1425         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1426             return LIT64( 0x7FFFFFFFFFFFFFFF );
1427         }
1428         return (sbits64) LIT64( 0x8000000000000000 );
1429     }
1430     if ( aExp ) aSig |= 0x00800000;
1431     aSig64 = aSig;
1432     aSig64 <<= 40;
1433     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1434     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1435
1436 }
1437
1438 /*
1439 -------------------------------------------------------------------------------
1440 Returns the result of converting the single-precision floating-point value
1441 `a' to the 64-bit two's complement integer format.  The conversion is
1442 performed according to the IEC/IEEE Standard for Binary Floating-Point
1443 Arithmetic, except that the conversion is always rounded toward zero.  If
1444 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1445 conversion overflows, the largest integer with the same sign as `a' is
1446 returned.
1447 -------------------------------------------------------------------------------
1448 */
1449 int64 float32_to_int64_round_to_zero( float32 a )
1450 {
1451     flag aSign;
1452     int16 aExp, shiftCount;
1453     bits32 aSig;
1454     bits64 aSig64;
1455     int64 z;
1456
1457     aSig = extractFloat32Frac( a );
1458     aExp = extractFloat32Exp( a );
1459     aSign = extractFloat32Sign( a );
1460     shiftCount = aExp - 0xBE;
1461     if ( 0 <= shiftCount ) {
1462         if ( a != 0xDF000000 ) {
1463             float_raise( float_flag_invalid );
1464             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1465                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1466             }
1467         }
1468         return (sbits64) LIT64( 0x8000000000000000 );
1469     }
1470     else if ( aExp <= 0x7E ) {
1471         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1472         return 0;
1473     }
1474     aSig64 = aSig | 0x00800000;
1475     aSig64 <<= 40;
1476     z = aSig64>>( - shiftCount );
1477     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1478         float_exception_flags |= float_flag_inexact;
1479     }
1480     if ( aSign ) z = - z;
1481     return z;
1482
1483 }
1484 #endif /* !SOFTFLOAT_FOR_GCC */
1485
1486 /*
1487 -------------------------------------------------------------------------------
1488 Returns the result of converting the single-precision floating-point value
1489 `a' to the double-precision floating-point format.  The conversion is
1490 performed according to the IEC/IEEE Standard for Binary Floating-Point
1491 Arithmetic.
1492 -------------------------------------------------------------------------------
1493 */
1494 float64 float32_to_float64( float32 a )
1495 {
1496     flag aSign;
1497     int16 aExp;
1498     bits32 aSig;
1499
1500     aSig = extractFloat32Frac( a );
1501     aExp = extractFloat32Exp( a );
1502     aSign = extractFloat32Sign( a );
1503     if ( aExp == 0xFF ) {
1504         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1505         return packFloat64( aSign, 0x7FF, 0 );
1506     }
1507     if ( aExp == 0 ) {
1508         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1509         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1510         --aExp;
1511     }
1512     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1513
1514 }
1515
1516 #ifdef FLOATX80
1517
1518 /*
1519 -------------------------------------------------------------------------------
1520 Returns the result of converting the single-precision floating-point value
1521 `a' to the extended double-precision floating-point format.  The conversion
1522 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1523 Arithmetic.
1524 -------------------------------------------------------------------------------
1525 */
1526 floatx80 float32_to_floatx80( float32 a )
1527 {
1528     flag aSign;
1529     int16 aExp;
1530     bits32 aSig;
1531
1532     aSig = extractFloat32Frac( a );
1533     aExp = extractFloat32Exp( a );
1534     aSign = extractFloat32Sign( a );
1535     if ( aExp == 0xFF ) {
1536         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1537         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1538     }
1539     if ( aExp == 0 ) {
1540         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1541         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1542     }
1543     aSig |= 0x00800000;
1544     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1545
1546 }
1547
1548 #endif
1549
1550 #ifdef FLOAT128
1551
1552 /*
1553 -------------------------------------------------------------------------------
1554 Returns the result of converting the single-precision floating-point value
1555 `a' to the double-precision floating-point format.  The conversion is
1556 performed according to the IEC/IEEE Standard for Binary Floating-Point
1557 Arithmetic.
1558 -------------------------------------------------------------------------------
1559 */
1560 float128 float32_to_float128( float32 a )
1561 {
1562     flag aSign;
1563     int16 aExp;
1564     bits32 aSig;
1565
1566     aSig = extractFloat32Frac( a );
1567     aExp = extractFloat32Exp( a );
1568     aSign = extractFloat32Sign( a );
1569     if ( aExp == 0xFF ) {
1570         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1571         return packFloat128( aSign, 0x7FFF, 0, 0 );
1572     }
1573     if ( aExp == 0 ) {
1574         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1575         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1576         --aExp;
1577     }
1578     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1579
1580 }
1581
1582 #endif
1583
1584 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1585 /*
1586 -------------------------------------------------------------------------------
1587 Rounds the single-precision floating-point value `a' to an integer, and
1588 returns the result as a single-precision floating-point value.  The
1589 operation is performed according to the IEC/IEEE Standard for Binary
1590 Floating-Point Arithmetic.
1591 -------------------------------------------------------------------------------
1592 */
1593 float32 float32_round_to_int( float32 a )
1594 {
1595     flag aSign;
1596     int16 aExp;
1597     bits32 lastBitMask, roundBitsMask;
1598     int8 roundingMode;
1599     float32 z;
1600
1601     aExp = extractFloat32Exp( a );
1602     if ( 0x96 <= aExp ) {
1603         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1604             return propagateFloat32NaN( a, a );
1605         }
1606         return a;
1607     }
1608     if ( aExp <= 0x7E ) {
1609         if ( (bits32) ( a<<1 ) == 0 ) return a;
1610         float_exception_flags |= float_flag_inexact;
1611         aSign = extractFloat32Sign( a );
1612         switch ( float_rounding_mode ) {
1613          case float_round_nearest_even:
1614             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1615                 return packFloat32( aSign, 0x7F, 0 );
1616             }
1617             break;
1618          case float_round_to_zero:
1619             break;
1620          case float_round_down:
1621             return aSign ? 0xBF800000 : 0;
1622          case float_round_up:
1623             return aSign ? 0x80000000 : 0x3F800000;
1624         }
1625         return packFloat32( aSign, 0, 0 );
1626     }
1627     lastBitMask = 1;
1628     lastBitMask <<= 0x96 - aExp;
1629     roundBitsMask = lastBitMask - 1;
1630     z = a;
1631     roundingMode = float_rounding_mode;
1632     if ( roundingMode == float_round_nearest_even ) {
1633         z += lastBitMask>>1;
1634         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1635     }
1636     else if ( roundingMode != float_round_to_zero ) {
1637         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1638             z += roundBitsMask;
1639         }
1640     }
1641     z &= ~ roundBitsMask;
1642     if ( z != a ) float_exception_flags |= float_flag_inexact;
1643     return z;
1644
1645 }
1646 #endif /* !SOFTFLOAT_FOR_GCC */
1647
1648 /*
1649 -------------------------------------------------------------------------------
1650 Returns the result of adding the absolute values of the single-precision
1651 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1652 before being returned.  `zSign' is ignored if the result is a NaN.
1653 The addition is performed according to the IEC/IEEE Standard for Binary
1654 Floating-Point Arithmetic.
1655 -------------------------------------------------------------------------------
1656 */
1657 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1658 {
1659     int16 aExp, bExp, zExp;
1660     bits32 aSig, bSig, zSig;
1661     int16 expDiff;
1662
1663     aSig = extractFloat32Frac( a );
1664     aExp = extractFloat32Exp( a );
1665     bSig = extractFloat32Frac( b );
1666     bExp = extractFloat32Exp( b );
1667     expDiff = aExp - bExp;
1668     aSig <<= 6;
1669     bSig <<= 6;
1670     if ( 0 < expDiff ) {
1671         if ( aExp == 0xFF ) {
1672             if ( aSig ) return propagateFloat32NaN( a, b );
1673             return a;
1674         }
1675         if ( bExp == 0 ) {
1676             --expDiff;
1677         }
1678         else {
1679             bSig |= 0x20000000;
1680         }
1681         shift32RightJamming( bSig, expDiff, &bSig );
1682         zExp = aExp;
1683     }
1684     else if ( expDiff < 0 ) {
1685         if ( bExp == 0xFF ) {
1686             if ( bSig ) return propagateFloat32NaN( a, b );
1687             return packFloat32( zSign, 0xFF, 0 );
1688         }
1689         if ( aExp == 0 ) {
1690             ++expDiff;
1691         }
1692         else {
1693             aSig |= 0x20000000;
1694         }
1695         shift32RightJamming( aSig, - expDiff, &aSig );
1696         zExp = bExp;
1697     }
1698     else {
1699         if ( aExp == 0xFF ) {
1700             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1701             return a;
1702         }
1703         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1704         zSig = 0x40000000 + aSig + bSig;
1705         zExp = aExp;
1706         goto roundAndPack;
1707     }
1708     aSig |= 0x20000000;
1709     zSig = ( aSig + bSig )<<1;
1710     --zExp;
1711     if ( (sbits32) zSig < 0 ) {
1712         zSig = aSig + bSig;
1713         ++zExp;
1714     }
1715  roundAndPack:
1716     return roundAndPackFloat32( zSign, zExp, zSig );
1717
1718 }
1719
1720 /*
1721 -------------------------------------------------------------------------------
1722 Returns the result of subtracting the absolute values of the single-
1723 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1724 difference is negated before being returned.  `zSign' is ignored if the
1725 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1726 Standard for Binary Floating-Point Arithmetic.
1727 -------------------------------------------------------------------------------
1728 */
1729 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1730 {
1731     int16 aExp, bExp, zExp;
1732     bits32 aSig, bSig, zSig;
1733     int16 expDiff;
1734
1735     aSig = extractFloat32Frac( a );
1736     aExp = extractFloat32Exp( a );
1737     bSig = extractFloat32Frac( b );
1738     bExp = extractFloat32Exp( b );
1739     expDiff = aExp - bExp;
1740     aSig <<= 7;
1741     bSig <<= 7;
1742     if ( 0 < expDiff ) goto aExpBigger;
1743     if ( expDiff < 0 ) goto bExpBigger;
1744     if ( aExp == 0xFF ) {
1745         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1746         float_raise( float_flag_invalid );
1747         return float32_default_nan;
1748     }
1749     if ( aExp == 0 ) {
1750         aExp = 1;
1751         bExp = 1;
1752     }
1753     if ( bSig < aSig ) goto aBigger;
1754     if ( aSig < bSig ) goto bBigger;
1755     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1756  bExpBigger:
1757     if ( bExp == 0xFF ) {
1758         if ( bSig ) return propagateFloat32NaN( a, b );
1759         return packFloat32( zSign ^ 1, 0xFF, 0 );
1760     }
1761     if ( aExp == 0 ) {
1762         ++expDiff;
1763     }
1764     else {
1765         aSig |= 0x40000000;
1766     }
1767     shift32RightJamming( aSig, - expDiff, &aSig );
1768     bSig |= 0x40000000;
1769  bBigger:
1770     zSig = bSig - aSig;
1771     zExp = bExp;
1772     zSign ^= 1;
1773     goto normalizeRoundAndPack;
1774  aExpBigger:
1775     if ( aExp == 0xFF ) {
1776         if ( aSig ) return propagateFloat32NaN( a, b );
1777         return a;
1778     }
1779     if ( bExp == 0 ) {
1780         --expDiff;
1781     }
1782     else {
1783         bSig |= 0x40000000;
1784     }
1785     shift32RightJamming( bSig, expDiff, &bSig );
1786     aSig |= 0x40000000;
1787  aBigger:
1788     zSig = aSig - bSig;
1789     zExp = aExp;
1790  normalizeRoundAndPack:
1791     --zExp;
1792     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1793
1794 }
1795
1796 /*
1797 -------------------------------------------------------------------------------
1798 Returns the result of adding the single-precision floating-point values `a'
1799 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1800 Binary Floating-Point Arithmetic.
1801 -------------------------------------------------------------------------------
1802 */
1803 float32 float32_add( float32 a, float32 b )
1804 {
1805     flag aSign, bSign;
1806
1807     aSign = extractFloat32Sign( a );
1808     bSign = extractFloat32Sign( b );
1809     if ( aSign == bSign ) {
1810         return addFloat32Sigs( a, b, aSign );
1811     }
1812     else {
1813         return subFloat32Sigs( a, b, aSign );
1814     }
1815
1816 }
1817
1818 /*
1819 -------------------------------------------------------------------------------
1820 Returns the result of subtracting the single-precision floating-point values
1821 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1822 for Binary Floating-Point Arithmetic.
1823 -------------------------------------------------------------------------------
1824 */
1825 float32 float32_sub( float32 a, float32 b )
1826 {
1827     flag aSign, bSign;
1828
1829     aSign = extractFloat32Sign( a );
1830     bSign = extractFloat32Sign( b );
1831     if ( aSign == bSign ) {
1832         return subFloat32Sigs( a, b, aSign );
1833     }
1834     else {
1835         return addFloat32Sigs( a, b, aSign );
1836     }
1837
1838 }
1839
1840 /*
1841 -------------------------------------------------------------------------------
1842 Returns the result of multiplying the single-precision floating-point values
1843 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1844 for Binary Floating-Point Arithmetic.
1845 -------------------------------------------------------------------------------
1846 */
1847 float32 float32_mul( float32 a, float32 b )
1848 {
1849     flag aSign, bSign, zSign;
1850     int16 aExp, bExp, zExp;
1851     bits32 aSig, bSig;
1852     bits64 zSig64;
1853     bits32 zSig;
1854
1855     aSig = extractFloat32Frac( a );
1856     aExp = extractFloat32Exp( a );
1857     aSign = extractFloat32Sign( a );
1858     bSig = extractFloat32Frac( b );
1859     bExp = extractFloat32Exp( b );
1860     bSign = extractFloat32Sign( b );
1861     zSign = aSign ^ bSign;
1862     if ( aExp == 0xFF ) {
1863         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1864             return propagateFloat32NaN( a, b );
1865         }
1866         if ( ( bExp | bSig ) == 0 ) {
1867             float_raise( float_flag_invalid );
1868             return float32_default_nan;
1869         }
1870         return packFloat32( zSign, 0xFF, 0 );
1871     }
1872     if ( bExp == 0xFF ) {
1873         if ( bSig ) return propagateFloat32NaN( a, b );
1874         if ( ( aExp | aSig ) == 0 ) {
1875             float_raise( float_flag_invalid );
1876             return float32_default_nan;
1877         }
1878         return packFloat32( zSign, 0xFF, 0 );
1879     }
1880     if ( aExp == 0 ) {
1881         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1882         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1883     }
1884     if ( bExp == 0 ) {
1885         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1886         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1887     }
1888     zExp = aExp + bExp - 0x7F;
1889     aSig = ( aSig | 0x00800000 )<<7;
1890     bSig = ( bSig | 0x00800000 )<<8;
1891     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1892     zSig = zSig64;
1893     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1894         zSig <<= 1;
1895         --zExp;
1896     }
1897     return roundAndPackFloat32( zSign, zExp, zSig );
1898
1899 }
1900
1901 /*
1902 -------------------------------------------------------------------------------
1903 Returns the result of dividing the single-precision floating-point value `a'
1904 by the corresponding value `b'.  The operation is performed according to the
1905 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1906 -------------------------------------------------------------------------------
1907 */
1908 float32 float32_div( float32 a, float32 b )
1909 {
1910     flag aSign, bSign, zSign;
1911     int16 aExp, bExp, zExp;
1912     bits32 aSig, bSig, zSig;
1913
1914     aSig = extractFloat32Frac( a );
1915     aExp = extractFloat32Exp( a );
1916     aSign = extractFloat32Sign( a );
1917     bSig = extractFloat32Frac( b );
1918     bExp = extractFloat32Exp( b );
1919     bSign = extractFloat32Sign( b );
1920     zSign = aSign ^ bSign;
1921     if ( aExp == 0xFF ) {
1922         if ( aSig ) return propagateFloat32NaN( a, b );
1923         if ( bExp == 0xFF ) {
1924             if ( bSig ) return propagateFloat32NaN( a, b );
1925             float_raise( float_flag_invalid );
1926             return float32_default_nan;
1927         }
1928         return packFloat32( zSign, 0xFF, 0 );
1929     }
1930     if ( bExp == 0xFF ) {
1931         if ( bSig ) return propagateFloat32NaN( a, b );
1932         return packFloat32( zSign, 0, 0 );
1933     }
1934     if ( bExp == 0 ) {
1935         if ( bSig == 0 ) {
1936             if ( ( aExp | aSig ) == 0 ) {
1937                 float_raise( float_flag_invalid );
1938                 return float32_default_nan;
1939             }
1940             float_raise( float_flag_divbyzero );
1941             return packFloat32( zSign, 0xFF, 0 );
1942         }
1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944     }
1945     if ( aExp == 0 ) {
1946         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948     }
1949     zExp = aExp - bExp + 0x7D;
1950     aSig = ( aSig | 0x00800000 )<<7;
1951     bSig = ( bSig | 0x00800000 )<<8;
1952     if ( bSig <= ( aSig + aSig ) ) {
1953         aSig >>= 1;
1954         ++zExp;
1955     }
1956     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1957     if ( ( zSig & 0x3F ) == 0 ) {
1958         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1959     }
1960     return roundAndPackFloat32( zSign, zExp, zSig );
1961
1962 }
1963
1964 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1965 /*
1966 -------------------------------------------------------------------------------
1967 Returns the remainder of the single-precision floating-point value `a'
1968 with respect to the corresponding value `b'.  The operation is performed
1969 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1970 -------------------------------------------------------------------------------
1971 */
1972 float32 float32_rem( float32 a, float32 b )
1973 {
1974     flag aSign, bSign, zSign;
1975     int16 aExp, bExp, expDiff;
1976     bits32 aSig, bSig;
1977     bits32 q;
1978     bits64 aSig64, bSig64, q64;
1979     bits32 alternateASig;
1980     sbits32 sigMean;
1981
1982     aSig = extractFloat32Frac( a );
1983     aExp = extractFloat32Exp( a );
1984     aSign = extractFloat32Sign( a );
1985     bSig = extractFloat32Frac( b );
1986     bExp = extractFloat32Exp( b );
1987     bSign = extractFloat32Sign( b );
1988     if ( aExp == 0xFF ) {
1989         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1990             return propagateFloat32NaN( a, b );
1991         }
1992         float_raise( float_flag_invalid );
1993         return float32_default_nan;
1994     }
1995     if ( bExp == 0xFF ) {
1996         if ( bSig ) return propagateFloat32NaN( a, b );
1997         return a;
1998     }
1999     if ( bExp == 0 ) {
2000         if ( bSig == 0 ) {
2001             float_raise( float_flag_invalid );
2002             return float32_default_nan;
2003         }
2004         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2005     }
2006     if ( aExp == 0 ) {
2007         if ( aSig == 0 ) return a;
2008         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2009     }
2010     expDiff = aExp - bExp;
2011     aSig |= 0x00800000;
2012     bSig |= 0x00800000;
2013     if ( expDiff < 32 ) {
2014         aSig <<= 8;
2015         bSig <<= 8;
2016         if ( expDiff < 0 ) {
2017             if ( expDiff < -1 ) return a;
2018             aSig >>= 1;
2019         }
2020         q = ( bSig <= aSig );
2021         if ( q ) aSig -= bSig;
2022         if ( 0 < expDiff ) {
2023             q = ( ( (bits64) aSig )<<32 ) / bSig;
2024             q >>= 32 - expDiff;
2025             bSig >>= 2;
2026             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2027         }
2028         else {
2029             aSig >>= 2;
2030             bSig >>= 2;
2031         }
2032     }
2033     else {
2034         if ( bSig <= aSig ) aSig -= bSig;
2035         aSig64 = ( (bits64) aSig )<<40;
2036         bSig64 = ( (bits64) bSig )<<40;
2037         expDiff -= 64;
2038         while ( 0 < expDiff ) {
2039             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2040             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2041             aSig64 = - ( ( bSig * q64 )<<38 );
2042             expDiff -= 62;
2043         }
2044         expDiff += 64;
2045         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2046         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2047         q = q64>>( 64 - expDiff );
2048         bSig <<= 6;
2049         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2050     }
2051     do {
2052         alternateASig = aSig;
2053         ++q;
2054         aSig -= bSig;
2055     } while ( 0 <= (sbits32) aSig );
2056     sigMean = aSig + alternateASig;
2057     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2058         aSig = alternateASig;
2059     }
2060     zSign = ( (sbits32) aSig < 0 );
2061     if ( zSign ) aSig = - aSig;
2062     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2063
2064 }
2065 #endif /* !SOFTFLOAT_FOR_GCC */
2066
2067 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2068 /*
2069 -------------------------------------------------------------------------------
2070 Returns the square root of the single-precision floating-point value `a'.
2071 The operation is performed according to the IEC/IEEE Standard for Binary
2072 Floating-Point Arithmetic.
2073 -------------------------------------------------------------------------------
2074 */
2075 float32 float32_sqrt( float32 a )
2076 {
2077     flag aSign;
2078     int16 aExp, zExp;
2079     bits32 aSig, zSig;
2080     bits64 rem, term;
2081
2082     aSig = extractFloat32Frac( a );
2083     aExp = extractFloat32Exp( a );
2084     aSign = extractFloat32Sign( a );
2085     if ( aExp == 0xFF ) {
2086         if ( aSig ) return propagateFloat32NaN( a, 0 );
2087         if ( ! aSign ) return a;
2088         float_raise( float_flag_invalid );
2089         return float32_default_nan;
2090     }
2091     if ( aSign ) {
2092         if ( ( aExp | aSig ) == 0 ) return a;
2093         float_raise( float_flag_invalid );
2094         return float32_default_nan;
2095     }
2096     if ( aExp == 0 ) {
2097         if ( aSig == 0 ) return 0;
2098         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2099     }
2100     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2101     aSig = ( aSig | 0x00800000 )<<8;
2102     zSig = estimateSqrt32( aExp, aSig ) + 2;
2103     if ( ( zSig & 0x7F ) <= 5 ) {
2104         if ( zSig < 2 ) {
2105             zSig = 0x7FFFFFFF;
2106             goto roundAndPack;
2107         }
2108         aSig >>= aExp & 1;
2109         term = ( (bits64) zSig ) * zSig;
2110         rem = ( ( (bits64) aSig )<<32 ) - term;
2111         while ( (sbits64) rem < 0 ) {
2112             --zSig;
2113             rem += ( ( (bits64) zSig )<<1 ) | 1;
2114         }
2115         zSig |= ( rem != 0 );
2116     }
2117     shift32RightJamming( zSig, 1, &zSig );
2118  roundAndPack:
2119     return roundAndPackFloat32( 0, zExp, zSig );
2120
2121 }
2122 #endif /* !SOFTFLOAT_FOR_GCC */
2123
2124 /*
2125 -------------------------------------------------------------------------------
2126 Returns 1 if the single-precision floating-point value `a' is equal to
2127 the corresponding value `b', and 0 otherwise.  The comparison is performed
2128 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2130 */
2131 flag float32_eq( float32 a, float32 b )
2132 {
2133
2134     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2135          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2136        ) {
2137         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2138             float_raise( float_flag_invalid );
2139         }
2140         return 0;
2141     }
2142     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2143
2144 }
2145
2146 /*
2147 -------------------------------------------------------------------------------
2148 Returns 1 if the single-precision floating-point value `a' is less than
2149 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2150 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2151 Arithmetic.
2152 -------------------------------------------------------------------------------
2153 */
2154 flag float32_le( float32 a, float32 b )
2155 {
2156     flag aSign, bSign;
2157
2158     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2159          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2160        ) {
2161         float_raise( float_flag_invalid );
2162         return 0;
2163     }
2164     aSign = extractFloat32Sign( a );
2165     bSign = extractFloat32Sign( b );
2166     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2167     return ( a == b ) || ( aSign ^ ( a < b ) );
2168
2169 }
2170
2171 /*
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is less than
2174 the corresponding value `b', and 0 otherwise.  The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2177 */
2178 flag float32_lt( float32 a, float32 b )
2179 {
2180     flag aSign, bSign;
2181
2182     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2183          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2184        ) {
2185         float_raise( float_flag_invalid );
2186         return 0;
2187     }
2188     aSign = extractFloat32Sign( a );
2189     bSign = extractFloat32Sign( b );
2190     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2191     return ( a != b ) && ( aSign ^ ( a < b ) );
2192
2193 }
2194
2195 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2196 /*
2197 -------------------------------------------------------------------------------
2198 Returns 1 if the single-precision floating-point value `a' is equal to
2199 the corresponding value `b', and 0 otherwise.  The invalid exception is
2200 raised if either operand is a NaN.  Otherwise, the comparison is performed
2201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2202 -------------------------------------------------------------------------------
2203 */
2204 flag float32_eq_signaling( float32 a, float32 b )
2205 {
2206
2207     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2208          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2209        ) {
2210         float_raise( float_flag_invalid );
2211         return 0;
2212     }
2213     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214
2215 }
2216
2217 /*
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than or
2220 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2221 cause an exception.  Otherwise, the comparison is performed according to the
2222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2224 */
2225 flag float32_le_quiet( float32 a, float32 b )
2226 {
2227     flag aSign, bSign;
2228
2229     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231        ) {
2232         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2233             float_raise( float_flag_invalid );
2234         }
2235         return 0;
2236     }
2237     aSign = extractFloat32Sign( a );
2238     bSign = extractFloat32Sign( b );
2239     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2240     return ( a == b ) || ( aSign ^ ( a < b ) );
2241
2242 }
2243
2244 /*
2245 -------------------------------------------------------------------------------
2246 Returns 1 if the single-precision floating-point value `a' is less than
2247 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2248 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2249 Standard for Binary Floating-Point Arithmetic.
2250 -------------------------------------------------------------------------------
2251 */
2252 flag float32_lt_quiet( float32 a, float32 b )
2253 {
2254     flag aSign, bSign;
2255
2256     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2257          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2258        ) {
2259         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2260             float_raise( float_flag_invalid );
2261         }
2262         return 0;
2263     }
2264     aSign = extractFloat32Sign( a );
2265     bSign = extractFloat32Sign( b );
2266     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2267     return ( a != b ) && ( aSign ^ ( a < b ) );
2268
2269 }
2270 #endif /* !SOFTFLOAT_FOR_GCC */
2271
2272 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2273 /*
2274 -------------------------------------------------------------------------------
2275 Returns the result of converting the double-precision floating-point value
2276 `a' to the 32-bit two's complement integer format.  The conversion is
2277 performed according to the IEC/IEEE Standard for Binary Floating-Point
2278 Arithmetic---which means in particular that the conversion is rounded
2279 according to the current rounding mode.  If `a' is a NaN, the largest
2280 positive integer is returned.  Otherwise, if the conversion overflows, the
2281 largest integer with the same sign as `a' is returned.
2282 -------------------------------------------------------------------------------
2283 */
2284 int32 float64_to_int32( float64 a )
2285 {
2286     flag aSign;
2287     int16 aExp, shiftCount;
2288     bits64 aSig;
2289
2290     aSig = extractFloat64Frac( a );
2291     aExp = extractFloat64Exp( a );
2292     aSign = extractFloat64Sign( a );
2293     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2294     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2295     shiftCount = 0x42C - aExp;
2296     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2297     return roundAndPackInt32( aSign, aSig );
2298
2299 }
2300 #endif /* !SOFTFLOAT_FOR_GCC */
2301
2302 /*
2303 -------------------------------------------------------------------------------
2304 Returns the result of converting the double-precision floating-point value
2305 `a' to the 32-bit two's complement integer format.  The conversion is
2306 performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 Arithmetic, except that the conversion is always rounded toward zero.
2308 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2309 the conversion overflows, the largest integer with the same sign as `a' is
2310 returned.
2311 -------------------------------------------------------------------------------
2312 */
2313 int32 float64_to_int32_round_to_zero( float64 a )
2314 {
2315     flag aSign;
2316     int16 aExp, shiftCount;
2317     bits64 aSig, savedASig;
2318     int32 z;
2319
2320     aSig = extractFloat64Frac( a );
2321     aExp = extractFloat64Exp( a );
2322     aSign = extractFloat64Sign( a );
2323     if ( 0x41E < aExp ) {
2324         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2325         goto invalid;
2326     }
2327     else if ( aExp < 0x3FF ) {
2328         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2329         return 0;
2330     }
2331     aSig |= LIT64( 0x0010000000000000 );
2332     shiftCount = 0x433 - aExp;
2333     savedASig = aSig;
2334     aSig >>= shiftCount;
2335     z = aSig;
2336     if ( aSign ) z = - z;
2337     if ( ( z < 0 ) ^ aSign ) {
2338  invalid:
2339         float_raise( float_flag_invalid );
2340         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2341     }
2342     if ( ( aSig<<shiftCount ) != savedASig ) {
2343         float_exception_flags |= float_flag_inexact;
2344     }
2345     return z;
2346
2347 }
2348
2349 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2350 /*
2351 -------------------------------------------------------------------------------
2352 Returns the result of converting the double-precision floating-point value
2353 `a' to the 64-bit two's complement integer format.  The conversion is
2354 performed according to the IEC/IEEE Standard for Binary Floating-Point
2355 Arithmetic---which means in particular that the conversion is rounded
2356 according to the current rounding mode.  If `a' is a NaN, the largest
2357 positive integer is returned.  Otherwise, if the conversion overflows, the
2358 largest integer with the same sign as `a' is returned.
2359 -------------------------------------------------------------------------------
2360 */
2361 int64 float64_to_int64( float64 a )
2362 {
2363     flag aSign;
2364     int16 aExp, shiftCount;
2365     bits64 aSig, aSigExtra;
2366
2367     aSig = extractFloat64Frac( a );
2368     aExp = extractFloat64Exp( a );
2369     aSign = extractFloat64Sign( a );
2370     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2371     shiftCount = 0x433 - aExp;
2372     if ( shiftCount <= 0 ) {
2373         if ( 0x43E < aExp ) {
2374             float_raise( float_flag_invalid );
2375             if (    ! aSign
2376                  || (    ( aExp == 0x7FF )
2377                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2378                ) {
2379                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2380             }
2381             return (sbits64) LIT64( 0x8000000000000000 );
2382         }
2383         aSigExtra = 0;
2384         aSig <<= - shiftCount;
2385     }
2386     else {
2387         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2388     }
2389     return roundAndPackInt64( aSign, aSig, aSigExtra );
2390
2391 }
2392
2393 /*
2394 -------------------------------------------------------------------------------
2395 Returns the result of converting the double-precision floating-point value
2396 `a' to the 64-bit two's complement integer format.  The conversion is
2397 performed according to the IEC/IEEE Standard for Binary Floating-Point
2398 Arithmetic, except that the conversion is always rounded toward zero.
2399 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2400 the conversion overflows, the largest integer with the same sign as `a' is
2401 returned.
2402 -------------------------------------------------------------------------------
2403 */
2404 int64 float64_to_int64_round_to_zero( float64 a )
2405 {
2406     flag aSign;
2407     int16 aExp, shiftCount;
2408     bits64 aSig;
2409     int64 z;
2410
2411     aSig = extractFloat64Frac( a );
2412     aExp = extractFloat64Exp( a );
2413     aSign = extractFloat64Sign( a );
2414     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2415     shiftCount = aExp - 0x433;
2416     if ( 0 <= shiftCount ) {
2417         if ( 0x43E <= aExp ) {
2418             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2419                 float_raise( float_flag_invalid );
2420                 if (    ! aSign
2421                      || (    ( aExp == 0x7FF )
2422                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2423                    ) {
2424                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2425                 }
2426             }
2427             return (sbits64) LIT64( 0x8000000000000000 );
2428         }
2429         z = aSig<<shiftCount;
2430     }
2431     else {
2432         if ( aExp < 0x3FE ) {
2433             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2434             return 0;
2435         }
2436         z = aSig>>( - shiftCount );
2437         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2438             float_exception_flags |= float_flag_inexact;
2439         }
2440     }
2441     if ( aSign ) z = - z;
2442     return z;
2443
2444 }
2445 #endif /* !SOFTFLOAT_FOR_GCC */
2446
2447 /*
2448 -------------------------------------------------------------------------------
2449 Returns the result of converting the double-precision floating-point value
2450 `a' to the single-precision floating-point format.  The conversion is
2451 performed according to the IEC/IEEE Standard for Binary Floating-Point
2452 Arithmetic.
2453 -------------------------------------------------------------------------------
2454 */
2455 float32 float64_to_float32( float64 a )
2456 {
2457     flag aSign;
2458     int16 aExp;
2459     bits64 aSig;
2460     bits32 zSig;
2461
2462     aSig = extractFloat64Frac( a );
2463     aExp = extractFloat64Exp( a );
2464     aSign = extractFloat64Sign( a );
2465     if ( aExp == 0x7FF ) {
2466         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2467         return packFloat32( aSign, 0xFF, 0 );
2468     }
2469     shift64RightJamming( aSig, 22, &aSig );
2470     zSig = aSig;
2471     if ( aExp || zSig ) {
2472         zSig |= 0x40000000;
2473         aExp -= 0x381;
2474     }
2475     return roundAndPackFloat32( aSign, aExp, zSig );
2476
2477 }
2478
2479 #ifdef FLOATX80
2480
2481 /*
2482 -------------------------------------------------------------------------------
2483 Returns the result of converting the double-precision floating-point value
2484 `a' to the extended double-precision floating-point format.  The conversion
2485 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2486 Arithmetic.
2487 -------------------------------------------------------------------------------
2488 */
2489 floatx80 float64_to_floatx80( float64 a )
2490 {
2491     flag aSign;
2492     int16 aExp;
2493     bits64 aSig;
2494
2495     aSig = extractFloat64Frac( a );
2496     aExp = extractFloat64Exp( a );
2497     aSign = extractFloat64Sign( a );
2498     if ( aExp == 0x7FF ) {
2499         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2500         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2501     }
2502     if ( aExp == 0 ) {
2503         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2504         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2505     }
2506     return
2507         packFloatx80(
2508             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2509
2510 }
2511
2512 #endif
2513
2514 #ifdef FLOAT128
2515
2516 /*
2517 -------------------------------------------------------------------------------
2518 Returns the result of converting the double-precision floating-point value
2519 `a' to the quadruple-precision floating-point format.  The conversion is
2520 performed according to the IEC/IEEE Standard for Binary Floating-Point
2521 Arithmetic.
2522 -------------------------------------------------------------------------------
2523 */
2524 float128 float64_to_float128( float64 a )
2525 {
2526     flag aSign;
2527     int16 aExp;
2528     bits64 aSig, zSig0, zSig1;
2529
2530     aSig = extractFloat64Frac( a );
2531     aExp = extractFloat64Exp( a );
2532     aSign = extractFloat64Sign( a );
2533     if ( aExp == 0x7FF ) {
2534         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2535         return packFloat128( aSign, 0x7FFF, 0, 0 );
2536     }
2537     if ( aExp == 0 ) {
2538         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2539         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2540         --aExp;
2541     }
2542     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2543     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2544
2545 }
2546
2547 #endif
2548
2549 #ifndef SOFTFLOAT_FOR_GCC
2550 /*
2551 -------------------------------------------------------------------------------
2552 Rounds the double-precision floating-point value `a' to an integer, and
2553 returns the result as a double-precision floating-point value.  The
2554 operation is performed according to the IEC/IEEE Standard for Binary
2555 Floating-Point Arithmetic.
2556 -------------------------------------------------------------------------------
2557 */
2558 float64 float64_round_to_int( float64 a )
2559 {
2560     flag aSign;
2561     int16 aExp;
2562     bits64 lastBitMask, roundBitsMask;
2563     int8 roundingMode;
2564     float64 z;
2565
2566     aExp = extractFloat64Exp( a );
2567     if ( 0x433 <= aExp ) {
2568         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2569             return propagateFloat64NaN( a, a );
2570         }
2571         return a;
2572     }
2573     if ( aExp < 0x3FF ) {
2574         if ( (bits64) ( a<<1 ) == 0 ) return a;
2575         float_exception_flags |= float_flag_inexact;
2576         aSign = extractFloat64Sign( a );
2577         switch ( float_rounding_mode ) {
2578          case float_round_nearest_even:
2579             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2580                 return packFloat64( aSign, 0x3FF, 0 );
2581             }
2582             break;
2583          case float_round_to_zero:
2584             break;
2585          case float_round_down:
2586             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2587          case float_round_up:
2588             return
2589             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2590         }
2591         return packFloat64( aSign, 0, 0 );
2592     }
2593     lastBitMask = 1;
2594     lastBitMask <<= 0x433 - aExp;
2595     roundBitsMask = lastBitMask - 1;
2596     z = a;
2597     roundingMode = float_rounding_mode;
2598     if ( roundingMode == float_round_nearest_even ) {
2599         z += lastBitMask>>1;
2600         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2601     }
2602     else if ( roundingMode != float_round_to_zero ) {
2603         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2604             z += roundBitsMask;
2605         }
2606     }
2607     z &= ~ roundBitsMask;
2608     if ( z != a ) float_exception_flags |= float_flag_inexact;
2609     return z;
2610
2611 }
2612 #endif
2613
2614 /*
2615 -------------------------------------------------------------------------------
2616 Returns the result of adding the absolute values of the double-precision
2617 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2618 before being returned.  `zSign' is ignored if the result is a NaN.
2619 The addition is performed according to the IEC/IEEE Standard for Binary
2620 Floating-Point Arithmetic.
2621 -------------------------------------------------------------------------------
2622 */
2623 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2624 {
2625     int16 aExp, bExp, zExp;
2626     bits64 aSig, bSig, zSig;
2627     int16 expDiff;
2628
2629     aSig = extractFloat64Frac( a );
2630     aExp = extractFloat64Exp( a );
2631     bSig = extractFloat64Frac( b );
2632     bExp = extractFloat64Exp( b );
2633     expDiff = aExp - bExp;
2634     aSig <<= 9;
2635     bSig <<= 9;
2636     if ( 0 < expDiff ) {
2637         if ( aExp == 0x7FF ) {
2638             if ( aSig ) return propagateFloat64NaN( a, b );
2639             return a;
2640         }
2641         if ( bExp == 0 ) {
2642             --expDiff;
2643         }
2644         else {
2645             bSig |= LIT64( 0x2000000000000000 );
2646         }
2647         shift64RightJamming( bSig, expDiff, &bSig );
2648         zExp = aExp;
2649     }
2650     else if ( expDiff < 0 ) {
2651         if ( bExp == 0x7FF ) {
2652             if ( bSig ) return propagateFloat64NaN( a, b );
2653             return packFloat64( zSign, 0x7FF, 0 );
2654         }
2655         if ( aExp == 0 ) {
2656             ++expDiff;
2657         }
2658         else {
2659             aSig |= LIT64( 0x2000000000000000 );
2660         }
2661         shift64RightJamming( aSig, - expDiff, &aSig );
2662         zExp = bExp;
2663     }
2664     else {
2665         if ( aExp == 0x7FF ) {
2666             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2667             return a;
2668         }
2669         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2670         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2671         zExp = aExp;
2672         goto roundAndPack;
2673     }
2674     aSig |= LIT64( 0x2000000000000000 );
2675     zSig = ( aSig + bSig )<<1;
2676     --zExp;
2677     if ( (sbits64) zSig < 0 ) {
2678         zSig = aSig + bSig;
2679         ++zExp;
2680     }
2681  roundAndPack:
2682     return roundAndPackFloat64( zSign, zExp, zSig );
2683
2684 }
2685
2686 /*
2687 -------------------------------------------------------------------------------
2688 Returns the result of subtracting the absolute values of the double-
2689 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2690 difference is negated before being returned.  `zSign' is ignored if the
2691 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2692 Standard for Binary Floating-Point Arithmetic.
2693 -------------------------------------------------------------------------------
2694 */
2695 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2696 {
2697     int16 aExp, bExp, zExp;
2698     bits64 aSig, bSig, zSig;
2699     int16 expDiff;
2700
2701     aSig = extractFloat64Frac( a );
2702     aExp = extractFloat64Exp( a );
2703     bSig = extractFloat64Frac( b );
2704     bExp = extractFloat64Exp( b );
2705     expDiff = aExp - bExp;
2706     aSig <<= 10;
2707     bSig <<= 10;
2708     if ( 0 < expDiff ) goto aExpBigger;
2709     if ( expDiff < 0 ) goto bExpBigger;
2710     if ( aExp == 0x7FF ) {
2711         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2712         float_raise( float_flag_invalid );
2713         return float64_default_nan;
2714     }
2715     if ( aExp == 0 ) {
2716         aExp = 1;
2717         bExp = 1;
2718     }
2719     if ( bSig < aSig ) goto aBigger;
2720     if ( aSig < bSig ) goto bBigger;
2721     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2722  bExpBigger:
2723     if ( bExp == 0x7FF ) {
2724         if ( bSig ) return propagateFloat64NaN( a, b );
2725         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2726     }
2727     if ( aExp == 0 ) {
2728         ++expDiff;
2729     }
2730     else {
2731         aSig |= LIT64( 0x4000000000000000 );
2732     }
2733     shift64RightJamming( aSig, - expDiff, &aSig );
2734     bSig |= LIT64( 0x4000000000000000 );
2735  bBigger:
2736     zSig = bSig - aSig;
2737     zExp = bExp;
2738     zSign ^= 1;
2739     goto normalizeRoundAndPack;
2740  aExpBigger:
2741     if ( aExp == 0x7FF ) {
2742         if ( aSig ) return propagateFloat64NaN( a, b );
2743         return a;
2744     }
2745     if ( bExp == 0 ) {
2746         --expDiff;
2747     }
2748     else {
2749         bSig |= LIT64( 0x4000000000000000 );
2750     }
2751     shift64RightJamming( bSig, expDiff, &bSig );
2752     aSig |= LIT64( 0x4000000000000000 );
2753  aBigger:
2754     zSig = aSig - bSig;
2755     zExp = aExp;
2756  normalizeRoundAndPack:
2757     --zExp;
2758     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2759
2760 }
2761
2762 /*
2763 -------------------------------------------------------------------------------
2764 Returns the result of adding the double-precision floating-point values `a'
2765 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2766 Binary Floating-Point Arithmetic.
2767 -------------------------------------------------------------------------------
2768 */
2769 float64 float64_add( float64 a, float64 b )
2770 {
2771     flag aSign, bSign;
2772
2773     aSign = extractFloat64Sign( a );
2774     bSign = extractFloat64Sign( b );
2775     if ( aSign == bSign ) {
2776         return addFloat64Sigs( a, b, aSign );
2777     }
2778     else {
2779         return subFloat64Sigs( a, b, aSign );
2780     }
2781
2782 }
2783
2784 /*
2785 -------------------------------------------------------------------------------
2786 Returns the result of subtracting the double-precision floating-point values
2787 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2788 for Binary Floating-Point Arithmetic.
2789 -------------------------------------------------------------------------------
2790 */
2791 float64 float64_sub( float64 a, float64 b )
2792 {
2793     flag aSign, bSign;
2794
2795     aSign = extractFloat64Sign( a );
2796     bSign = extractFloat64Sign( b );
2797     if ( aSign == bSign ) {
2798         return subFloat64Sigs( a, b, aSign );
2799     }
2800     else {
2801         return addFloat64Sigs( a, b, aSign );
2802     }
2803
2804 }
2805
2806 /*
2807 -------------------------------------------------------------------------------
2808 Returns the result of multiplying the double-precision floating-point values
2809 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2810 for Binary Floating-Point Arithmetic.
2811 -------------------------------------------------------------------------------
2812 */
2813 float64 float64_mul( float64 a, float64 b )
2814 {
2815     flag aSign, bSign, zSign;
2816     int16 aExp, bExp, zExp;
2817     bits64 aSig, bSig, zSig0, zSig1;
2818
2819     aSig = extractFloat64Frac( a );
2820     aExp = extractFloat64Exp( a );
2821     aSign = extractFloat64Sign( a );
2822     bSig = extractFloat64Frac( b );
2823     bExp = extractFloat64Exp( b );
2824     bSign = extractFloat64Sign( b );
2825     zSign = aSign ^ bSign;
2826     if ( aExp == 0x7FF ) {
2827         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2828             return propagateFloat64NaN( a, b );
2829         }
2830         if ( ( bExp | bSig ) == 0 ) {
2831             float_raise( float_flag_invalid );
2832             return float64_default_nan;
2833         }
2834         return packFloat64( zSign, 0x7FF, 0 );
2835     }
2836     if ( bExp == 0x7FF ) {
2837         if ( bSig ) return propagateFloat64NaN( a, b );
2838         if ( ( aExp | aSig ) == 0 ) {
2839             float_raise( float_flag_invalid );
2840             return float64_default_nan;
2841         }
2842         return packFloat64( zSign, 0x7FF, 0 );
2843     }
2844     if ( aExp == 0 ) {
2845         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2846         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2847     }
2848     if ( bExp == 0 ) {
2849         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2850         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2851     }
2852     zExp = aExp + bExp - 0x3FF;
2853     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2854     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2855     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2856     zSig0 |= ( zSig1 != 0 );
2857     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2858         zSig0 <<= 1;
2859         --zExp;
2860     }
2861     return roundAndPackFloat64( zSign, zExp, zSig0 );
2862
2863 }
2864
2865 /*
2866 -------------------------------------------------------------------------------
2867 Returns the result of dividing the double-precision floating-point value `a'
2868 by the corresponding value `b'.  The operation is performed according to
2869 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2870 -------------------------------------------------------------------------------
2871 */
2872 float64 float64_div( float64 a, float64 b )
2873 {
2874     flag aSign, bSign, zSign;
2875     int16 aExp, bExp, zExp;
2876     bits64 aSig, bSig, zSig;
2877     bits64 rem0, rem1;
2878     bits64 term0, term1;
2879
2880     aSig = extractFloat64Frac( a );
2881     aExp = extractFloat64Exp( a );
2882     aSign = extractFloat64Sign( a );
2883     bSig = extractFloat64Frac( b );
2884     bExp = extractFloat64Exp( b );
2885     bSign = extractFloat64Sign( b );
2886     zSign = aSign ^ bSign;
2887     if ( aExp == 0x7FF ) {
2888         if ( aSig ) return propagateFloat64NaN( a, b );
2889         if ( bExp == 0x7FF ) {
2890             if ( bSig ) return propagateFloat64NaN( a, b );
2891             float_raise( float_flag_invalid );
2892             return float64_default_nan;
2893         }
2894         return packFloat64( zSign, 0x7FF, 0 );
2895     }
2896     if ( bExp == 0x7FF ) {
2897         if ( bSig ) return propagateFloat64NaN( a, b );
2898         return packFloat64( zSign, 0, 0 );
2899     }
2900     if ( bExp == 0 ) {
2901         if ( bSig == 0 ) {
2902             if ( ( aExp | aSig ) == 0 ) {
2903                 float_raise( float_flag_invalid );
2904                 return float64_default_nan;
2905             }
2906             float_raise( float_flag_divbyzero );
2907             return packFloat64( zSign, 0x7FF, 0 );
2908         }
2909         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2910     }
2911     if ( aExp == 0 ) {
2912         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2913         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2914     }
2915     zExp = aExp - bExp + 0x3FD;
2916     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2917     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2918     if ( bSig <= ( aSig + aSig ) ) {
2919         aSig >>= 1;
2920         ++zExp;
2921     }
2922     zSig = estimateDiv128To64( aSig, 0, bSig );
2923     if ( ( zSig & 0x1FF ) <= 2 ) {
2924         mul64To128( bSig, zSig, &term0, &term1 );
2925         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2926         while ( (sbits64) rem0 < 0 ) {
2927             --zSig;
2928             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2929         }
2930         zSig |= ( rem1 != 0 );
2931     }
2932     return roundAndPackFloat64( zSign, zExp, zSig );
2933
2934 }
2935
2936 #ifndef SOFTFLOAT_FOR_GCC
2937 /*
2938 -------------------------------------------------------------------------------
2939 Returns the remainder of the double-precision floating-point value `a'
2940 with respect to the corresponding value `b'.  The operation is performed
2941 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942 -------------------------------------------------------------------------------
2943 */
2944 float64 float64_rem( float64 a, float64 b )
2945 {
2946     flag aSign, bSign, zSign;
2947     int16 aExp, bExp, expDiff;
2948     bits64 aSig, bSig;
2949     bits64 q, alternateASig;
2950     sbits64 sigMean;
2951
2952     aSig = extractFloat64Frac( a );
2953     aExp = extractFloat64Exp( a );
2954     aSign = extractFloat64Sign( a );
2955     bSig = extractFloat64Frac( b );
2956     bExp = extractFloat64Exp( b );
2957     bSign = extractFloat64Sign( b );
2958     if ( aExp == 0x7FF ) {
2959         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2960             return propagateFloat64NaN( a, b );
2961         }
2962         float_raise( float_flag_invalid );
2963         return float64_default_nan;
2964     }
2965     if ( bExp == 0x7FF ) {
2966         if ( bSig ) return propagateFloat64NaN( a, b );
2967         return a;
2968     }
2969     if ( bExp == 0 ) {
2970         if ( bSig == 0 ) {
2971             float_raise( float_flag_invalid );
2972             return float64_default_nan;
2973         }
2974         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2975     }
2976     if ( aExp == 0 ) {
2977         if ( aSig == 0 ) return a;
2978         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2979     }
2980     expDiff = aExp - bExp;
2981     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2982     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2983     if ( expDiff < 0 ) {
2984         if ( expDiff < -1 ) return a;
2985         aSig >>= 1;
2986     }
2987     q = ( bSig <= aSig );
2988     if ( q ) aSig -= bSig;
2989     expDiff -= 64;
2990     while ( 0 < expDiff ) {
2991         q = estimateDiv128To64( aSig, 0, bSig );
2992         q = ( 2 < q ) ? q - 2 : 0;
2993         aSig = - ( ( bSig>>2 ) * q );
2994         expDiff -= 62;
2995     }
2996     expDiff += 64;
2997     if ( 0 < expDiff ) {
2998         q = estimateDiv128To64( aSig, 0, bSig );
2999         q = ( 2 < q ) ? q - 2 : 0;
3000         q >>= 64 - expDiff;
3001         bSig >>= 2;
3002         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3003     }
3004     else {
3005         aSig >>= 2;
3006         bSig >>= 2;
3007     }
3008     do {
3009         alternateASig = aSig;
3010         ++q;
3011         aSig -= bSig;
3012     } while ( 0 <= (sbits64) aSig );
3013     sigMean = aSig + alternateASig;
3014     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3015         aSig = alternateASig;
3016     }
3017     zSign = ( (sbits64) aSig < 0 );
3018     if ( zSign ) aSig = - aSig;
3019     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3020
3021 }
3022
3023 /*
3024 -------------------------------------------------------------------------------
3025 Returns the square root of the double-precision floating-point value `a'.
3026 The operation is performed according to the IEC/IEEE Standard for Binary
3027 Floating-Point Arithmetic.
3028 -------------------------------------------------------------------------------
3029 */
3030 float64 float64_sqrt( float64 a )
3031 {
3032     flag aSign;
3033     int16 aExp, zExp;
3034     bits64 aSig, zSig, doubleZSig;
3035     bits64 rem0, rem1, term0, term1;
3036
3037     aSig = extractFloat64Frac( a );
3038     aExp = extractFloat64Exp( a );
3039     aSign = extractFloat64Sign( a );
3040     if ( aExp == 0x7FF ) {
3041         if ( aSig ) return propagateFloat64NaN( a, a );
3042         if ( ! aSign ) return a;
3043         float_raise( float_flag_invalid );
3044         return float64_default_nan;
3045     }
3046     if ( aSign ) {
3047         if ( ( aExp | aSig ) == 0 ) return a;
3048         float_raise( float_flag_invalid );
3049         return float64_default_nan;
3050     }
3051     if ( aExp == 0 ) {
3052         if ( aSig == 0 ) return 0;
3053         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3054     }
3055     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3056     aSig |= LIT64( 0x0010000000000000 );
3057     zSig = estimateSqrt32( aExp, aSig>>21 );
3058     aSig <<= 9 - ( aExp & 1 );
3059     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3060     if ( ( zSig & 0x1FF ) <= 5 ) {
3061         doubleZSig = zSig<<1;
3062         mul64To128( zSig, zSig, &term0, &term1 );
3063         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3064         while ( (sbits64) rem0 < 0 ) {
3065             --zSig;
3066             doubleZSig -= 2;
3067             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3068         }
3069         zSig |= ( ( rem0 | rem1 ) != 0 );
3070     }
3071     return roundAndPackFloat64( 0, zExp, zSig );
3072
3073 }
3074 #endif
3075
3076 /*
3077 -------------------------------------------------------------------------------
3078 Returns 1 if the double-precision floating-point value `a' is equal to the
3079 corresponding value `b', and 0 otherwise.  The comparison is performed
3080 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3082 */
3083 flag float64_eq( float64 a, float64 b )
3084 {
3085
3086     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3087          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3088        ) {
3089         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3090             float_raise( float_flag_invalid );
3091         }
3092         return 0;
3093     }
3094     return ( a == b ) ||
3095         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3096
3097 }
3098
3099 /*
3100 -------------------------------------------------------------------------------
3101 Returns 1 if the double-precision floating-point value `a' is less than or
3102 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3103 performed according to the IEC/IEEE Standard for Binary Floating-Point
3104 Arithmetic.
3105 -------------------------------------------------------------------------------
3106 */
3107 flag float64_le( float64 a, float64 b )
3108 {
3109     flag aSign, bSign;
3110
3111     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113        ) {
3114         float_raise( float_flag_invalid );
3115         return 0;
3116     }
3117     aSign = extractFloat64Sign( a );
3118     bSign = extractFloat64Sign( b );
3119     if ( aSign != bSign )
3120         return aSign ||
3121             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3122               0 );
3123     return ( a == b ) ||
3124         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3125
3126 }
3127
3128 /*
3129 -------------------------------------------------------------------------------
3130 Returns 1 if the double-precision floating-point value `a' is less than
3131 the corresponding value `b', and 0 otherwise.  The comparison is performed
3132 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3133 -------------------------------------------------------------------------------
3134 */
3135 flag float64_lt( float64 a, float64 b )
3136 {
3137     flag aSign, bSign;
3138
3139     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3141        ) {
3142         float_raise( float_flag_invalid );
3143         return 0;
3144     }
3145     aSign = extractFloat64Sign( a );
3146     bSign = extractFloat64Sign( b );
3147     if ( aSign != bSign )
3148         return aSign &&
3149             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3150               0 );
3151     return ( a != b ) &&
3152         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3153
3154 }
3155
3156 #ifndef SOFTFLOAT_FOR_GCC
3157 /*
3158 -------------------------------------------------------------------------------
3159 Returns 1 if the double-precision floating-point value `a' is equal to the
3160 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3161 if either operand is a NaN.  Otherwise, the comparison is performed
3162 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3163 -------------------------------------------------------------------------------
3164 */
3165 flag float64_eq_signaling( float64 a, float64 b )
3166 {
3167
3168     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3169          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3170        ) {
3171         float_raise( float_flag_invalid );
3172         return 0;
3173     }
3174     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3175
3176 }
3177
3178 /*
3179 -------------------------------------------------------------------------------
3180 Returns 1 if the double-precision floating-point value `a' is less than or
3181 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3182 cause an exception.  Otherwise, the comparison is performed according to the
3183 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3184 -------------------------------------------------------------------------------
3185 */
3186 flag float64_le_quiet( float64 a, float64 b )
3187 {
3188     flag aSign, bSign;
3189
3190     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3191          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3192        ) {
3193         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3194             float_raise( float_flag_invalid );
3195         }
3196         return 0;
3197     }
3198     aSign = extractFloat64Sign( a );
3199     bSign = extractFloat64Sign( b );
3200     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3201     return ( a == b ) || ( aSign ^ ( a < b ) );
3202
3203 }
3204
3205 /*
3206 -------------------------------------------------------------------------------
3207 Returns 1 if the double-precision floating-point value `a' is less than
3208 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3209 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3210 Standard for Binary Floating-Point Arithmetic.
3211 -------------------------------------------------------------------------------
3212 */
3213 flag float64_lt_quiet( float64 a, float64 b )
3214 {
3215     flag aSign, bSign;
3216
3217     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3218          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3219        ) {
3220         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3221             float_raise( float_flag_invalid );
3222         }
3223         return 0;
3224     }
3225     aSign = extractFloat64Sign( a );
3226     bSign = extractFloat64Sign( b );
3227     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3228     return ( a != b ) && ( aSign ^ ( a < b ) );
3229
3230 }
3231 #endif
3232
3233 #ifdef FLOATX80
3234
3235 /*
3236 -------------------------------------------------------------------------------
3237 Returns the result of converting the extended double-precision floating-
3238 point value `a' to the 32-bit two's complement integer format.  The
3239 conversion is performed according to the IEC/IEEE Standard for Binary
3240 Floating-Point Arithmetic---which means in particular that the conversion
3241 is rounded according to the current rounding mode.  If `a' is a NaN, the
3242 largest positive integer is returned.  Otherwise, if the conversion
3243 overflows, the largest integer with the same sign as `a' is returned.
3244 -------------------------------------------------------------------------------
3245 */
3246 int32 floatx80_to_int32( floatx80 a )
3247 {
3248     flag aSign;
3249     int32 aExp, shiftCount;
3250     bits64 aSig;
3251
3252     aSig = extractFloatx80Frac( a );
3253     aExp = extractFloatx80Exp( a );
3254     aSign = extractFloatx80Sign( a );
3255     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3256     shiftCount = 0x4037 - aExp;
3257     if ( shiftCount <= 0 ) shiftCount = 1;
3258     shift64RightJamming( aSig, shiftCount, &aSig );
3259     return roundAndPackInt32( aSign, aSig );
3260
3261 }
3262
3263 /*
3264 -------------------------------------------------------------------------------
3265 Returns the result of converting the extended double-precision floating-
3266 point value `a' to the 32-bit two's complement integer format.  The
3267 conversion is performed according to the IEC/IEEE Standard for Binary
3268 Floating-Point Arithmetic, except that the conversion is always rounded
3269 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3270 Otherwise, if the conversion overflows, the largest integer with the same
3271 sign as `a' is returned.
3272 -------------------------------------------------------------------------------
3273 */
3274 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3275 {
3276     flag aSign;
3277     int32 aExp, shiftCount;
3278     bits64 aSig, savedASig;
3279     int32 z;
3280
3281     aSig = extractFloatx80Frac( a );
3282     aExp = extractFloatx80Exp( a );
3283     aSign = extractFloatx80Sign( a );
3284     if ( 0x401E < aExp ) {
3285         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3286         goto invalid;
3287     }
3288     else if ( aExp < 0x3FFF ) {
3289         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3290         return 0;
3291     }
3292     shiftCount = 0x403E - aExp;
3293     savedASig = aSig;
3294     aSig >>= shiftCount;
3295     z = aSig;
3296     if ( aSign ) z = - z;
3297     if ( ( z < 0 ) ^ aSign ) {
3298  invalid:
3299         float_raise( float_flag_invalid );
3300         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3301     }
3302     if ( ( aSig<<shiftCount ) != savedASig ) {
3303         float_exception_flags |= float_flag_inexact;
3304     }
3305     return z;
3306
3307 }
3308
3309 /*
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 64-bit two's complement integer format.  The
3313 conversion is performed according to the IEC/IEEE Standard for Binary
3314 Floating-Point Arithmetic---which means in particular that the conversion
3315 is rounded according to the current rounding mode.  If `a' is a NaN,
3316 the largest positive integer is returned.  Otherwise, if the conversion
3317 overflows, the largest integer with the same sign as `a' is returned.
3318 -------------------------------------------------------------------------------
3319 */
3320 int64 floatx80_to_int64( floatx80 a )
3321 {
3322     flag aSign;
3323     int32 aExp, shiftCount;
3324     bits64 aSig, aSigExtra;
3325
3326     aSig = extractFloatx80Frac( a );
3327     aExp = extractFloatx80Exp( a );
3328     aSign = extractFloatx80Sign( a );
3329     shiftCount = 0x403E - aExp;
3330     if ( shiftCount <= 0 ) {
3331         if ( shiftCount ) {
3332             float_raise( float_flag_invalid );
3333             if (    ! aSign
3334                  || (    ( aExp == 0x7FFF )
3335                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3336                ) {
3337                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3338             }
3339             return (sbits64) LIT64( 0x8000000000000000 );
3340         }
3341         aSigExtra = 0;
3342     }
3343     else {
3344         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3345     }
3346     return roundAndPackInt64( aSign, aSig, aSigExtra );
3347
3348 }
3349
3350 /*
3351 -------------------------------------------------------------------------------
3352 Returns the result of converting the extended double-precision floating-
3353 point value `a' to the 64-bit two's complement integer format.  The
3354 conversion is performed according to the IEC/IEEE Standard for Binary
3355 Floating-Point Arithmetic, except that the conversion is always rounded
3356 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3357 Otherwise, if the conversion overflows, the largest integer with the same
3358 sign as `a' is returned.
3359 -------------------------------------------------------------------------------
3360 */
3361 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3362 {
3363     flag aSign;
3364     int32 aExp, shiftCount;
3365     bits64 aSig;
3366     int64 z;
3367
3368     aSig = extractFloatx80Frac( a );
3369     aExp = extractFloatx80Exp( a );
3370     aSign = extractFloatx80Sign( a );
3371     shiftCount = aExp - 0x403E;
3372     if ( 0 <= shiftCount ) {
3373         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3374         if ( ( a.high != 0xC03E ) || aSig ) {
3375             float_raise( float_flag_invalid );
3376             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3377                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3378             }
3379         }
3380         return (sbits64) LIT64( 0x8000000000000000 );
3381     }
3382     else if ( aExp < 0x3FFF ) {
3383         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3384         return 0;
3385     }
3386     z = aSig>>( - shiftCount );
3387     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3388         float_exception_flags |= float_flag_inexact;
3389     }
3390     if ( aSign ) z = - z;
3391     return z;
3392
3393 }
3394
3395 /*
3396 -------------------------------------------------------------------------------
3397 Returns the result of converting the extended double-precision floating-
3398 point value `a' to the single-precision floating-point format.  The
3399 conversion is performed according to the IEC/IEEE Standard for Binary
3400 Floating-Point Arithmetic.
3401 -------------------------------------------------------------------------------
3402 */
3403 float32 floatx80_to_float32( floatx80 a )
3404 {
3405     flag aSign;
3406     int32 aExp;
3407     bits64 aSig;
3408
3409     aSig = extractFloatx80Frac( a );
3410     aExp = extractFloatx80Exp( a );
3411     aSign = extractFloatx80Sign( a );
3412     if ( aExp == 0x7FFF ) {
3413         if ( (bits64) ( aSig<<1 ) ) {
3414             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3415         }
3416         return packFloat32( aSign, 0xFF, 0 );
3417     }
3418     shift64RightJamming( aSig, 33, &aSig );
3419     if ( aExp || aSig ) aExp -= 0x3F81;
3420     return roundAndPackFloat32( aSign, aExp, aSig );
3421
3422 }
3423
3424 /*
3425 -------------------------------------------------------------------------------
3426 Returns the result of converting the extended double-precision floating-
3427 point value `a' to the double-precision floating-point format.  The
3428 conversion is performed according to the IEC/IEEE Standard for Binary
3429 Floating-Point Arithmetic.
3430 -------------------------------------------------------------------------------
3431 */
3432 float64 floatx80_to_float64( floatx80 a )
3433 {
3434     flag aSign;
3435     int32 aExp;
3436     bits64 aSig, zSig;
3437
3438     aSig = extractFloatx80Frac( a );
3439     aExp = extractFloatx80Exp( a );
3440     aSign = extractFloatx80Sign( a );
3441     if ( aExp == 0x7FFF ) {
3442         if ( (bits64) ( aSig<<1 ) ) {
3443             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3444         }
3445         return packFloat64( aSign, 0x7FF, 0 );
3446     }
3447     shift64RightJamming( aSig, 1, &zSig );
3448     if ( aExp || aSig ) aExp -= 0x3C01;
3449     return roundAndPackFloat64( aSign, aExp, zSig );
3450
3451 }
3452
3453 #ifdef FLOAT128
3454
3455 /*
3456 -------------------------------------------------------------------------------
3457 Returns the result of converting the extended double-precision floating-
3458 point value `a' to the quadruple-precision floating-point format.  The
3459 conversion is performed according to the IEC/IEEE Standard for Binary
3460 Floating-Point Arithmetic.
3461 -------------------------------------------------------------------------------
3462 */
3463 float128 floatx80_to_float128( floatx80 a )
3464 {
3465     flag aSign;
3466     int16 aExp;
3467     bits64 aSig, zSig0, zSig1;
3468
3469     aSig = extractFloatx80Frac( a );
3470     aExp = extractFloatx80Exp( a );
3471     aSign = extractFloatx80Sign( a );
3472     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3473         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3474     }
3475     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3476     return packFloat128( aSign, aExp, zSig0, zSig1 );
3477
3478 }
3479
3480 #endif
3481
3482 /*
3483 -------------------------------------------------------------------------------
3484 Rounds the extended double-precision floating-point value `a' to an integer,
3485 and returns the result as an extended quadruple-precision floating-point
3486 value.  The operation is performed according to the IEC/IEEE Standard for
3487 Binary Floating-Point Arithmetic.
3488 -------------------------------------------------------------------------------
3489 */
3490 floatx80 floatx80_round_to_int( floatx80 a )
3491 {
3492     flag aSign;
3493     int32 aExp;
3494     bits64 lastBitMask, roundBitsMask;
3495     int8 roundingMode;
3496     floatx80 z;
3497
3498     aExp = extractFloatx80Exp( a );
3499     if ( 0x403E <= aExp ) {
3500         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3501             return propagateFloatx80NaN( a, a );
3502         }
3503         return a;
3504     }
3505     if ( aExp < 0x3FFF ) {
3506         if (    ( aExp == 0 )
3507              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3508             return a;
3509         }
3510         float_exception_flags |= float_flag_inexact;
3511         aSign = extractFloatx80Sign( a );
3512         switch ( float_rounding_mode ) {
3513          case float_round_nearest_even:
3514             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3515                ) {
3516                 return
3517                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3518             }
3519             break;
3520          case float_round_to_zero:
3521             break;
3522          case float_round_down:
3523             return
3524                   aSign ?
3525                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3526                 : packFloatx80( 0, 0, 0 );
3527          case float_round_up:
3528             return
3529                   aSign ? packFloatx80( 1, 0, 0 )
3530                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3531         }
3532         return packFloatx80( aSign, 0, 0 );
3533     }
3534     lastBitMask = 1;
3535     lastBitMask <<= 0x403E - aExp;
3536     roundBitsMask = lastBitMask - 1;
3537     z = a;
3538     roundingMode = float_rounding_mode;
3539     if ( roundingMode == float_round_nearest_even ) {
3540         z.low += lastBitMask>>1;
3541         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3542     }
3543     else if ( roundingMode != float_round_to_zero ) {
3544         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3545             z.low += roundBitsMask;
3546         }
3547     }
3548     z.low &= ~ roundBitsMask;
3549     if ( z.low == 0 ) {
3550         ++z.high;
3551         z.low = LIT64( 0x8000000000000000 );
3552     }
3553     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3554     return z;
3555
3556 }
3557
3558 /*
3559 -------------------------------------------------------------------------------
3560 Returns the result of adding the absolute values of the extended double-
3561 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3562 negated before being returned.  `zSign' is ignored if the result is a NaN.
3563 The addition is performed according to the IEC/IEEE Standard for Binary
3564 Floating-Point Arithmetic.
3565 -------------------------------------------------------------------------------
3566 */
3567 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3568 {
3569     int32 aExp, bExp, zExp;
3570     bits64 aSig, bSig, zSig0, zSig1;
3571     int32 expDiff;
3572
3573     aSig = extractFloatx80Frac( a );
3574     aExp = extractFloatx80Exp( a );
3575     bSig = extractFloatx80Frac( b );
3576     bExp = extractFloatx80Exp( b );
3577     expDiff = aExp - bExp;
3578     if ( 0 < expDiff ) {
3579         if ( aExp == 0x7FFF ) {
3580             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3581             return a;
3582         }
3583         if ( bExp == 0 ) --expDiff;
3584         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3585         zExp = aExp;
3586     }
3587     else if ( expDiff < 0 ) {
3588         if ( bExp == 0x7FFF ) {
3589             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3590             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3591         }
3592         if ( aExp == 0 ) ++expDiff;
3593         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3594         zExp = bExp;
3595     }
3596     else {
3597         if ( aExp == 0x7FFF ) {
3598             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3599                 return propagateFloatx80NaN( a, b );
3600             }
3601             return a;
3602         }
3603         zSig1 = 0;
3604         zSig0 = aSig + bSig;
3605         if ( aExp == 0 ) {
3606             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3607             goto roundAndPack;
3608         }
3609         zExp = aExp;
3610         goto shiftRight1;
3611     }
3612     zSig0 = aSig + bSig;
3613     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3614  shiftRight1:
3615     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3616     zSig0 |= LIT64( 0x8000000000000000 );
3617     ++zExp;
3618  roundAndPack:
3619     return
3620         roundAndPackFloatx80(
3621             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3622
3623 }
3624
3625 /*
3626 -------------------------------------------------------------------------------
3627 Returns the result of subtracting the absolute values of the extended
3628 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3629 difference is negated before being returned.  `zSign' is ignored if the
3630 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3631 Standard for Binary Floating-Point Arithmetic.
3632 -------------------------------------------------------------------------------
3633 */
3634 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3635 {
3636     int32 aExp, bExp, zExp;
3637     bits64 aSig, bSig, zSig0, zSig1;
3638     int32 expDiff;
3639     floatx80 z;
3640
3641     aSig = extractFloatx80Frac( a );
3642     aExp = extractFloatx80Exp( a );
3643     bSig = extractFloatx80Frac( b );
3644     bExp = extractFloatx80Exp( b );
3645     expDiff = aExp - bExp;
3646     if ( 0 < expDiff ) goto aExpBigger;
3647     if ( expDiff < 0 ) goto bExpBigger;
3648     if ( aExp == 0x7FFF ) {
3649         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3650             return propagateFloatx80NaN( a, b );
3651         }
3652         float_raise( float_flag_invalid );
3653         z.low = floatx80_default_nan_low;
3654         z.high = floatx80_default_nan_high;
3655         return z;
3656     }
3657     if ( aExp == 0 ) {
3658         aExp = 1;
3659         bExp = 1;
3660     }
3661     zSig1 = 0;
3662     if ( bSig < aSig ) goto aBigger;
3663     if ( aSig < bSig ) goto bBigger;
3664     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3665  bExpBigger:
3666     if ( bExp == 0x7FFF ) {
3667         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3668         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3669     }
3670     if ( aExp == 0 ) ++expDiff;
3671     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3672  bBigger:
3673     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3674     zExp = bExp;
3675     zSign ^= 1;
3676     goto normalizeRoundAndPack;
3677  aExpBigger:
3678     if ( aExp == 0x7FFF ) {
3679         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3680         return a;
3681     }
3682     if ( bExp == 0 ) --expDiff;
3683     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3684  aBigger:
3685     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3686     zExp = aExp;
3687  normalizeRoundAndPack:
3688     return
3689         normalizeRoundAndPackFloatx80(
3690             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3691
3692 }
3693
3694 /*
3695 -------------------------------------------------------------------------------
3696 Returns the result of adding the extended double-precision floating-point
3697 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3698 Standard for Binary Floating-Point Arithmetic.
3699 -------------------------------------------------------------------------------
3700 */
3701 floatx80 floatx80_add( floatx80 a, floatx80 b )
3702 {
3703     flag aSign, bSign;
3704
3705     aSign = extractFloatx80Sign( a );
3706     bSign = extractFloatx80Sign( b );
3707     if ( aSign == bSign ) {
3708         return addFloatx80Sigs( a, b, aSign );
3709     }
3710     else {
3711         return subFloatx80Sigs( a, b, aSign );
3712     }
3713
3714 }
3715
3716 /*
3717 -------------------------------------------------------------------------------
3718 Returns the result of subtracting the extended double-precision floating-
3719 point values `a' and `b'.  The operation is performed according to the
3720 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3721 -------------------------------------------------------------------------------
3722 */
3723 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3724 {
3725     flag aSign, bSign;
3726
3727     aSign = extractFloatx80Sign( a );
3728     bSign = extractFloatx80Sign( b );
3729     if ( aSign == bSign ) {
3730         return subFloatx80Sigs( a, b, aSign );
3731     }
3732     else {
3733         return addFloatx80Sigs( a, b, aSign );
3734     }
3735
3736 }
3737
3738 /*
3739 -------------------------------------------------------------------------------
3740 Returns the result of multiplying the extended double-precision floating-
3741 point values `a' and `b'.  The operation is performed according to the
3742 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3743 -------------------------------------------------------------------------------
3744 */
3745 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3746 {
3747     flag aSign, bSign, zSign;
3748     int32 aExp, bExp, zExp;
3749     bits64 aSig, bSig, zSig0, zSig1;
3750     floatx80 z;
3751
3752     aSig = extractFloatx80Frac( a );
3753     aExp = extractFloatx80Exp( a );
3754     aSign = extractFloatx80Sign( a );
3755     bSig = extractFloatx80Frac( b );
3756     bExp = extractFloatx80Exp( b );
3757     bSign = extractFloatx80Sign( b );
3758     zSign = aSign ^ bSign;
3759     if ( aExp == 0x7FFF ) {
3760         if (    (bits64) ( aSig<<1 )
3761              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3762             return propagateFloatx80NaN( a, b );
3763         }
3764         if ( ( bExp | bSig ) == 0 ) goto invalid;
3765         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3766     }
3767     if ( bExp == 0x7FFF ) {
3768         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3769         if ( ( aExp | aSig ) == 0 ) {
3770  invalid:
3771             float_raise( float_flag_invalid );
3772             z.low = floatx80_default_nan_low;
3773             z.high = floatx80_default_nan_high;
3774             return z;
3775         }
3776         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3777     }
3778     if ( aExp == 0 ) {
3779         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3780         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3781     }
3782     if ( bExp == 0 ) {
3783         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3784         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3785     }
3786     zExp = aExp + bExp - 0x3FFE;
3787     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3788     if ( 0 < (sbits64) zSig0 ) {
3789         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3790         --zExp;
3791     }
3792     return
3793         roundAndPackFloatx80(
3794             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3795
3796 }
3797
3798 /*
3799 -------------------------------------------------------------------------------
3800 Returns the result of dividing the extended double-precision floating-point
3801 value `a' by the corresponding value `b'.  The operation is performed
3802 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3803 -------------------------------------------------------------------------------
3804 */
3805 floatx80 floatx80_div( floatx80 a, floatx80 b )
3806 {
3807     flag aSign, bSign, zSign;
3808     int32 aExp, bExp, zExp;
3809     bits64 aSig, bSig, zSig0, zSig1;
3810     bits64 rem0, rem1, rem2, term0, term1, term2;
3811     floatx80 z;
3812
3813     aSig = extractFloatx80Frac( a );
3814     aExp = extractFloatx80Exp( a );
3815     aSign = extractFloatx80Sign( a );
3816     bSig = extractFloatx80Frac( b );
3817     bExp = extractFloatx80Exp( b );
3818     bSign = extractFloatx80Sign( b );
3819     zSign = aSign ^ bSign;
3820     if ( aExp == 0x7FFF ) {
3821         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822         if ( bExp == 0x7FFF ) {
3823             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3824             goto invalid;
3825         }
3826         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3827     }
3828     if ( bExp == 0x7FFF ) {
3829         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3830         return packFloatx80( zSign, 0, 0 );
3831     }
3832     if ( bExp == 0 ) {
3833         if ( bSig == 0 ) {
3834             if ( ( aExp | aSig ) == 0 ) {
3835  invalid:
3836                 float_raise( float_flag_invalid );
3837                 z.low = floatx80_default_nan_low;
3838                 z.high = floatx80_default_nan_high;
3839                 return z;
3840             }
3841             float_raise( float_flag_divbyzero );
3842             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3843         }
3844         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3845     }
3846     if ( aExp == 0 ) {
3847         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3848         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3849     }
3850     zExp = aExp - bExp + 0x3FFE;
3851     rem1 = 0;
3852     if ( bSig <= aSig ) {
3853         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3854         ++zExp;
3855     }
3856     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3857     mul64To128( bSig, zSig0, &term0, &term1 );
3858     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3859     while ( (sbits64) rem0 < 0 ) {
3860         --zSig0;
3861         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3862     }
3863     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3864     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3865         mul64To128( bSig, zSig1, &term1, &term2 );
3866         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867         while ( (sbits64) rem1 < 0 ) {
3868             --zSig1;
3869             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3870         }
3871         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3872     }
3873     return
3874         roundAndPackFloatx80(
3875             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3876
3877 }
3878
3879 /*
3880 -------------------------------------------------------------------------------
3881 Returns the remainder of the extended double-precision floating-point value
3882 `a' with respect to the corresponding value `b'.  The operation is performed
3883 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3884 -------------------------------------------------------------------------------
3885 */
3886 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3887 {
3888     flag aSign, bSign, zSign;
3889     int32 aExp, bExp, expDiff;
3890     bits64 aSig0, aSig1, bSig;
3891     bits64 q, term0, term1, alternateASig0, alternateASig1;
3892     floatx80 z;
3893
3894     aSig0 = extractFloatx80Frac( a );
3895     aExp = extractFloatx80Exp( a );
3896     aSign = extractFloatx80Sign( a );
3897     bSig = extractFloatx80Frac( b );
3898     bExp = extractFloatx80Exp( b );
3899     bSign = extractFloatx80Sign( b );
3900     if ( aExp == 0x7FFF ) {
3901         if (    (bits64) ( aSig0<<1 )
3902              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3903             return propagateFloatx80NaN( a, b );
3904         }
3905         goto invalid;
3906     }
3907     if ( bExp == 0x7FFF ) {
3908         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3909         return a;
3910     }
3911     if ( bExp == 0 ) {
3912         if ( bSig == 0 ) {
3913  invalid:
3914             float_raise( float_flag_invalid );
3915             z.low = floatx80_default_nan_low;
3916             z.high = floatx80_default_nan_high;
3917             return z;
3918         }
3919         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3920     }
3921     if ( aExp == 0 ) {
3922         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3923         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3924     }
3925     bSig |= LIT64( 0x8000000000000000 );
3926     zSign = aSign;
3927     expDiff = aExp - bExp;
3928     aSig1 = 0;
3929     if ( expDiff < 0 ) {
3930         if ( expDiff < -1 ) return a;
3931         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3932         expDiff = 0;
3933     }
3934     q = ( bSig <= aSig0 );
3935     if ( q ) aSig0 -= bSig;
3936     expDiff -= 64;
3937     while ( 0 < expDiff ) {
3938         q = estimateDiv128To64( aSig0, aSig1, bSig );
3939         q = ( 2 < q ) ? q - 2 : 0;
3940         mul64To128( bSig, q, &term0, &term1 );
3941         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3942         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3943         expDiff -= 62;
3944     }
3945     expDiff += 64;
3946     if ( 0 < expDiff ) {
3947         q = estimateDiv128To64( aSig0, aSig1, bSig );
3948         q = ( 2 < q ) ? q - 2 : 0;
3949         q >>= 64 - expDiff;
3950         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3951         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3952         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3953         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3954             ++q;
3955             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3956         }
3957     }
3958     else {
3959         term1 = 0;
3960         term0 = bSig;
3961     }
3962     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3963     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3964          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3965               && ( q & 1 ) )
3966        ) {
3967         aSig0 = alternateASig0;
3968         aSig1 = alternateASig1;
3969         zSign = ! zSign;
3970     }
3971     return
3972         normalizeRoundAndPackFloatx80(
3973             80, zSign, bExp + expDiff, aSig0, aSig1 );
3974
3975 }
3976
3977 /*
3978 -------------------------------------------------------------------------------
3979 Returns the square root of the extended double-precision floating-point
3980 value `a'.  The operation is performed according to the IEC/IEEE Standard
3981 for Binary Floating-Point Arithmetic.
3982 -------------------------------------------------------------------------------
3983 */
3984 floatx80 floatx80_sqrt( floatx80 a )
3985 {
3986     flag aSign;
3987     int32 aExp, zExp;
3988     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3989     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3990     floatx80 z;
3991
3992     aSig0 = extractFloatx80Frac( a );
3993     aExp = extractFloatx80Exp( a );
3994     aSign = extractFloatx80Sign( a );
3995     if ( aExp == 0x7FFF ) {
3996         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3997         if ( ! aSign ) return a;
3998         goto invalid;
3999     }
4000     if ( aSign ) {
4001         if ( ( aExp | aSig0 ) == 0 ) return a;
4002  invalid:
4003         float_raise( float_flag_invalid );
4004         z.low = floatx80_default_nan_low;
4005         z.high = floatx80_default_nan_high;
4006         return z;
4007     }
4008     if ( aExp == 0 ) {
4009         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4010         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4011     }
4012     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4013     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4014     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4015     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4016     doubleZSig0 = zSig0<<1;
4017     mul64To128( zSig0, zSig0, &term0, &term1 );
4018     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4019     while ( (sbits64) rem0 < 0 ) {
4020         --zSig0;
4021         doubleZSig0 -= 2;
4022         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4023     }
4024     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4025     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4026         if ( zSig1 == 0 ) zSig1 = 1;
4027         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4028         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4029         mul64To128( zSig1, zSig1, &term2, &term3 );
4030         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4031         while ( (sbits64) rem1 < 0 ) {
4032             --zSig1;
4033             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4034             term3 |= 1;
4035             term2 |= doubleZSig0;
4036             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4037         }
4038         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4039     }
4040     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4041     zSig0 |= doubleZSig0;
4042     return
4043         roundAndPackFloatx80(
4044             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4045
4046 }
4047
4048 /*
4049 -------------------------------------------------------------------------------
4050 Returns 1 if the extended double-precision floating-point value `a' is
4051 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4052 performed according to the IEC/IEEE Standard for Binary Floating-Point
4053 Arithmetic.
4054 -------------------------------------------------------------------------------
4055 */
4056 flag floatx80_eq( floatx80 a, floatx80 b )
4057 {
4058
4059     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4060               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4061          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4062               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4063        ) {
4064         if (    floatx80_is_signaling_nan( a )
4065              || floatx80_is_signaling_nan( b ) ) {
4066             float_raise( float_flag_invalid );
4067         }
4068         return 0;
4069     }
4070     return
4071            ( a.low == b.low )
4072         && (    ( a.high == b.high )
4073              || (    ( a.low == 0 )
4074                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4075            );
4076
4077 }
4078
4079 /*
4080 -------------------------------------------------------------------------------
4081 Returns 1 if the extended double-precision floating-point value `a' is
4082 less than or equal to the corresponding value `b', and 0 otherwise.  The
4083 comparison is performed according to the IEC/IEEE Standard for Binary
4084 Floating-Point Arithmetic.
4085 -------------------------------------------------------------------------------
4086 */
4087 flag floatx80_le( floatx80 a, floatx80 b )
4088 {
4089     flag aSign, bSign;
4090
4091     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4092               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4093          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4094               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4095        ) {
4096         float_raise( float_flag_invalid );
4097         return 0;
4098     }
4099     aSign = extractFloatx80Sign( a );
4100     bSign = extractFloatx80Sign( b );
4101     if ( aSign != bSign ) {
4102         return
4103                aSign
4104             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4105                  == 0 );
4106     }
4107     return
4108           aSign ? le128( b.high, b.low, a.high, a.low )
4109         : le128( a.high, a.low, b.high, b.low );
4110
4111 }
4112
4113 /*
4114 -------------------------------------------------------------------------------
4115 Returns 1 if the extended double-precision floating-point value `a' is
4116 less than the corresponding value `b', and 0 otherwise.  The comparison
4117 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118 Arithmetic.
4119 -------------------------------------------------------------------------------
4120 */
4121 flag floatx80_lt( floatx80 a, floatx80 b )
4122 {
4123     flag aSign, bSign;
4124
4125     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4126               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4127          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4128               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4129        ) {
4130         float_raise( float_flag_invalid );
4131         return 0;
4132     }
4133     aSign = extractFloatx80Sign( a );
4134     bSign = extractFloatx80Sign( b );
4135     if ( aSign != bSign ) {
4136         return
4137                aSign
4138             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4139                  != 0 );
4140     }
4141     return
4142           aSign ? lt128( b.high, b.low, a.high, a.low )
4143         : lt128( a.high, a.low, b.high, b.low );
4144
4145 }
4146
4147 /*
4148 -------------------------------------------------------------------------------
4149 Returns 1 if the extended double-precision floating-point value `a' is equal
4150 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4151 raised if either operand is a NaN.  Otherwise, the comparison is performed
4152 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4153 -------------------------------------------------------------------------------
4154 */
4155 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4156 {
4157
4158     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4159               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4160          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4161               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4162        ) {
4163         float_raise( float_flag_invalid );
4164         return 0;
4165     }
4166     return
4167            ( a.low == b.low )
4168         && (    ( a.high == b.high )
4169              || (    ( a.low == 0 )
4170                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4171            );
4172
4173 }
4174
4175 /*
4176 -------------------------------------------------------------------------------
4177 Returns 1 if the extended double-precision floating-point value `a' is less
4178 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4179 do not cause an exception.  Otherwise, the comparison is performed according
4180 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 -------------------------------------------------------------------------------
4182 */
4183 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4184 {
4185     flag aSign, bSign;
4186
4187     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4188               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4189          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4190               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4191        ) {
4192         if (    floatx80_is_signaling_nan( a )
4193              || floatx80_is_signaling_nan( b ) ) {
4194             float_raise( float_flag_invalid );
4195         }
4196         return 0;
4197     }
4198     aSign = extractFloatx80Sign( a );
4199     bSign = extractFloatx80Sign( b );
4200     if ( aSign != bSign ) {
4201         return
4202                aSign
4203             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4204                  == 0 );
4205     }
4206     return
4207           aSign ? le128( b.high, b.low, a.high, a.low )
4208         : le128( a.high, a.low, b.high, b.low );
4209
4210 }
4211
4212 /*
4213 -------------------------------------------------------------------------------
4214 Returns 1 if the extended double-precision floating-point value `a' is less
4215 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4216 an exception.  Otherwise, the comparison is performed according to the
4217 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4218 -------------------------------------------------------------------------------
4219 */
4220 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4221 {
4222     flag aSign, bSign;
4223
4224     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4225               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4226          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4227               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4228        ) {
4229         if (    floatx80_is_signaling_nan( a )
4230              || floatx80_is_signaling_nan( b ) ) {
4231             float_raise( float_flag_invalid );
4232         }
4233         return 0;
4234     }
4235     aSign = extractFloatx80Sign( a );
4236     bSign = extractFloatx80Sign( b );
4237     if ( aSign != bSign ) {
4238         return
4239                aSign
4240             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4241                  != 0 );
4242     }
4243     return
4244           aSign ? lt128( b.high, b.low, a.high, a.low )
4245         : lt128( a.high, a.low, b.high, b.low );
4246
4247 }
4248
4249 #endif
4250
4251 #ifdef FLOAT128
4252
4253 /*
4254 -------------------------------------------------------------------------------
4255 Returns the result of converting the quadruple-precision floating-point
4256 value `a' to the 32-bit two's complement integer format.  The conversion
4257 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 Arithmetic---which means in particular that the conversion is rounded
4259 according to the current rounding mode.  If `a' is a NaN, the largest
4260 positive integer is returned.  Otherwise, if the conversion overflows, the
4261 largest integer with the same sign as `a' is returned.
4262 -------------------------------------------------------------------------------
4263 */
4264 int32 float128_to_int32( float128 a )
4265 {
4266     flag aSign;
4267     int32 aExp, shiftCount;
4268     bits64 aSig0, aSig1;
4269
4270     aSig1 = extractFloat128Frac1( a );
4271     aSig0 = extractFloat128Frac0( a );
4272     aExp = extractFloat128Exp( a );
4273     aSign = extractFloat128Sign( a );
4274     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4275     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4276     aSig0 |= ( aSig1 != 0 );
4277     shiftCount = 0x4028 - aExp;
4278     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4279     return roundAndPackInt32( aSign, aSig0 );
4280
4281 }
4282
4283 /*
4284 -------------------------------------------------------------------------------
4285 Returns the result of converting the quadruple-precision floating-point
4286 value `a' to the 32-bit two's complement integer format.  The conversion
4287 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288 Arithmetic, except that the conversion is always rounded toward zero.  If
4289 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4290 conversion overflows, the largest integer with the same sign as `a' is
4291 returned.
4292 -------------------------------------------------------------------------------
4293 */
4294 int32 float128_to_int32_round_to_zero( float128 a )
4295 {
4296     flag aSign;
4297     int32 aExp, shiftCount;
4298     bits64 aSig0, aSig1, savedASig;
4299     int32 z;
4300
4301     aSig1 = extractFloat128Frac1( a );
4302     aSig0 = extractFloat128Frac0( a );
4303     aExp = extractFloat128Exp( a );
4304     aSign = extractFloat128Sign( a );
4305     aSig0 |= ( aSig1 != 0 );
4306     if ( 0x401E < aExp ) {
4307         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4308         goto invalid;
4309     }
4310     else if ( aExp < 0x3FFF ) {
4311         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4312         return 0;
4313     }
4314     aSig0 |= LIT64( 0x0001000000000000 );
4315     shiftCount = 0x402F - aExp;
4316     savedASig = aSig0;
4317     aSig0 >>= shiftCount;
4318     z = aSig0;
4319     if ( aSign ) z = - z;
4320     if ( ( z < 0 ) ^ aSign ) {
4321  invalid:
4322         float_raise( float_flag_invalid );
4323         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4324     }
4325     if ( ( aSig0<<shiftCount ) != savedASig ) {
4326         float_exception_flags |= float_flag_inexact;
4327     }
4328     return z;
4329
4330 }
4331
4332 /*
4333 -------------------------------------------------------------------------------
4334 Returns the result of converting the quadruple-precision floating-point
4335 value `a' to the 64-bit two's complement integer format.  The conversion
4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 Arithmetic---which means in particular that the conversion is rounded
4338 according to the current rounding mode.  If `a' is a NaN, the largest
4339 positive integer is returned.  Otherwise, if the conversion overflows, the
4340 largest integer with the same sign as `a' is returned.
4341 -------------------------------------------------------------------------------
4342 */
4343 int64 float128_to_int64( float128 a )
4344 {
4345     flag aSign;
4346     int32 aExp, shiftCount;
4347     bits64 aSig0, aSig1;
4348
4349     aSig1 = extractFloat128Frac1( a );
4350     aSig0 = extractFloat128Frac0( a );
4351     aExp = extractFloat128Exp( a );
4352     aSign = extractFloat128Sign( a );
4353     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4354     shiftCount = 0x402F - aExp;
4355     if ( shiftCount <= 0 ) {
4356         if ( 0x403E < aExp ) {
4357             float_raise( float_flag_invalid );
4358             if (    ! aSign
4359                  || (    ( aExp == 0x7FFF )
4360                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4361                     )
4362                ) {
4363                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4364             }
4365             return (sbits64) LIT64( 0x8000000000000000 );
4366         }
4367         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4368     }
4369     else {
4370         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4371     }
4372     return roundAndPackInt64( aSign, aSig0, aSig1 );
4373
4374 }
4375
4376 /*
4377 -------------------------------------------------------------------------------
4378 Returns the result of converting the quadruple-precision floating-point
4379 value `a' to the 64-bit two's complement integer format.  The conversion
4380 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4381 Arithmetic, except that the conversion is always rounded toward zero.
4382 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4383 the conversion overflows, the largest integer with the same sign as `a' is
4384 returned.
4385 -------------------------------------------------------------------------------
4386 */
4387 int64 float128_to_int64_round_to_zero( float128 a )
4388 {
4389     flag aSign;
4390     int32 aExp, shiftCount;
4391     bits64 aSig0, aSig1;
4392     int64 z;
4393
4394     aSig1 = extractFloat128Frac1( a );
4395     aSig0 = extractFloat128Frac0( a );
4396     aExp = extractFloat128Exp( a );
4397     aSign = extractFloat128Sign( a );
4398     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4399     shiftCount = aExp - 0x402F;
4400     if ( 0 < shiftCount ) {
4401         if ( 0x403E <= aExp ) {
4402             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4403             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4404                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4405                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4406             }
4407             else {
4408                 float_raise( float_flag_invalid );
4409                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4410                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4411                 }
4412             }
4413             return (sbits64) LIT64( 0x8000000000000000 );
4414         }
4415         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4416         if ( (bits64) ( aSig1<<shiftCount ) ) {
4417             float_exception_flags |= float_flag_inexact;
4418         }
4419     }
4420     else {
4421         if ( aExp < 0x3FFF ) {
4422             if ( aExp | aSig0 | aSig1 ) {
4423                 float_exception_flags |= float_flag_inexact;
4424             }
4425             return 0;
4426         }
4427         z = aSig0>>( - shiftCount );
4428         if (    aSig1
4429              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4430             float_exception_flags |= float_flag_inexact;
4431         }
4432     }
4433     if ( aSign ) z = - z;
4434     return z;
4435
4436 }
4437
4438 /*
4439  * just like above - but do not care for overflow of signed results
4440  */
4441 uint64 float128_to_uint64_round_to_zero( float128 a )
4442 {
4443     flag aSign;
4444     int32 aExp, shiftCount;
4445     bits64 aSig0, aSig1;
4446     uint64 z;
4447
4448     aSig1 = extractFloat128Frac1( a );
4449     aSig0 = extractFloat128Frac0( a );
4450     aExp = extractFloat128Exp( a );
4451     aSign = extractFloat128Sign( a );
4452     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4453     shiftCount = aExp - 0x402F;
4454     if ( 0 < shiftCount ) {
4455         if ( 0x403F <= aExp ) {
4456             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4457             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4458                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4459                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4460             }
4461             else {
4462                 float_raise( float_flag_invalid );
4463             }
4464             return LIT64( 0xFFFFFFFFFFFFFFFF );
4465         }
4466         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4467         if ( (bits64) ( aSig1<<shiftCount ) ) {
4468             float_exception_flags |= float_flag_inexact;
4469         }
4470     }
4471     else {
4472         if ( aExp < 0x3FFF ) {
4473             if ( aExp | aSig0 | aSig1 ) {
4474                 float_exception_flags |= float_flag_inexact;
4475             }
4476             return 0;
4477         }
4478         z = aSig0>>( - shiftCount );
4479         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4480             float_exception_flags |= float_flag_inexact;
4481         }
4482     }
4483     if ( aSign ) z = - z;
4484     return z;
4485
4486 }
4487
4488 /*
4489 -------------------------------------------------------------------------------
4490 Returns the result of converting the quadruple-precision floating-point
4491 value `a' to the single-precision floating-point format.  The conversion
4492 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4493 Arithmetic.
4494 -------------------------------------------------------------------------------
4495 */
4496 float32 float128_to_float32( float128 a )
4497 {
4498     flag aSign;
4499     int32 aExp;
4500     bits64 aSig0, aSig1;
4501     bits32 zSig;
4502
4503     aSig1 = extractFloat128Frac1( a );
4504     aSig0 = extractFloat128Frac0( a );
4505     aExp = extractFloat128Exp( a );
4506     aSign = extractFloat128Sign( a );
4507     if ( aExp == 0x7FFF ) {
4508         if ( aSig0 | aSig1 ) {
4509             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4510         }
4511         return packFloat32( aSign, 0xFF, 0 );
4512     }
4513     aSig0 |= ( aSig1 != 0 );
4514     shift64RightJamming( aSig0, 18, &aSig0 );
4515     zSig = aSig0;
4516     if ( aExp || zSig ) {
4517         zSig |= 0x40000000;
4518         aExp -= 0x3F81;
4519     }
4520     return roundAndPackFloat32( aSign, aExp, zSig );
4521
4522 }
4523
4524 /*
4525 -------------------------------------------------------------------------------
4526 Returns the result of converting the quadruple-precision floating-point
4527 value `a' to the double-precision floating-point format.  The conversion
4528 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4529 Arithmetic.
4530 -------------------------------------------------------------------------------
4531 */
4532 float64 float128_to_float64( float128 a )
4533 {
4534     flag aSign;
4535     int32 aExp;
4536     bits64 aSig0, aSig1;
4537
4538     aSig1 = extractFloat128Frac1( a );
4539     aSig0 = extractFloat128Frac0( a );
4540     aExp = extractFloat128Exp( a );
4541     aSign = extractFloat128Sign( a );
4542     if ( aExp == 0x7FFF ) {
4543         if ( aSig0 | aSig1 ) {
4544             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4545         }
4546         return packFloat64( aSign, 0x7FF, 0 );
4547     }
4548     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4549     aSig0 |= ( aSig1 != 0 );
4550     if ( aExp || aSig0 ) {
4551         aSig0 |= LIT64( 0x4000000000000000 );
4552         aExp -= 0x3C01;
4553     }
4554     return roundAndPackFloat64( aSign, aExp, aSig0 );
4555
4556 }
4557
4558 #ifdef FLOATX80
4559
4560 /*
4561 -------------------------------------------------------------------------------
4562 Returns the result of converting the quadruple-precision floating-point
4563 value `a' to the extended double-precision floating-point format.  The
4564 conversion is performed according to the IEC/IEEE Standard for Binary
4565 Floating-Point Arithmetic.
4566 -------------------------------------------------------------------------------
4567 */
4568 floatx80 float128_to_floatx80( float128 a )
4569 {
4570     flag aSign;
4571     int32 aExp;
4572     bits64 aSig0, aSig1;
4573
4574     aSig1 = extractFloat128Frac1( a );
4575     aSig0 = extractFloat128Frac0( a );
4576     aExp = extractFloat128Exp( a );
4577     aSign = extractFloat128Sign( a );
4578     if ( aExp == 0x7FFF ) {
4579         if ( aSig0 | aSig1 ) {
4580             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4581         }
4582         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4583     }
4584     if ( aExp == 0 ) {
4585         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4586         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4587     }
4588     else {
4589         aSig0 |= LIT64( 0x0001000000000000 );
4590     }
4591     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4592     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4593
4594 }
4595
4596 #endif
4597
4598 /*
4599 -------------------------------------------------------------------------------
4600 Rounds the quadruple-precision floating-point value `a' to an integer, and
4601 returns the result as a quadruple-precision floating-point value.  The
4602 operation is performed according to the IEC/IEEE Standard for Binary
4603 Floating-Point Arithmetic.
4604 -------------------------------------------------------------------------------
4605 */
4606 float128 float128_round_to_int( float128 a )
4607 {
4608     flag aSign;
4609     int32 aExp;
4610     bits64 lastBitMask, roundBitsMask;
4611     int8 roundingMode;
4612     float128 z;
4613
4614     aExp = extractFloat128Exp( a );
4615     if ( 0x402F <= aExp ) {
4616         if ( 0x406F <= aExp ) {
4617             if (    ( aExp == 0x7FFF )
4618                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4619                ) {
4620                 return propagateFloat128NaN( a, a );
4621             }
4622             return a;
4623         }
4624         lastBitMask = 1;
4625         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4626         roundBitsMask = lastBitMask - 1;
4627         z = a;
4628         roundingMode = float_rounding_mode;
4629         if ( roundingMode == float_round_nearest_even ) {
4630             if ( lastBitMask ) {
4631                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4632                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4633             }
4634             else {
4635                 if ( (sbits64) z.low < 0 ) {
4636                     ++z.high;
4637                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4638                 }
4639             }
4640         }
4641         else if ( roundingMode != float_round_to_zero ) {
4642             if (   extractFloat128Sign( z )
4643                  ^ ( roundingMode == float_round_up ) ) {
4644                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4645             }
4646         }
4647         z.low &= ~ roundBitsMask;
4648     }
4649     else {
4650         if ( aExp < 0x3FFF ) {
4651             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4652             float_exception_flags |= float_flag_inexact;
4653             aSign = extractFloat128Sign( a );
4654             switch ( float_rounding_mode ) {
4655              case float_round_nearest_even:
4656                 if (    ( aExp == 0x3FFE )
4657                      && (   extractFloat128Frac0( a )
4658                           | extractFloat128Frac1( a ) )
4659                    ) {
4660                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4661                 }
4662                 break;
4663              case float_round_to_zero:
4664                 break;
4665              case float_round_down:
4666                 return
4667                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4668                     : packFloat128( 0, 0, 0, 0 );
4669              case float_round_up:
4670                 return
4671                       aSign ? packFloat128( 1, 0, 0, 0 )
4672                     : packFloat128( 0, 0x3FFF, 0, 0 );
4673             }
4674             return packFloat128( aSign, 0, 0, 0 );
4675         }
4676         lastBitMask = 1;
4677         lastBitMask <<= 0x402F - aExp;
4678         roundBitsMask = lastBitMask - 1;
4679         z.low = 0;
4680         z.high = a.high;
4681         roundingMode = float_rounding_mode;
4682         if ( roundingMode == float_round_nearest_even ) {
4683             z.high += lastBitMask>>1;
4684             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4685                 z.high &= ~ lastBitMask;
4686             }
4687         }
4688         else if ( roundingMode != float_round_to_zero ) {
4689             if (   extractFloat128Sign( z )
4690                  ^ ( roundingMode == float_round_up ) ) {
4691                 z.high |= ( a.low != 0 );
4692                 z.high += roundBitsMask;
4693             }
4694         }
4695         z.high &= ~ roundBitsMask;
4696     }
4697     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4698         float_exception_flags |= float_flag_inexact;
4699     }
4700     return z;
4701
4702 }
4703
4704 /*
4705 -------------------------------------------------------------------------------
4706 Returns the result of adding the absolute values of the quadruple-precision
4707 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4708 before being returned.  `zSign' is ignored if the result is a NaN.
4709 The addition is performed according to the IEC/IEEE Standard for Binary
4710 Floating-Point Arithmetic.
4711 -------------------------------------------------------------------------------
4712 */
4713 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4714 {
4715     int32 aExp, bExp, zExp;
4716     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4717     int32 expDiff;
4718
4719     aSig1 = extractFloat128Frac1( a );
4720     aSig0 = extractFloat128Frac0( a );
4721     aExp = extractFloat128Exp( a );
4722     bSig1 = extractFloat128Frac1( b );
4723     bSig0 = extractFloat128Frac0( b );
4724     bExp = extractFloat128Exp( b );
4725     expDiff = aExp - bExp;
4726     if ( 0 < expDiff ) {
4727         if ( aExp == 0x7FFF ) {
4728             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4729             return a;
4730         }
4731         if ( bExp == 0 ) {
4732             --expDiff;
4733         }
4734         else {
4735             bSig0 |= LIT64( 0x0001000000000000 );
4736         }
4737         shift128ExtraRightJamming(
4738             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4739         zExp = aExp;
4740     }
4741     else if ( expDiff < 0 ) {
4742         if ( bExp == 0x7FFF ) {
4743             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4744             return packFloat128( zSign, 0x7FFF, 0, 0 );
4745         }
4746         if ( aExp == 0 ) {
4747             ++expDiff;
4748         }
4749         else {
4750             aSig0 |= LIT64( 0x0001000000000000 );
4751         }
4752         shift128ExtraRightJamming(
4753             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4754         zExp = bExp;
4755     }
4756     else {
4757         if ( aExp == 0x7FFF ) {
4758             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4759                 return propagateFloat128NaN( a, b );
4760             }
4761             return a;
4762         }
4763         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4764         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4765         zSig2 = 0;
4766         zSig0 |= LIT64( 0x0002000000000000 );
4767         zExp = aExp;
4768         goto shiftRight1;
4769     }
4770     aSig0 |= LIT64( 0x0001000000000000 );
4771     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4772     --zExp;
4773     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4774     ++zExp;
4775  shiftRight1:
4776     shift128ExtraRightJamming(
4777         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4778  roundAndPack:
4779     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4780
4781 }
4782
4783 /*
4784 -------------------------------------------------------------------------------
4785 Returns the result of subtracting the absolute values of the quadruple-
4786 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4787 difference is negated before being returned.  `zSign' is ignored if the
4788 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4789 Standard for Binary Floating-Point Arithmetic.
4790 -------------------------------------------------------------------------------
4791 */
4792 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4793 {
4794     int32 aExp, bExp, zExp;
4795     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4796     int32 expDiff;
4797     float128 z;
4798
4799     aSig1 = extractFloat128Frac1( a );
4800     aSig0 = extractFloat128Frac0( a );
4801     aExp = extractFloat128Exp( a );
4802     bSig1 = extractFloat128Frac1( b );
4803     bSig0 = extractFloat128Frac0( b );
4804     bExp = extractFloat128Exp( b );
4805     expDiff = aExp - bExp;
4806     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4807     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4808     if ( 0 < expDiff ) goto aExpBigger;
4809     if ( expDiff < 0 ) goto bExpBigger;
4810     if ( aExp == 0x7FFF ) {
4811         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4812             return propagateFloat128NaN( a, b );
4813         }
4814         float_raise( float_flag_invalid );
4815         z.low = float128_default_nan_low;
4816         z.high = float128_default_nan_high;
4817         return z;
4818     }
4819     if ( aExp == 0 ) {
4820         aExp = 1;
4821         bExp = 1;
4822     }
4823     if ( bSig0 < aSig0 ) goto aBigger;
4824     if ( aSig0 < bSig0 ) goto bBigger;
4825     if ( bSig1 < aSig1 ) goto aBigger;
4826     if ( aSig1 < bSig1 ) goto bBigger;
4827     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4828  bExpBigger:
4829     if ( bExp == 0x7FFF ) {
4830         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4831         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4832     }
4833     if ( aExp == 0 ) {
4834         ++expDiff;
4835     }
4836     else {
4837         aSig0 |= LIT64( 0x4000000000000000 );
4838     }
4839     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4840     bSig0 |= LIT64( 0x4000000000000000 );
4841  bBigger:
4842     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4843     zExp = bExp;
4844     zSign ^= 1;
4845     goto normalizeRoundAndPack;
4846  aExpBigger:
4847     if ( aExp == 0x7FFF ) {
4848         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4849         return a;
4850     }
4851     if ( bExp == 0 ) {
4852         --expDiff;
4853     }
4854     else {
4855         bSig0 |= LIT64( 0x4000000000000000 );
4856     }
4857     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4858     aSig0 |= LIT64( 0x4000000000000000 );
4859  aBigger:
4860     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4861     zExp = aExp;
4862  normalizeRoundAndPack:
4863     --zExp;
4864     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4865
4866 }
4867
4868 /*
4869 -------------------------------------------------------------------------------
4870 Returns the result of adding the quadruple-precision floating-point values
4871 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4872 for Binary Floating-Point Arithmetic.
4873 -------------------------------------------------------------------------------
4874 */
4875 float128 float128_add( float128 a, float128 b )
4876 {
4877     flag aSign, bSign;
4878
4879     aSign = extractFloat128Sign( a );
4880     bSign = extractFloat128Sign( b );
4881     if ( aSign == bSign ) {
4882         return addFloat128Sigs( a, b, aSign );
4883     }
4884     else {
4885         return subFloat128Sigs( a, b, aSign );
4886     }
4887
4888 }
4889
4890 /*
4891 -------------------------------------------------------------------------------
4892 Returns the result of subtracting the quadruple-precision floating-point
4893 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4894 Standard for Binary Floating-Point Arithmetic.
4895 -------------------------------------------------------------------------------
4896 */
4897 float128 float128_sub( float128 a, float128 b )
4898 {
4899     flag aSign, bSign;
4900
4901     aSign = extractFloat128Sign( a );
4902     bSign = extractFloat128Sign( b );
4903     if ( aSign == bSign ) {
4904         return subFloat128Sigs( a, b, aSign );
4905     }
4906     else {
4907         return addFloat128Sigs( a, b, aSign );
4908     }
4909
4910 }
4911
4912 /*
4913 -------------------------------------------------------------------------------
4914 Returns the result of multiplying the quadruple-precision floating-point
4915 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4916 Standard for Binary Floating-Point Arithmetic.
4917 -------------------------------------------------------------------------------
4918 */
4919 float128 float128_mul( float128 a, float128 b )
4920 {
4921     flag aSign, bSign, zSign;
4922     int32 aExp, bExp, zExp;
4923     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4924     float128 z;
4925
4926     aSig1 = extractFloat128Frac1( a );
4927     aSig0 = extractFloat128Frac0( a );
4928     aExp = extractFloat128Exp( a );
4929     aSign = extractFloat128Sign( a );
4930     bSig1 = extractFloat128Frac1( b );
4931     bSig0 = extractFloat128Frac0( b );
4932     bExp = extractFloat128Exp( b );
4933     bSign = extractFloat128Sign( b );
4934     zSign = aSign ^ bSign;
4935     if ( aExp == 0x7FFF ) {
4936         if (    ( aSig0 | aSig1 )
4937              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4938             return propagateFloat128NaN( a, b );
4939         }
4940         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4941         return packFloat128( zSign, 0x7FFF, 0, 0 );
4942     }
4943     if ( bExp == 0x7FFF ) {
4944         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4945         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4946  invalid:
4947             float_raise( float_flag_invalid );
4948             z.low = float128_default_nan_low;
4949             z.high = float128_default_nan_high;
4950             return z;
4951         }
4952         return packFloat128( zSign, 0x7FFF, 0, 0 );
4953     }
4954     if ( aExp == 0 ) {
4955         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4956         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4957     }
4958     if ( bExp == 0 ) {
4959         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4960         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4961     }
4962     zExp = aExp + bExp - 0x4000;
4963     aSig0 |= LIT64( 0x0001000000000000 );
4964     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4965     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4966     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4967     zSig2 |= ( zSig3 != 0 );
4968     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4969         shift128ExtraRightJamming(
4970             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4971         ++zExp;
4972     }
4973     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4974
4975 }
4976
4977 /*
4978 -------------------------------------------------------------------------------
4979 Returns the result of dividing the quadruple-precision floating-point value
4980 `a' by the corresponding value `b'.  The operation is performed according to
4981 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4982 -------------------------------------------------------------------------------
4983 */
4984 float128 float128_div( float128 a, float128 b )
4985 {
4986     flag aSign, bSign, zSign;
4987     int32 aExp, bExp, zExp;
4988     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4989     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4990     float128 z;
4991
4992     aSig1 = extractFloat128Frac1( a );
4993     aSig0 = extractFloat128Frac0( a );
4994     aExp = extractFloat128Exp( a );
4995     aSign = extractFloat128Sign( a );
4996     bSig1 = extractFloat128Frac1( b );
4997     bSig0 = extractFloat128Frac0( b );
4998     bExp = extractFloat128Exp( b );
4999     bSign = extractFloat128Sign( b );
5000     zSign = aSign ^ bSign;
5001     if ( aExp == 0x7FFF ) {
5002         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5003         if ( bExp == 0x7FFF ) {
5004             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5005             goto invalid;
5006         }
5007         return packFloat128( zSign, 0x7FFF, 0, 0 );
5008     }
5009     if ( bExp == 0x7FFF ) {
5010         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5011         return packFloat128( zSign, 0, 0, 0 );
5012     }
5013     if ( bExp == 0 ) {
5014         if ( ( bSig0 | bSig1 ) == 0 ) {
5015             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5016  invalid:
5017                 float_raise( float_flag_invalid );
5018                 z.low = float128_default_nan_low;
5019                 z.high = float128_default_nan_high;
5020                 return z;
5021             }
5022             float_raise( float_flag_divbyzero );
5023             return packFloat128( zSign, 0x7FFF, 0, 0 );
5024         }
5025         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5026     }
5027     if ( aExp == 0 ) {
5028         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5029         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5030     }
5031     zExp = aExp - bExp + 0x3FFD;
5032     shortShift128Left(
5033         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5034     shortShift128Left(
5035         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5036     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5037         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5038         ++zExp;
5039     }
5040     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5041     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5042     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5043     while ( (sbits64) rem0 < 0 ) {
5044         --zSig0;
5045         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5046     }
5047     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5048     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5049         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5050         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5051         while ( (sbits64) rem1 < 0 ) {
5052             --zSig1;
5053             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5054         }
5055         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5056     }
5057     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5058     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5059
5060 }
5061
5062 /*
5063 -------------------------------------------------------------------------------
5064 Returns the remainder of the quadruple-precision floating-point value `a'
5065 with respect to the corresponding value `b'.  The operation is performed
5066 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5067 -------------------------------------------------------------------------------
5068 */
5069 float128 float128_rem( float128 a, float128 b )
5070 {
5071     flag aSign, bSign, zSign;
5072     int32 aExp, bExp, expDiff;
5073     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5074     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5075     sbits64 sigMean0;
5076     float128 z;
5077
5078     aSig1 = extractFloat128Frac1( a );
5079     aSig0 = extractFloat128Frac0( a );
5080     aExp = extractFloat128Exp( a );
5081     aSign = extractFloat128Sign( a );
5082     bSig1 = extractFloat128Frac1( b );
5083     bSig0 = extractFloat128Frac0( b );
5084     bExp = extractFloat128Exp( b );
5085     bSign = extractFloat128Sign( b );
5086     if ( aExp == 0x7FFF ) {
5087         if (    ( aSig0 | aSig1 )
5088              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5089             return propagateFloat128NaN( a, b );
5090         }
5091         goto invalid;
5092     }
5093     if ( bExp == 0x7FFF ) {
5094         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5095         return a;
5096     }
5097     if ( bExp == 0 ) {
5098         if ( ( bSig0 | bSig1 ) == 0 ) {
5099  invalid:
5100             float_raise( float_flag_invalid );
5101             z.low = float128_default_nan_low;
5102             z.high = float128_default_nan_high;
5103             return z;
5104         }
5105         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5106     }
5107     if ( aExp == 0 ) {
5108         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5109         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5110     }
5111     expDiff = aExp - bExp;
5112     if ( expDiff < -1 ) return a;
5113     shortShift128Left(
5114         aSig0 | LIT64( 0x0001000000000000 ),
5115         aSig1,
5116         15 - ( expDiff < 0 ),
5117         &aSig0,
5118         &aSig1
5119     );
5120     shortShift128Left(
5121         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5122     q = le128( bSig0, bSig1, aSig0, aSig1 );
5123     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5124     expDiff -= 64;
5125     while ( 0 < expDiff ) {
5126         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5127         q = ( 4 < q ) ? q - 4 : 0;
5128         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5129         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5130         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5131         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5132         expDiff -= 61;
5133     }
5134     if ( -64 < expDiff ) {
5135         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5136         q = ( 4 < q ) ? q - 4 : 0;
5137         q >>= - expDiff;
5138         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5139         expDiff += 52;
5140         if ( expDiff < 0 ) {
5141             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5142         }
5143         else {
5144             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5145         }
5146         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5147         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5148     }
5149     else {
5150         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5151         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5152     }
5153     do {
5154         alternateASig0 = aSig0;
5155         alternateASig1 = aSig1;
5156         ++q;
5157         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5158     } while ( 0 <= (sbits64) aSig0 );
5159     add128(
5160         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5161     if (    ( sigMean0 < 0 )
5162          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5163         aSig0 = alternateASig0;
5164         aSig1 = alternateASig1;
5165     }
5166     zSign = ( (sbits64) aSig0 < 0 );
5167     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5168     return
5169         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5170
5171 }
5172
5173 /*
5174 -------------------------------------------------------------------------------
5175 Returns the square root of the quadruple-precision floating-point value `a'.
5176 The operation is performed according to the IEC/IEEE Standard for Binary
5177 Floating-Point Arithmetic.
5178 -------------------------------------------------------------------------------
5179 */
5180 float128 float128_sqrt( float128 a )
5181 {
5182     flag aSign;
5183     int32 aExp, zExp;
5184     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5185     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5186     float128 z;
5187
5188     aSig1 = extractFloat128Frac1( a );
5189     aSig0 = extractFloat128Frac0( a );
5190     aExp = extractFloat128Exp( a );
5191     aSign = extractFloat128Sign( a );
5192     if ( aExp == 0x7FFF ) {
5193         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5194         if ( ! aSign ) return a;
5195         goto invalid;
5196     }
5197     if ( aSign ) {
5198         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5199  invalid:
5200         float_raise( float_flag_invalid );
5201         z.low = float128_default_nan_low;
5202         z.high = float128_default_nan_high;
5203         return z;
5204     }
5205     if ( aExp == 0 ) {
5206         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5207         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5208     }
5209     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5210     aSig0 |= LIT64( 0x0001000000000000 );
5211     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5212     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5213     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5214     doubleZSig0 = zSig0<<1;
5215     mul64To128( zSig0, zSig0, &term0, &term1 );
5216     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5217     while ( (sbits64) rem0 < 0 ) {
5218         --zSig0;
5219         doubleZSig0 -= 2;
5220         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5221     }
5222     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5223     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5224         if ( zSig1 == 0 ) zSig1 = 1;
5225         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5226         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5227         mul64To128( zSig1, zSig1, &term2, &term3 );
5228         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5229         while ( (sbits64) rem1 < 0 ) {
5230             --zSig1;
5231             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5232             term3 |= 1;
5233             term2 |= doubleZSig0;
5234             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5235         }
5236         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5237     }
5238     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5239     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5240
5241 }
5242
5243 /*
5244 -------------------------------------------------------------------------------
5245 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5246 the corresponding value `b', and 0 otherwise.  The comparison is performed
5247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 -------------------------------------------------------------------------------
5249 */
5250 flag float128_eq( float128 a, float128 b )
5251 {
5252
5253     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5254               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5255          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5256               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5257        ) {
5258         if (    float128_is_signaling_nan( a )
5259              || float128_is_signaling_nan( b ) ) {
5260             float_raise( float_flag_invalid );
5261         }
5262         return 0;
5263     }
5264     return
5265            ( a.low == b.low )
5266         && (    ( a.high == b.high )
5267              || (    ( a.low == 0 )
5268                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5269            );
5270
5271 }
5272
5273 /*
5274 -------------------------------------------------------------------------------
5275 Returns 1 if the quadruple-precision floating-point value `a' is less than
5276 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5277 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5278 Arithmetic.
5279 -------------------------------------------------------------------------------
5280 */
5281 flag float128_le( float128 a, float128 b )
5282 {
5283     flag aSign, bSign;
5284
5285     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5286               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5287          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5288               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5289        ) {
5290         float_raise( float_flag_invalid );
5291         return 0;
5292     }
5293     aSign = extractFloat128Sign( a );
5294     bSign = extractFloat128Sign( b );
5295     if ( aSign != bSign ) {
5296         return
5297                aSign
5298             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5299                  == 0 );
5300     }
5301     return
5302           aSign ? le128( b.high, b.low, a.high, a.low )
5303         : le128( a.high, a.low, b.high, b.low );
5304
5305 }
5306
5307 /*
5308 -------------------------------------------------------------------------------
5309 Returns 1 if the quadruple-precision floating-point value `a' is less than
5310 the corresponding value `b', and 0 otherwise.  The comparison is performed
5311 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5312 -------------------------------------------------------------------------------
5313 */
5314 flag float128_lt( float128 a, float128 b )
5315 {
5316     flag aSign, bSign;
5317
5318     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5319               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5320          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5321               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5322        ) {
5323         float_raise( float_flag_invalid );
5324         return 0;
5325     }
5326     aSign = extractFloat128Sign( a );
5327     bSign = extractFloat128Sign( b );
5328     if ( aSign != bSign ) {
5329         return
5330                aSign
5331             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5332                  != 0 );
5333     }
5334     return
5335           aSign ? lt128( b.high, b.low, a.high, a.low )
5336         : lt128( a.high, a.low, b.high, b.low );
5337
5338 }
5339
5340 /*
5341 -------------------------------------------------------------------------------
5342 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5343 the corresponding value `b', and 0 otherwise.  The invalid exception is
5344 raised if either operand is a NaN.  Otherwise, the comparison is performed
5345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5346 -------------------------------------------------------------------------------
5347 */
5348 flag float128_eq_signaling( float128 a, float128 b )
5349 {
5350
5351     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5352               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5353          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5354               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5355        ) {
5356         float_raise( float_flag_invalid );
5357         return 0;
5358     }
5359     return
5360            ( a.low == b.low )
5361         && (    ( a.high == b.high )
5362              || (    ( a.low == 0 )
5363                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5364            );
5365
5366 }
5367
5368 /*
5369 -------------------------------------------------------------------------------
5370 Returns 1 if the quadruple-precision floating-point value `a' is less than
5371 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5372 cause an exception.  Otherwise, the comparison is performed according to the
5373 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5374 -------------------------------------------------------------------------------
5375 */
5376 flag float128_le_quiet( float128 a, float128 b )
5377 {
5378     flag aSign, bSign;
5379
5380     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5381               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5382          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5383               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5384        ) {
5385         if (    float128_is_signaling_nan( a )
5386              || float128_is_signaling_nan( b ) ) {
5387             float_raise( float_flag_invalid );
5388         }
5389         return 0;
5390     }
5391     aSign = extractFloat128Sign( a );
5392     bSign = extractFloat128Sign( b );
5393     if ( aSign != bSign ) {
5394         return
5395                aSign
5396             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5397                  == 0 );
5398     }
5399     return
5400           aSign ? le128( b.high, b.low, a.high, a.low )
5401         : le128( a.high, a.low, b.high, b.low );
5402
5403 }
5404
5405 /*
5406 -------------------------------------------------------------------------------
5407 Returns 1 if the quadruple-precision floating-point value `a' is less than
5408 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5409 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5410 Standard for Binary Floating-Point Arithmetic.
5411 -------------------------------------------------------------------------------
5412 */
5413 flag float128_lt_quiet( float128 a, float128 b )
5414 {
5415     flag aSign, bSign;
5416
5417     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5418               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5419          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5420               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5421        ) {
5422         if (    float128_is_signaling_nan( a )
5423              || float128_is_signaling_nan( b ) ) {
5424             float_raise( float_flag_invalid );
5425         }
5426         return 0;
5427     }
5428     aSign = extractFloat128Sign( a );
5429     bSign = extractFloat128Sign( b );
5430     if ( aSign != bSign ) {
5431         return
5432                aSign
5433             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5434                  != 0 );
5435     }
5436     return
5437           aSign ? lt128( b.high, b.low, a.high, a.low )
5438         : lt128( a.high, a.low, b.high, b.low );
5439
5440 }
5441
5442 #endif
5443
5444
5445 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5446
5447 /*
5448  * These two routines are not part of the original softfloat distribution.
5449  *
5450  * They are based on the corresponding conversions to integer but return
5451  * unsigned numbers instead since these functions are required by GCC.
5452  *
5453  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5454  *
5455  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5456  */
5457
5458 /*
5459 -------------------------------------------------------------------------------
5460 Returns the result of converting the double-precision floating-point value
5461 `a' to the 32-bit unsigned integer format.  The conversion is
5462 performed according to the IEC/IEEE Standard for Binary Floating-point
5463 Arithmetic, except that the conversion is always rounded toward zero.  If
5464 `a' is a NaN, the largest positive integer is returned.  If the conversion
5465 overflows, the largest integer positive is returned.
5466 -------------------------------------------------------------------------------
5467 */
5468 uint32 float64_to_uint32_round_to_zero( float64 a )
5469 {
5470     flag aSign;
5471     int16 aExp, shiftCount;
5472     bits64 aSig, savedASig;
5473     uint32 z;
5474
5475     aSig = extractFloat64Frac( a );
5476     aExp = extractFloat64Exp( a );
5477     aSign = extractFloat64Sign( a );
5478
5479     if (aSign) {
5480         float_raise( float_flag_invalid );
5481         return(0);
5482     }
5483
5484     if ( 0x41E < aExp ) {
5485         float_raise( float_flag_invalid );
5486         return 0xffffffff;
5487     }
5488     else if ( aExp < 0x3FF ) {
5489         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5490         return 0;
5491     }
5492     aSig |= LIT64( 0x0010000000000000 );
5493     shiftCount = 0x433 - aExp;
5494     savedASig = aSig;
5495     aSig >>= shiftCount;
5496     z = aSig;
5497     if ( ( aSig<<shiftCount ) != savedASig ) {
5498         float_exception_flags |= float_flag_inexact;
5499     }
5500     return z;
5501
5502 }
5503
5504 /*
5505 -------------------------------------------------------------------------------
5506 Returns the result of converting the single-precision floating-point value
5507 `a' to the 32-bit unsigned integer format.  The conversion is
5508 performed according to the IEC/IEEE Standard for Binary Floating-point
5509 Arithmetic, except that the conversion is always rounded toward zero.  If
5510 `a' is a NaN, the largest positive integer is returned.  If the conversion
5511 overflows, the largest positive integer is returned.
5512 -------------------------------------------------------------------------------
5513 */
5514 uint32 float32_to_uint32_round_to_zero( float32 a )
5515 {
5516     flag aSign;
5517     int16 aExp, shiftCount;
5518     bits32 aSig;
5519     uint32 z;
5520
5521     aSig = extractFloat32Frac( a );
5522     aExp = extractFloat32Exp( a );
5523     aSign = extractFloat32Sign( a );
5524     shiftCount = aExp - 0x9E;
5525
5526     if (aSign) {
5527         float_raise( float_flag_invalid );
5528         return(0);
5529     }
5530     if ( 0 < shiftCount ) {
5531         float_raise( float_flag_invalid );
5532         return 0xFFFFFFFF;
5533     }
5534     else if ( aExp <= 0x7E ) {
5535         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5536         return 0;
5537     }
5538     aSig = ( aSig | 0x800000 )<<8;
5539     z = aSig>>( - shiftCount );
5540     if ( aSig<<( shiftCount & 31 ) ) {
5541         float_exception_flags |= float_flag_inexact;
5542     }
5543     return z;
5544
5545 }
5546
5547 #endif