lib/libc/softfloat/bits64/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.5 2007/11/08 21:31:04 martin Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 #include <sys/cdefs.h>
  48 #if defined(LIBC_SCCS) && !defined(lint)
  49 __RCSID("$NetBSD: softfloat.c,v 1.5 2007/11/08 21:31:04 martin Exp $");
  50 #endif /* LIBC_SCCS and not lint */
  51
  52 #ifdef SOFTFLOAT_FOR_GCC
  53 #include "softfloat-for-gcc.h"
  54 #endif
  55
  56 #include "milieu.h"
  57 #include "softfloat.h"
  58
  59 /*
  60  * Conversions between floats as stored in memory and floats as
  61  * SoftFloat uses them
  62  */
  63 #ifndef FLOAT64_DEMANGLE
  64 #define FLOAT64_DEMANGLE(a)     (a)
  65 #endif
  66 #ifndef FLOAT64_MANGLE
  67 #define FLOAT64_MANGLE(a)       (a)
  68 #endif
  69
  70 /*
  71 -------------------------------------------------------------------------------
  72 Floating-point rounding mode, extended double-precision rounding precision,
  73 and exception flags.
  74 -------------------------------------------------------------------------------
  75 */
  76 fp_rnd float_rounding_mode = float_round_nearest_even;
  77 fp_except float_exception_flags = 0;
  78 #ifdef FLOATX80
  79 int8 floatx80_rounding_precision = 80;
  80 #endif
  81
  82 /*
  83 -------------------------------------------------------------------------------
  84 Primitive arithmetic functions, including multi-word arithmetic, and
  85 division and square root approximations.  (Can be specialized to target if
  86 desired.)
  87 -------------------------------------------------------------------------------
  88 */
  89 #include "softfloat-macros"
  90
  91 /*
  92 -------------------------------------------------------------------------------
  93 Functions and definitions to determine:  (1) whether tininess for underflow
  94 is detected before or after rounding by default, (2) what (if anything)
  95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  97 are propagated from function inputs to output.  These details are target-
  98 specific.
  99 -------------------------------------------------------------------------------
 100 */
 101 #include "softfloat-specialize"
 102
 103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
 104 /*
 105 -------------------------------------------------------------------------------
 106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 107 and 7, and returns the properly rounded 32-bit integer corresponding to the
 108 input.  If `zSign' is 1, the input is negated before being converted to an
 109 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 110 is simply rounded to an integer, with the inexact exception raised if the
 111 input cannot be represented exactly as an integer.  However, if the fixed-
 112 point input is too large, the invalid exception is raised and the largest
 113 positive or negative integer is returned.
 114 -------------------------------------------------------------------------------
 115 */
 116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 117 {
 118     int8 roundingMode;
 119     flag roundNearestEven;
 120     int8 roundIncrement, roundBits;
 121     int32 z;
 122
 123     roundingMode = float_rounding_mode;
 124     roundNearestEven = ( roundingMode == float_round_nearest_even );
 125     roundIncrement = 0x40;
 126     if ( ! roundNearestEven ) {
 127         if ( roundingMode == float_round_to_zero ) {
 128             roundIncrement = 0;
 129         }
 130         else {
 131             roundIncrement = 0x7F;
 132             if ( zSign ) {
 133                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 134             }
 135             else {
 136                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 137             }
 138         }
 139     }
 140     roundBits = absZ & 0x7F;
 141     absZ = ( absZ + roundIncrement )>>7;
 142     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 143     z = absZ;
 144     if ( zSign ) z = - z;
 145     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 146         float_raise( float_flag_invalid );
 147         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 148     }
 149     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 150     return z;
 151
 152 }
 153
 154 /*
 155 -------------------------------------------------------------------------------
 156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 157 `absZ1', with binary point between bits 63 and 64 (between the input words),
 158 and returns the properly rounded 64-bit integer corresponding to the input.
 159 If `zSign' is 1, the input is negated before being converted to an integer.
 160 Ordinarily, the fixed-point input is simply rounded to an integer, with
 161 the inexact exception raised if the input cannot be represented exactly as
 162 an integer.  However, if the fixed-point input is too large, the invalid
 163 exception is raised and the largest positive or negative integer is
 164 returned.
 165 -------------------------------------------------------------------------------
 166 */
 167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 168 {
 169     int8 roundingMode;
 170     flag roundNearestEven, increment;
 171     int64 z;
 172
 173     roundingMode = float_rounding_mode;
 174     roundNearestEven = ( roundingMode == float_round_nearest_even );
 175     increment = ( (sbits64) absZ1 < 0 );
 176     if ( ! roundNearestEven ) {
 177         if ( roundingMode == float_round_to_zero ) {
 178             increment = 0;
 179         }
 180         else {
 181             if ( zSign ) {
 182                 increment = ( roundingMode == float_round_down ) && absZ1;
 183             }
 184             else {
 185                 increment = ( roundingMode == float_round_up ) && absZ1;
 186             }
 187         }
 188     }
 189     if ( increment ) {
 190         ++absZ0;
 191         if ( absZ0 == 0 ) goto overflow;
 192         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 193     }
 194     z = absZ0;
 195     if ( zSign ) z = - z;
 196     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 197  overflow:
 198         float_raise( float_flag_invalid );
 199         return
 200               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 201             : LIT64( 0x7FFFFFFFFFFFFFFF );
 202     }
 203     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 204     return z;
 205
 206 }
 207 #endif
 208
 209 /*
 210 -------------------------------------------------------------------------------
 211 Returns the fraction bits of the single-precision floating-point value `a'.
 212 -------------------------------------------------------------------------------
 213 */
 214 INLINE bits32 extractFloat32Frac( float32 a )
 215 {
 216
 217     return a & 0x007FFFFF;
 218
 219 }
 220
 221 /*
 222 -------------------------------------------------------------------------------
 223 Returns the exponent bits of the single-precision floating-point value `a'.
 224 -------------------------------------------------------------------------------
 225 */
 226 INLINE int16 extractFloat32Exp( float32 a )
 227 {
 228
 229     return ( a>>23 ) & 0xFF;
 230
 231 }
 232
 233 /*
 234 -------------------------------------------------------------------------------
 235 Returns the sign bit of the single-precision floating-point value `a'.
 236 -------------------------------------------------------------------------------
 237 */
 238 INLINE flag extractFloat32Sign( float32 a )
 239 {
 240
 241     return a>>31;
 242
 243 }
 244
 245 /*
 246 -------------------------------------------------------------------------------
 247 Normalizes the subnormal single-precision floating-point value represented
 248 by the denormalized significand `aSig'.  The normalized exponent and
 249 significand are stored at the locations pointed to by `zExpPtr' and
 250 `zSigPtr', respectively.
 251 -------------------------------------------------------------------------------
 252 */
 253 static void
 254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 255 {
 256     int8 shiftCount;
 257
 258     shiftCount = countLeadingZeros32( aSig ) - 8;
 259     *zSigPtr = aSig<<shiftCount;
 260     *zExpPtr = 1 - shiftCount;
 261
 262 }
 263
 264 /*
 265 -------------------------------------------------------------------------------
 266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 267 single-precision floating-point value, returning the result.  After being
 268 shifted into the proper positions, the three fields are simply added
 269 together to form the result.  This means that any integer portion of `zSig'
 270 will be added into the exponent.  Since a properly normalized significand
 271 will have an integer portion equal to 1, the `zExp' input should be 1 less
 272 than the desired result exponent whenever `zSig' is a complete, normalized
 273 significand.
 274 -------------------------------------------------------------------------------
 275 */
 276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 277 {
 278
 279     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 280
 281 }
 282
 283 /*
 284 -------------------------------------------------------------------------------
 285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 286 and significand `zSig', and returns the proper single-precision floating-
 287 point value corresponding to the abstract input.  Ordinarily, the abstract
 288 value is simply rounded and packed into the single-precision format, with
 289 the inexact exception raised if the abstract input cannot be represented
 290 exactly.  However, if the abstract value is too large, the overflow and
 291 inexact exceptions are raised and an infinity or maximal finite value is
 292 returned.  If the abstract value is too small, the input value is rounded to
 293 a subnormal number, and the underflow and inexact exceptions are raised if
 294 the abstract input cannot be represented exactly as a subnormal single-
 295 precision floating-point number.
 296     The input significand `zSig' has its binary point between bits 30
 297 and 29, which is 7 bits to the left of the usual location.  This shifted
 298 significand must be normalized or smaller.  If `zSig' is not normalized,
 299 `zExp' must be 0; in that case, the result returned is a subnormal number,
 300 and it must not require rounding.  In the usual case that `zSig' is
 301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 302 The handling of underflow and overflow follows the IEC/IEEE Standard for
 303 Binary Floating-Point Arithmetic.
 304 -------------------------------------------------------------------------------
 305 */
 306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 307 {
 308     int8 roundingMode;
 309     flag roundNearestEven;
 310     int8 roundIncrement, roundBits;
 311     flag isTiny;
 312
 313     roundingMode = float_rounding_mode;
 314     roundNearestEven = ( roundingMode == float_round_nearest_even );
 315     roundIncrement = 0x40;
 316     if ( ! roundNearestEven ) {
 317         if ( roundingMode == float_round_to_zero ) {
 318             roundIncrement = 0;
 319         }
 320         else {
 321             roundIncrement = 0x7F;
 322             if ( zSign ) {
 323                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 324             }
 325             else {
 326                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 327             }
 328         }
 329     }
 330     roundBits = zSig & 0x7F;
 331     if ( 0xFD <= (bits16) zExp ) {
 332         if (    ( 0xFD < zExp )
 333              || (    ( zExp == 0xFD )
 334                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 335            ) {
 336             float_raise( float_flag_overflow | float_flag_inexact );
 337             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 338         }
 339         if ( zExp < 0 ) {
 340             isTiny =
 341                    ( float_detect_tininess == float_tininess_before_rounding )
 342                 || ( zExp < -1 )
 343                 || ( zSig + roundIncrement < 0x80000000 );
 344             shift32RightJamming( zSig, - zExp, &zSig );
 345             zExp = 0;
 346             roundBits = zSig & 0x7F;
 347             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 348         }
 349     }
 350     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 351     zSig = ( zSig + roundIncrement )>>7;
 352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 353     if ( zSig == 0 ) zExp = 0;
 354     return packFloat32( zSign, zExp, zSig );
 355
 356 }
 357
 358 /*
 359 -------------------------------------------------------------------------------
 360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 361 and significand `zSig', and returns the proper single-precision floating-
 362 point value corresponding to the abstract input.  This routine is just like
 363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 365 floating-point exponent.
 366 -------------------------------------------------------------------------------
 367 */
 368 static float32
 369  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 370 {
 371     int8 shiftCount;
 372
 373     shiftCount = countLeadingZeros32( zSig ) - 1;
 374     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 375
 376 }
 377
 378 /*
 379 -------------------------------------------------------------------------------
 380 Returns the fraction bits of the double-precision floating-point value `a'.
 381 -------------------------------------------------------------------------------
 382 */
 383 INLINE bits64 extractFloat64Frac( float64 a )
 384 {
 385
 386     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 387
 388 }
 389
 390 /*
 391 -------------------------------------------------------------------------------
 392 Returns the exponent bits of the double-precision floating-point value `a'.
 393 -------------------------------------------------------------------------------
 394 */
 395 INLINE int16 extractFloat64Exp( float64 a )
 396 {
 397
 398     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
 399
 400 }
 401
 402 /*
 403 -------------------------------------------------------------------------------
 404 Returns the sign bit of the double-precision floating-point value `a'.
 405 -------------------------------------------------------------------------------
 406 */
 407 INLINE flag extractFloat64Sign( float64 a )
 408 {
 409
 410     return FLOAT64_DEMANGLE(a)>>63;
 411
 412 }
 413
 414 /*
 415 -------------------------------------------------------------------------------
 416 Normalizes the subnormal double-precision floating-point value represented
 417 by the denormalized significand `aSig'.  The normalized exponent and
 418 significand are stored at the locations pointed to by `zExpPtr' and
 419 `zSigPtr', respectively.
 420 -------------------------------------------------------------------------------
 421 */
 422 static void
 423  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 424 {
 425     int8 shiftCount;
 426
 427     shiftCount = countLeadingZeros64( aSig ) - 11;
 428     *zSigPtr = aSig<<shiftCount;
 429     *zExpPtr = 1 - shiftCount;
 430
 431 }
 432
 433 /*
 434 -------------------------------------------------------------------------------
 435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 436 double-precision floating-point value, returning the result.  After being
 437 shifted into the proper positions, the three fields are simply added
 438 together to form the result.  This means that any integer portion of `zSig'
 439 will be added into the exponent.  Since a properly normalized significand
 440 will have an integer portion equal to 1, the `zExp' input should be 1 less
 441 than the desired result exponent whenever `zSig' is a complete, normalized
 442 significand.
 443 -------------------------------------------------------------------------------
 444 */
 445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 446 {
 447
 448     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 449                            ( ( (bits64) zExp )<<52 ) + zSig );
 450
 451 }
 452
 453 /*
 454 -------------------------------------------------------------------------------
 455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 456 and significand `zSig', and returns the proper double-precision floating-
 457 point value corresponding to the abstract input.  Ordinarily, the abstract
 458 value is simply rounded and packed into the double-precision format, with
 459 the inexact exception raised if the abstract input cannot be represented
 460 exactly.  However, if the abstract value is too large, the overflow and
 461 inexact exceptions are raised and an infinity or maximal finite value is
 462 returned.  If the abstract value is too small, the input value is rounded to
 463 a subnormal number, and the underflow and inexact exceptions are raised if
 464 the abstract input cannot be represented exactly as a subnormal double-
 465 precision floating-point number.
 466     The input significand `zSig' has its binary point between bits 62
 467 and 61, which is 10 bits to the left of the usual location.  This shifted
 468 significand must be normalized or smaller.  If `zSig' is not normalized,
 469 `zExp' must be 0; in that case, the result returned is a subnormal number,
 470 and it must not require rounding.  In the usual case that `zSig' is
 471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 472 The handling of underflow and overflow follows the IEC/IEEE Standard for
 473 Binary Floating-Point Arithmetic.
 474 -------------------------------------------------------------------------------
 475 */
 476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 477 {
 478     int8 roundingMode;
 479     flag roundNearestEven;
 480     int16 roundIncrement, roundBits;
 481     flag isTiny;
 482
 483     roundingMode = float_rounding_mode;
 484     roundNearestEven = ( roundingMode == float_round_nearest_even );
 485     roundIncrement = 0x200;
 486     if ( ! roundNearestEven ) {
 487         if ( roundingMode == float_round_to_zero ) {
 488             roundIncrement = 0;
 489         }
 490         else {
 491             roundIncrement = 0x3FF;
 492             if ( zSign ) {
 493                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 494             }
 495             else {
 496                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 497             }
 498         }
 499     }
 500     roundBits = zSig & 0x3FF;
 501     if ( 0x7FD <= (bits16) zExp ) {
 502         if (    ( 0x7FD < zExp )
 503              || (    ( zExp == 0x7FD )
 504                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 505            ) {
 506             float_raise( float_flag_overflow | float_flag_inexact );
 507             return FLOAT64_MANGLE(
 508                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 509                 ( roundIncrement == 0 ));
 510         }
 511         if ( zExp < 0 ) {
 512             isTiny =
 513                    ( float_detect_tininess == float_tininess_before_rounding )
 514                 || ( zExp < -1 )
 515                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 516             shift64RightJamming( zSig, - zExp, &zSig );
 517             zExp = 0;
 518             roundBits = zSig & 0x3FF;
 519             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 520         }
 521     }
 522     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 523     zSig = ( zSig + roundIncrement )>>10;
 524     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 525     if ( zSig == 0 ) zExp = 0;
 526     return packFloat64( zSign, zExp, zSig );
 527
 528 }
 529
 530 /*
 531 -------------------------------------------------------------------------------
 532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 533 and significand `zSig', and returns the proper double-precision floating-
 534 point value corresponding to the abstract input.  This routine is just like
 535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 537 floating-point exponent.
 538 -------------------------------------------------------------------------------
 539 */
 540 static float64
 541  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 542 {
 543     int8 shiftCount;
 544
 545     shiftCount = countLeadingZeros64( zSig ) - 1;
 546     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 547
 548 }
 549
 550 #ifdef FLOATX80
 551
 552 /*
 553 -------------------------------------------------------------------------------
 554 Returns the fraction bits of the extended double-precision floating-point
 555 value `a'.
 556 -------------------------------------------------------------------------------
 557 */
 558 INLINE bits64 extractFloatx80Frac( floatx80 a )
 559 {
 560
 561     return a.low;
 562
 563 }
 564
 565 /*
 566 -------------------------------------------------------------------------------
 567 Returns the exponent bits of the extended double-precision floating-point
 568 value `a'.
 569 -------------------------------------------------------------------------------
 570 */
 571 INLINE int32 extractFloatx80Exp( floatx80 a )
 572 {
 573
 574     return a.high & 0x7FFF;
 575
 576 }
 577
 578 /*
 579 -------------------------------------------------------------------------------
 580 Returns the sign bit of the extended double-precision floating-point value
 581 `a'.
 582 -------------------------------------------------------------------------------
 583 */
 584 INLINE flag extractFloatx80Sign( floatx80 a )
 585 {
 586
 587     return a.high>>15;
 588
 589 }
 590
 591 /*
 592 -------------------------------------------------------------------------------
 593 Normalizes the subnormal extended double-precision floating-point value
 594 represented by the denormalized significand `aSig'.  The normalized exponent
 595 and significand are stored at the locations pointed to by `zExpPtr' and
 596 `zSigPtr', respectively.
 597 -------------------------------------------------------------------------------
 598 */
 599 static void
 600  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 601 {
 602     int8 shiftCount;
 603
 604     shiftCount = countLeadingZeros64( aSig );
 605     *zSigPtr = aSig<<shiftCount;
 606     *zExpPtr = 1 - shiftCount;
 607
 608 }
 609
 610 /*
 611 -------------------------------------------------------------------------------
 612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 613 extended double-precision floating-point value, returning the result.
 614 -------------------------------------------------------------------------------
 615 */
 616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 617 {
 618     floatx80 z;
 619
 620     z.low = zSig;
 621     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 622     return z;
 623
 624 }
 625
 626 /*
 627 -------------------------------------------------------------------------------
 628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 630 and returns the proper extended double-precision floating-point value
 631 corresponding to the abstract input.  Ordinarily, the abstract value is
 632 rounded and packed into the extended double-precision format, with the
 633 inexact exception raised if the abstract input cannot be represented
 634 exactly.  However, if the abstract value is too large, the overflow and
 635 inexact exceptions are raised and an infinity or maximal finite value is
 636 returned.  If the abstract value is too small, the input value is rounded to
 637 a subnormal number, and the underflow and inexact exceptions are raised if
 638 the abstract input cannot be represented exactly as a subnormal extended
 639 double-precision floating-point number.
 640     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 641 number of bits as single or double precision, respectively.  Otherwise, the
 642 result is rounded to the full precision of the extended double-precision
 643 format.
 644     The input significand must be normalized or smaller.  If the input
 645 significand is not normalized, `zExp' must be 0; in that case, the result
 646 returned is a subnormal number, and it must not require rounding.  The
 647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 648 Floating-Point Arithmetic.
 649 -------------------------------------------------------------------------------
 650 */
 651 static floatx80
 652  roundAndPackFloatx80(
 653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 654  )
 655 {
 656     int8 roundingMode;
 657     flag roundNearestEven, increment, isTiny;
 658     int64 roundIncrement, roundMask, roundBits;
 659
 660     roundingMode = float_rounding_mode;
 661     roundNearestEven = ( roundingMode == float_round_nearest_even );
 662     if ( roundingPrecision == 80 ) goto precision80;
 663     if ( roundingPrecision == 64 ) {
 664         roundIncrement = LIT64( 0x0000000000000400 );
 665         roundMask = LIT64( 0x00000000000007FF );
 666     }
 667     else if ( roundingPrecision == 32 ) {
 668         roundIncrement = LIT64( 0x0000008000000000 );
 669         roundMask = LIT64( 0x000000FFFFFFFFFF );
 670     }
 671     else {
 672         goto precision80;
 673     }
 674     zSig0 |= ( zSig1 != 0 );
 675     if ( ! roundNearestEven ) {
 676         if ( roundingMode == float_round_to_zero ) {
 677             roundIncrement = 0;
 678         }
 679         else {
 680             roundIncrement = roundMask;
 681             if ( zSign ) {
 682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 683             }
 684             else {
 685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 686             }
 687         }
 688     }
 689     roundBits = zSig0 & roundMask;
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 693            ) {
 694             goto overflow;
 695         }
 696         if ( zExp <= 0 ) {
 697             isTiny =
 698                    ( float_detect_tininess == float_tininess_before_rounding )
 699                 || ( zExp < 0 )
 700                 || ( zSig0 <= zSig0 + roundIncrement );
 701             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 702             zExp = 0;
 703             roundBits = zSig0 & roundMask;
 704             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 705             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 706             zSig0 += roundIncrement;
 707             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 708             roundIncrement = roundMask + 1;
 709             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 710                 roundMask |= roundIncrement;
 711             }
 712             zSig0 &= ~ roundMask;
 713             return packFloatx80( zSign, zExp, zSig0 );
 714         }
 715     }
 716     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 717     zSig0 += roundIncrement;
 718     if ( zSig0 < roundIncrement ) {
 719         ++zExp;
 720         zSig0 = LIT64( 0x8000000000000000 );
 721     }
 722     roundIncrement = roundMask + 1;
 723     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 724         roundMask |= roundIncrement;
 725     }
 726     zSig0 &= ~ roundMask;
 727     if ( zSig0 == 0 ) zExp = 0;
 728     return packFloatx80( zSign, zExp, zSig0 );
 729  precision80:
 730     increment = ( (sbits64) zSig1 < 0 );
 731     if ( ! roundNearestEven ) {
 732         if ( roundingMode == float_round_to_zero ) {
 733             increment = 0;
 734         }
 735         else {
 736             if ( zSign ) {
 737                 increment = ( roundingMode == float_round_down ) && zSig1;
 738             }
 739             else {
 740                 increment = ( roundingMode == float_round_up ) && zSig1;
 741             }
 742         }
 743     }
 744     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 745         if (    ( 0x7FFE < zExp )
 746              || (    ( zExp == 0x7FFE )
 747                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 748                   && increment
 749                 )
 750            ) {
 751             roundMask = 0;
 752  overflow:
 753             float_raise( float_flag_overflow | float_flag_inexact );
 754             if (    ( roundingMode == float_round_to_zero )
 755                  || ( zSign && ( roundingMode == float_round_up ) )
 756                  || ( ! zSign && ( roundingMode == float_round_down ) )
 757                ) {
 758                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 759             }
 760             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 761         }
 762         if ( zExp <= 0 ) {
 763             isTiny =
 764                    ( float_detect_tininess == float_tininess_before_rounding )
 765                 || ( zExp < 0 )
 766                 || ! increment
 767                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 768             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 769             zExp = 0;
 770             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 771             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 772             if ( roundNearestEven ) {
 773                 increment = ( (sbits64) zSig1 < 0 );
 774             }
 775             else {
 776                 if ( zSign ) {
 777                     increment = ( roundingMode == float_round_down ) && zSig1;
 778                 }
 779                 else {
 780                     increment = ( roundingMode == float_round_up ) && zSig1;
 781                 }
 782             }
 783             if ( increment ) {
 784                 ++zSig0;
 785                 zSig0 &=
 786                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 787                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 788             }
 789             return packFloatx80( zSign, zExp, zSig0 );
 790         }
 791     }
 792     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 793     if ( increment ) {
 794         ++zSig0;
 795         if ( zSig0 == 0 ) {
 796             ++zExp;
 797             zSig0 = LIT64( 0x8000000000000000 );
 798         }
 799         else {
 800             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 801         }
 802     }
 803     else {
 804         if ( zSig0 == 0 ) zExp = 0;
 805     }
 806     return packFloatx80( zSign, zExp, zSig0 );
 807
 808 }
 809
 810 /*
 811 -------------------------------------------------------------------------------
 812 Takes an abstract floating-point value having sign `zSign', exponent
 813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 814 and returns the proper extended double-precision floating-point value
 815 corresponding to the abstract input.  This routine is just like
 816 `roundAndPackFloatx80' except that the input significand does not have to be
 817 normalized.
 818 -------------------------------------------------------------------------------
 819 */
 820 static floatx80
 821  normalizeRoundAndPackFloatx80(
 822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 823  )
 824 {
 825     int8 shiftCount;
 826
 827     if ( zSig0 == 0 ) {
 828         zSig0 = zSig1;
 829         zSig1 = 0;
 830         zExp -= 64;
 831     }
 832     shiftCount = countLeadingZeros64( zSig0 );
 833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 834     zExp -= shiftCount;
 835     return
 836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 837
 838 }
 839
 840 #endif
 841
 842 #ifdef FLOAT128
 843
 844 /*
 845 -------------------------------------------------------------------------------
 846 Returns the least-significant 64 fraction bits of the quadruple-precision
 847 floating-point value `a'.
 848 -------------------------------------------------------------------------------
 849 */
 850 INLINE bits64 extractFloat128Frac1( float128 a )
 851 {
 852
 853     return a.low;
 854
 855 }
 856
 857 /*
 858 -------------------------------------------------------------------------------
 859 Returns the most-significant 48 fraction bits of the quadruple-precision
 860 floating-point value `a'.
 861 -------------------------------------------------------------------------------
 862 */
 863 INLINE bits64 extractFloat128Frac0( float128 a )
 864 {
 865
 866     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 867
 868 }
 869
 870 /*
 871 -------------------------------------------------------------------------------
 872 Returns the exponent bits of the quadruple-precision floating-point value
 873 `a'.
 874 -------------------------------------------------------------------------------
 875 */
 876 INLINE int32 extractFloat128Exp( float128 a )
 877 {
 878
 879     return ( a.high>>48 ) & 0x7FFF;
 880
 881 }
 882
 883 /*
 884 -------------------------------------------------------------------------------
 885 Returns the sign bit of the quadruple-precision floating-point value `a'.
 886 -------------------------------------------------------------------------------
 887 */
 888 INLINE flag extractFloat128Sign( float128 a )
 889 {
 890
 891     return a.high>>63;
 892
 893 }
 894
 895 /*
 896 -------------------------------------------------------------------------------
 897 Normalizes the subnormal quadruple-precision floating-point value
 898 represented by the denormalized significand formed by the concatenation of
 899 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 900 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 901 significand are stored at the location pointed to by `zSig0Ptr', and the
 902 least significant 64 bits of the normalized significand are stored at the
 903 location pointed to by `zSig1Ptr'.
 904 -------------------------------------------------------------------------------
 905 */
 906 static void
 907  normalizeFloat128Subnormal(
 908      bits64 aSig0,
 909      bits64 aSig1,
 910      int32 *zExpPtr,
 911      bits64 *zSig0Ptr,
 912      bits64 *zSig1Ptr
 913  )
 914 {
 915     int8 shiftCount;
 916
 917     if ( aSig0 == 0 ) {
 918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 919         if ( shiftCount < 0 ) {
 920             *zSig0Ptr = aSig1>>( - shiftCount );
 921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 922         }
 923         else {
 924             *zSig0Ptr = aSig1<<shiftCount;
 925             *zSig1Ptr = 0;
 926         }
 927         *zExpPtr = - shiftCount - 63;
 928     }
 929     else {
 930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 932         *zExpPtr = 1 - shiftCount;
 933     }
 934
 935 }
 936
 937 /*
 938 -------------------------------------------------------------------------------
 939 Packs the sign `zSign', the exponent `zExp', and the significand formed
 940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 941 floating-point value, returning the result.  After being shifted into the
 942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 943 added together to form the most significant 32 bits of the result.  This
 944 means that any integer portion of `zSig0' will be added into the exponent.
 945 Since a properly normalized significand will have an integer portion equal
 946 to 1, the `zExp' input should be 1 less than the desired result exponent
 947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 948 significand.
 949 -------------------------------------------------------------------------------
 950 */
 951 INLINE float128
 952  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 953 {
 954     float128 z;
 955
 956     z.low = zSig1;
 957     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 958     return z;
 959
 960 }
 961
 962 /*
 963 -------------------------------------------------------------------------------
 964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 965 and extended significand formed by the concatenation of `zSig0', `zSig1',
 966 and `zSig2', and returns the proper quadruple-precision floating-point value
 967 corresponding to the abstract input.  Ordinarily, the abstract value is
 968 simply rounded and packed into the quadruple-precision format, with the
 969 inexact exception raised if the abstract input cannot be represented
 970 exactly.  However, if the abstract value is too large, the overflow and
 971 inexact exceptions are raised and an infinity or maximal finite value is
 972 returned.  If the abstract value is too small, the input value is rounded to
 973 a subnormal number, and the underflow and inexact exceptions are raised if
 974 the abstract input cannot be represented exactly as a subnormal quadruple-
 975 precision floating-point number.
 976     The input significand must be normalized or smaller.  If the input
 977 significand is not normalized, `zExp' must be 0; in that case, the result
 978 returned is a subnormal number, and it must not require rounding.  In the
 979 usual case that the input significand is normalized, `zExp' must be 1 less
 980 than the ``true'' floating-point exponent.  The handling of underflow and
 981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 982 -------------------------------------------------------------------------------
 983 */
 984 static float128
 985  roundAndPackFloat128(
 986      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 987 {
 988     int8 roundingMode;
 989     flag roundNearestEven, increment, isTiny;
 990
 991     roundingMode = float_rounding_mode;
 992     roundNearestEven = ( roundingMode == float_round_nearest_even );
 993     increment = ( (sbits64) zSig2 < 0 );
 994     if ( ! roundNearestEven ) {
 995         if ( roundingMode == float_round_to_zero ) {
 996             increment = 0;
 997         }
 998         else {
 999             if ( zSign ) {
1000                 increment = ( roundingMode == float_round_down ) && zSig2;
1001             }
1002             else {
1003                 increment = ( roundingMode == float_round_up ) && zSig2;
1004             }
1005         }
1006     }
1007     if ( 0x7FFD <= (bits32) zExp ) {
1008         if (    ( 0x7FFD < zExp )
1009              || (    ( zExp == 0x7FFD )
1010                   && eq128(
1011                          LIT64( 0x0001FFFFFFFFFFFF ),
1012                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1013                          zSig0,
1014                          zSig1
1015                      )
1016                   && increment
1017                 )
1018            ) {
1019             float_raise( float_flag_overflow | float_flag_inexact );
1020             if (    ( roundingMode == float_round_to_zero )
1021                  || ( zSign && ( roundingMode == float_round_up ) )
1022                  || ( ! zSign && ( roundingMode == float_round_down ) )
1023                ) {
1024                 return
1025                     packFloat128(
1026                         zSign,
1027                         0x7FFE,
1028                         LIT64( 0x0000FFFFFFFFFFFF ),
1029                         LIT64( 0xFFFFFFFFFFFFFFFF )
1030                     );
1031             }
1032             return packFloat128( zSign, 0x7FFF, 0, 0 );
1033         }
1034         if ( zExp < 0 ) {
1035             isTiny =
1036                    ( float_detect_tininess == float_tininess_before_rounding )
1037                 || ( zExp < -1 )
1038                 || ! increment
1039                 || lt128(
1040                        zSig0,
1041                        zSig1,
1042                        LIT64( 0x0001FFFFFFFFFFFF ),
1043                        LIT64( 0xFFFFFFFFFFFFFFFF )
1044                    );
1045             shift128ExtraRightJamming(
1046                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047             zExp = 0;
1048             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049             if ( roundNearestEven ) {
1050                 increment = ( (sbits64) zSig2 < 0 );
1051             }
1052             else {
1053                 if ( zSign ) {
1054                     increment = ( roundingMode == float_round_down ) && zSig2;
1055                 }
1056                 else {
1057                     increment = ( roundingMode == float_round_up ) && zSig2;
1058                 }
1059             }
1060         }
1061     }
1062     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063     if ( increment ) {
1064         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066     }
1067     else {
1068         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069     }
1070     return packFloat128( zSign, zExp, zSig0, zSig1 );
1071
1072 }
1073
1074 /*
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input.  This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 point exponent.
1083 -------------------------------------------------------------------------------
1084 */
1085 static float128
1086  normalizeRoundAndPackFloat128(
1087      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 {
1089     int8 shiftCount;
1090     bits64 zSig2;
1091
1092     if ( zSig0 == 0 ) {
1093         zSig0 = zSig1;
1094         zSig1 = 0;
1095         zExp -= 64;
1096     }
1097     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098     if ( 0 <= shiftCount ) {
1099         zSig2 = 0;
1100         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101     }
1102     else {
1103         shift128ExtraRightJamming(
1104             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105     }
1106     zExp -= shiftCount;
1107     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108
1109 }
1110
1111 #endif
1112
1113 /*
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format.  The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1119 */
1120 float32 int32_to_float32( int32 a )
1121 {
1122     flag zSign;
1123
1124     if ( a == 0 ) return 0;
1125     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126     zSign = ( a < 0 );
1127     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1128
1129 }
1130
1131 /*
1132 -------------------------------------------------------------------------------
1133 Returns the result of converting the 32-bit two's complement integer `a'
1134 to the double-precision floating-point format.  The conversion is performed
1135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 -------------------------------------------------------------------------------
1137 */
1138 float64 int32_to_float64( int32 a )
1139 {
1140     flag zSign;
1141     uint32 absA;
1142     int8 shiftCount;
1143     bits64 zSig;
1144
1145     if ( a == 0 ) return 0;
1146     zSign = ( a < 0 );
1147     absA = zSign ? - a : a;
1148     shiftCount = countLeadingZeros32( absA ) + 21;
1149     zSig = absA;
1150     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1151
1152 }
1153
1154 #ifdef FLOATX80
1155
1156 /*
1157 -------------------------------------------------------------------------------
1158 Returns the result of converting the 32-bit two's complement integer `a'
1159 to the extended double-precision floating-point format.  The conversion
1160 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 Arithmetic.
1162 -------------------------------------------------------------------------------
1163 */
1164 floatx80 int32_to_floatx80( int32 a )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     bits64 zSig;
1170
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 #endif
1181
1182 #ifdef FLOAT128
1183
1184 /*
1185 -------------------------------------------------------------------------------
1186 Returns the result of converting the 32-bit two's complement integer `a' to
1187 the quadruple-precision floating-point format.  The conversion is performed
1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189 -------------------------------------------------------------------------------
1190 */
1191 float128 int32_to_float128( int32 a )
1192 {
1193     flag zSign;
1194     uint32 absA;
1195     int8 shiftCount;
1196     bits64 zSig0;
1197
1198     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1199     zSign = ( a < 0 );
1200     absA = zSign ? - a : a;
1201     shiftCount = countLeadingZeros32( absA ) + 17;
1202     zSig0 = absA;
1203     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1204
1205 }
1206
1207 #endif
1208
1209 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1210 /*
1211 -------------------------------------------------------------------------------
1212 Returns the result of converting the 64-bit two's complement integer `a'
1213 to the single-precision floating-point format.  The conversion is performed
1214 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1215 -------------------------------------------------------------------------------
1216 */
1217 float32 int64_to_float32( int64 a )
1218 {
1219     flag zSign;
1220     uint64 absA;
1221     int8 shiftCount;
1222
1223     if ( a == 0 ) return 0;
1224     zSign = ( a < 0 );
1225     absA = zSign ? - a : a;
1226     shiftCount = countLeadingZeros64( absA ) - 40;
1227     if ( 0 <= shiftCount ) {
1228         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1229     }
1230     else {
1231         shiftCount += 7;
1232         if ( shiftCount < 0 ) {
1233             shift64RightJamming( absA, - shiftCount, &absA );
1234         }
1235         else {
1236             absA <<= shiftCount;
1237         }
1238         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1239     }
1240
1241 }
1242
1243 /*
1244 -------------------------------------------------------------------------------
1245 Returns the result of converting the 64-bit two's complement integer `a'
1246 to the double-precision floating-point format.  The conversion is performed
1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1248 -------------------------------------------------------------------------------
1249 */
1250 float64 int64_to_float64( int64 a )
1251 {
1252     flag zSign;
1253
1254     if ( a == 0 ) return 0;
1255     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1256         return packFloat64( 1, 0x43E, 0 );
1257     }
1258     zSign = ( a < 0 );
1259     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1260
1261 }
1262
1263 #ifdef FLOATX80
1264
1265 /*
1266 -------------------------------------------------------------------------------
1267 Returns the result of converting the 64-bit two's complement integer `a'
1268 to the extended double-precision floating-point format.  The conversion
1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1270 Arithmetic.
1271 -------------------------------------------------------------------------------
1272 */
1273 floatx80 int64_to_floatx80( int64 a )
1274 {
1275     flag zSign;
1276     uint64 absA;
1277     int8 shiftCount;
1278
1279     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1280     zSign = ( a < 0 );
1281     absA = zSign ? - a : a;
1282     shiftCount = countLeadingZeros64( absA );
1283     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1284
1285 }
1286
1287 #endif
1288
1289 #endif /* !SOFTFLOAT_FOR_GCC */
1290
1291 #ifdef FLOAT128
1292
1293 /*
1294 -------------------------------------------------------------------------------
1295 Returns the result of converting the 64-bit two's complement integer `a' to
1296 the quadruple-precision floating-point format.  The conversion is performed
1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1298 -------------------------------------------------------------------------------
1299 */
1300 float128 int64_to_float128( int64 a )
1301 {
1302     flag zSign;
1303     uint64 absA;
1304     int8 shiftCount;
1305     int32 zExp;
1306     bits64 zSig0, zSig1;
1307
1308     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1309     zSign = ( a < 0 );
1310     absA = zSign ? - a : a;
1311     shiftCount = countLeadingZeros64( absA ) + 49;
1312     zExp = 0x406E - shiftCount;
1313     if ( 64 <= shiftCount ) {
1314         zSig1 = 0;
1315         zSig0 = absA;
1316         shiftCount -= 64;
1317     }
1318     else {
1319         zSig1 = absA;
1320         zSig0 = 0;
1321     }
1322     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1323     return packFloat128( zSign, zExp, zSig0, zSig1 );
1324
1325 }
1326
1327 #endif
1328
1329 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1330 /*
1331 -------------------------------------------------------------------------------
1332 Returns the result of converting the single-precision floating-point value
1333 `a' to the 32-bit two's complement integer format.  The conversion is
1334 performed according to the IEC/IEEE Standard for Binary Floating-Point
1335 Arithmetic---which means in particular that the conversion is rounded
1336 according to the current rounding mode.  If `a' is a NaN, the largest
1337 positive integer is returned.  Otherwise, if the conversion overflows, the
1338 largest integer with the same sign as `a' is returned.
1339 -------------------------------------------------------------------------------
1340 */
1341 int32 float32_to_int32( float32 a )
1342 {
1343     flag aSign;
1344     int16 aExp, shiftCount;
1345     bits32 aSig;
1346     bits64 aSig64;
1347
1348     aSig = extractFloat32Frac( a );
1349     aExp = extractFloat32Exp( a );
1350     aSign = extractFloat32Sign( a );
1351     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352     if ( aExp ) aSig |= 0x00800000;
1353     shiftCount = 0xAF - aExp;
1354     aSig64 = aSig;
1355     aSig64 <<= 32;
1356     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357     return roundAndPackInt32( aSign, aSig64 );
1358
1359 }
1360 #endif /* !SOFTFLOAT_FOR_GCC */
1361
1362 /*
1363 -------------------------------------------------------------------------------
1364 Returns the result of converting the single-precision floating-point value
1365 `a' to the 32-bit two's complement integer format.  The conversion is
1366 performed according to the IEC/IEEE Standard for Binary Floating-Point
1367 Arithmetic, except that the conversion is always rounded toward zero.
1368 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1369 the conversion overflows, the largest integer with the same sign as `a' is
1370 returned.
1371 -------------------------------------------------------------------------------
1372 */
1373 int32 float32_to_int32_round_to_zero( float32 a )
1374 {
1375     flag aSign;
1376     int16 aExp, shiftCount;
1377     bits32 aSig;
1378     int32 z;
1379
1380     aSig = extractFloat32Frac( a );
1381     aExp = extractFloat32Exp( a );
1382     aSign = extractFloat32Sign( a );
1383     shiftCount = aExp - 0x9E;
1384     if ( 0 <= shiftCount ) {
1385         if ( a != 0xCF000000 ) {
1386             float_raise( float_flag_invalid );
1387             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1388         }
1389         return (sbits32) 0x80000000;
1390     }
1391     else if ( aExp <= 0x7E ) {
1392         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1393         return 0;
1394     }
1395     aSig = ( aSig | 0x00800000 )<<8;
1396     z = aSig>>( - shiftCount );
1397     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1398         float_exception_flags |= float_flag_inexact;
1399     }
1400     if ( aSign ) z = - z;
1401     return z;
1402
1403 }
1404
1405 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1406 /*
1407 -------------------------------------------------------------------------------
1408 Returns the result of converting the single-precision floating-point value
1409 `a' to the 64-bit two's complement integer format.  The conversion is
1410 performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 Arithmetic---which means in particular that the conversion is rounded
1412 according to the current rounding mode.  If `a' is a NaN, the largest
1413 positive integer is returned.  Otherwise, if the conversion overflows, the
1414 largest integer with the same sign as `a' is returned.
1415 -------------------------------------------------------------------------------
1416 */
1417 int64 float32_to_int64( float32 a )
1418 {
1419     flag aSign;
1420     int16 aExp, shiftCount;
1421     bits32 aSig;
1422     bits64 aSig64, aSigExtra;
1423
1424     aSig = extractFloat32Frac( a );
1425     aExp = extractFloat32Exp( a );
1426     aSign = extractFloat32Sign( a );
1427     shiftCount = 0xBE - aExp;
1428     if ( shiftCount < 0 ) {
1429         float_raise( float_flag_invalid );
1430         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1431             return LIT64( 0x7FFFFFFFFFFFFFFF );
1432         }
1433         return (sbits64) LIT64( 0x8000000000000000 );
1434     }
1435     if ( aExp ) aSig |= 0x00800000;
1436     aSig64 = aSig;
1437     aSig64 <<= 40;
1438     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1439     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1440
1441 }
1442
1443 /*
1444 -------------------------------------------------------------------------------
1445 Returns the result of converting the single-precision floating-point value
1446 `a' to the 64-bit two's complement integer format.  The conversion is
1447 performed according to the IEC/IEEE Standard for Binary Floating-Point
1448 Arithmetic, except that the conversion is always rounded toward zero.  If
1449 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1450 conversion overflows, the largest integer with the same sign as `a' is
1451 returned.
1452 -------------------------------------------------------------------------------
1453 */
1454 int64 float32_to_int64_round_to_zero( float32 a )
1455 {
1456     flag aSign;
1457     int16 aExp, shiftCount;
1458     bits32 aSig;
1459     bits64 aSig64;
1460     int64 z;
1461
1462     aSig = extractFloat32Frac( a );
1463     aExp = extractFloat32Exp( a );
1464     aSign = extractFloat32Sign( a );
1465     shiftCount = aExp - 0xBE;
1466     if ( 0 <= shiftCount ) {
1467         if ( a != 0xDF000000 ) {
1468             float_raise( float_flag_invalid );
1469             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1470                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1471             }
1472         }
1473         return (sbits64) LIT64( 0x8000000000000000 );
1474     }
1475     else if ( aExp <= 0x7E ) {
1476         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1477         return 0;
1478     }
1479     aSig64 = aSig | 0x00800000;
1480     aSig64 <<= 40;
1481     z = aSig64>>( - shiftCount );
1482     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1483         float_exception_flags |= float_flag_inexact;
1484     }
1485     if ( aSign ) z = - z;
1486     return z;
1487
1488 }
1489 #endif /* !SOFTFLOAT_FOR_GCC */
1490
1491 /*
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the double-precision floating-point format.  The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic.
1497 -------------------------------------------------------------------------------
1498 */
1499 float64 float32_to_float64( float32 a )
1500 {
1501     flag aSign;
1502     int16 aExp;
1503     bits32 aSig;
1504
1505     aSig = extractFloat32Frac( a );
1506     aExp = extractFloat32Exp( a );
1507     aSign = extractFloat32Sign( a );
1508     if ( aExp == 0xFF ) {
1509         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1510         return packFloat64( aSign, 0x7FF, 0 );
1511     }
1512     if ( aExp == 0 ) {
1513         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1514         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1515         --aExp;
1516     }
1517     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1518
1519 }
1520
1521 #ifdef FLOATX80
1522
1523 /*
1524 -------------------------------------------------------------------------------
1525 Returns the result of converting the single-precision floating-point value
1526 `a' to the extended double-precision floating-point format.  The conversion
1527 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1528 Arithmetic.
1529 -------------------------------------------------------------------------------
1530 */
1531 floatx80 float32_to_floatx80( float32 a )
1532 {
1533     flag aSign;
1534     int16 aExp;
1535     bits32 aSig;
1536
1537     aSig = extractFloat32Frac( a );
1538     aExp = extractFloat32Exp( a );
1539     aSign = extractFloat32Sign( a );
1540     if ( aExp == 0xFF ) {
1541         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1542         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1543     }
1544     if ( aExp == 0 ) {
1545         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1546         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1547     }
1548     aSig |= 0x00800000;
1549     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1550
1551 }
1552
1553 #endif
1554
1555 #ifdef FLOAT128
1556
1557 /*
1558 -------------------------------------------------------------------------------
1559 Returns the result of converting the single-precision floating-point value
1560 `a' to the double-precision floating-point format.  The conversion is
1561 performed according to the IEC/IEEE Standard for Binary Floating-Point
1562 Arithmetic.
1563 -------------------------------------------------------------------------------
1564 */
1565 float128 float32_to_float128( float32 a )
1566 {
1567     flag aSign;
1568     int16 aExp;
1569     bits32 aSig;
1570
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( aExp == 0xFF ) {
1575         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1576         return packFloat128( aSign, 0x7FFF, 0, 0 );
1577     }
1578     if ( aExp == 0 ) {
1579         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1580         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581         --aExp;
1582     }
1583     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1584
1585 }
1586
1587 #endif
1588
1589 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1590 /*
1591 -------------------------------------------------------------------------------
1592 Rounds the single-precision floating-point value `a' to an integer, and
1593 returns the result as a single-precision floating-point value.  The
1594 operation is performed according to the IEC/IEEE Standard for Binary
1595 Floating-Point Arithmetic.
1596 -------------------------------------------------------------------------------
1597 */
1598 float32 float32_round_to_int( float32 a )
1599 {
1600     flag aSign;
1601     int16 aExp;
1602     bits32 lastBitMask, roundBitsMask;
1603     int8 roundingMode;
1604     float32 z;
1605
1606     aExp = extractFloat32Exp( a );
1607     if ( 0x96 <= aExp ) {
1608         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1609             return propagateFloat32NaN( a, a );
1610         }
1611         return a;
1612     }
1613     if ( aExp <= 0x7E ) {
1614         if ( (bits32) ( a<<1 ) == 0 ) return a;
1615         float_exception_flags |= float_flag_inexact;
1616         aSign = extractFloat32Sign( a );
1617         switch ( float_rounding_mode ) {
1618          case float_round_nearest_even:
1619             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1620                 return packFloat32( aSign, 0x7F, 0 );
1621             }
1622             break;
1623          case float_round_to_zero:
1624             break;
1625          case float_round_down:
1626             return aSign ? 0xBF800000 : 0;
1627          case float_round_up:
1628             return aSign ? 0x80000000 : 0x3F800000;
1629         }
1630         return packFloat32( aSign, 0, 0 );
1631     }
1632     lastBitMask = 1;
1633     lastBitMask <<= 0x96 - aExp;
1634     roundBitsMask = lastBitMask - 1;
1635     z = a;
1636     roundingMode = float_rounding_mode;
1637     if ( roundingMode == float_round_nearest_even ) {
1638         z += lastBitMask>>1;
1639         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1640     }
1641     else if ( roundingMode != float_round_to_zero ) {
1642         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1643             z += roundBitsMask;
1644         }
1645     }
1646     z &= ~ roundBitsMask;
1647     if ( z != a ) float_exception_flags |= float_flag_inexact;
1648     return z;
1649
1650 }
1651 #endif /* !SOFTFLOAT_FOR_GCC */
1652
1653 /*
1654 -------------------------------------------------------------------------------
1655 Returns the result of adding the absolute values of the single-precision
1656 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1657 before being returned.  `zSign' is ignored if the result is a NaN.
1658 The addition is performed according to the IEC/IEEE Standard for Binary
1659 Floating-Point Arithmetic.
1660 -------------------------------------------------------------------------------
1661 */
1662 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1663 {
1664     int16 aExp, bExp, zExp;
1665     bits32 aSig, bSig, zSig;
1666     int16 expDiff;
1667
1668     aSig = extractFloat32Frac( a );
1669     aExp = extractFloat32Exp( a );
1670     bSig = extractFloat32Frac( b );
1671     bExp = extractFloat32Exp( b );
1672     expDiff = aExp - bExp;
1673     aSig <<= 6;
1674     bSig <<= 6;
1675     if ( 0 < expDiff ) {
1676         if ( aExp == 0xFF ) {
1677             if ( aSig ) return propagateFloat32NaN( a, b );
1678             return a;
1679         }
1680         if ( bExp == 0 ) {
1681             --expDiff;
1682         }
1683         else {
1684             bSig |= 0x20000000;
1685         }
1686         shift32RightJamming( bSig, expDiff, &bSig );
1687         zExp = aExp;
1688     }
1689     else if ( expDiff < 0 ) {
1690         if ( bExp == 0xFF ) {
1691             if ( bSig ) return propagateFloat32NaN( a, b );
1692             return packFloat32( zSign, 0xFF, 0 );
1693         }
1694         if ( aExp == 0 ) {
1695             ++expDiff;
1696         }
1697         else {
1698             aSig |= 0x20000000;
1699         }
1700         shift32RightJamming( aSig, - expDiff, &aSig );
1701         zExp = bExp;
1702     }
1703     else {
1704         if ( aExp == 0xFF ) {
1705             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1706             return a;
1707         }
1708         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1709         zSig = 0x40000000 + aSig + bSig;
1710         zExp = aExp;
1711         goto roundAndPack;
1712     }
1713     aSig |= 0x20000000;
1714     zSig = ( aSig + bSig )<<1;
1715     --zExp;
1716     if ( (sbits32) zSig < 0 ) {
1717         zSig = aSig + bSig;
1718         ++zExp;
1719     }
1720  roundAndPack:
1721     return roundAndPackFloat32( zSign, zExp, zSig );
1722
1723 }
1724
1725 /*
1726 -------------------------------------------------------------------------------
1727 Returns the result of subtracting the absolute values of the single-
1728 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1729 difference is negated before being returned.  `zSign' is ignored if the
1730 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1731 Standard for Binary Floating-Point Arithmetic.
1732 -------------------------------------------------------------------------------
1733 */
1734 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1735 {
1736     int16 aExp, bExp, zExp;
1737     bits32 aSig, bSig, zSig;
1738     int16 expDiff;
1739
1740     aSig = extractFloat32Frac( a );
1741     aExp = extractFloat32Exp( a );
1742     bSig = extractFloat32Frac( b );
1743     bExp = extractFloat32Exp( b );
1744     expDiff = aExp - bExp;
1745     aSig <<= 7;
1746     bSig <<= 7;
1747     if ( 0 < expDiff ) goto aExpBigger;
1748     if ( expDiff < 0 ) goto bExpBigger;
1749     if ( aExp == 0xFF ) {
1750         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1751         float_raise( float_flag_invalid );
1752         return float32_default_nan;
1753     }
1754     if ( aExp == 0 ) {
1755         aExp = 1;
1756         bExp = 1;
1757     }
1758     if ( bSig < aSig ) goto aBigger;
1759     if ( aSig < bSig ) goto bBigger;
1760     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1761  bExpBigger:
1762     if ( bExp == 0xFF ) {
1763         if ( bSig ) return propagateFloat32NaN( a, b );
1764         return packFloat32( zSign ^ 1, 0xFF, 0 );
1765     }
1766     if ( aExp == 0 ) {
1767         ++expDiff;
1768     }
1769     else {
1770         aSig |= 0x40000000;
1771     }
1772     shift32RightJamming( aSig, - expDiff, &aSig );
1773     bSig |= 0x40000000;
1774  bBigger:
1775     zSig = bSig - aSig;
1776     zExp = bExp;
1777     zSign ^= 1;
1778     goto normalizeRoundAndPack;
1779  aExpBigger:
1780     if ( aExp == 0xFF ) {
1781         if ( aSig ) return propagateFloat32NaN( a, b );
1782         return a;
1783     }
1784     if ( bExp == 0 ) {
1785         --expDiff;
1786     }
1787     else {
1788         bSig |= 0x40000000;
1789     }
1790     shift32RightJamming( bSig, expDiff, &bSig );
1791     aSig |= 0x40000000;
1792  aBigger:
1793     zSig = aSig - bSig;
1794     zExp = aExp;
1795  normalizeRoundAndPack:
1796     --zExp;
1797     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1798
1799 }
1800
1801 /*
1802 -------------------------------------------------------------------------------
1803 Returns the result of adding the single-precision floating-point values `a'
1804 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1805 Binary Floating-Point Arithmetic.
1806 -------------------------------------------------------------------------------
1807 */
1808 float32 float32_add( float32 a, float32 b )
1809 {
1810     flag aSign, bSign;
1811
1812     aSign = extractFloat32Sign( a );
1813     bSign = extractFloat32Sign( b );
1814     if ( aSign == bSign ) {
1815         return addFloat32Sigs( a, b, aSign );
1816     }
1817     else {
1818         return subFloat32Sigs( a, b, aSign );
1819     }
1820
1821 }
1822
1823 /*
1824 -------------------------------------------------------------------------------
1825 Returns the result of subtracting the single-precision floating-point values
1826 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1827 for Binary Floating-Point Arithmetic.
1828 -------------------------------------------------------------------------------
1829 */
1830 float32 float32_sub( float32 a, float32 b )
1831 {
1832     flag aSign, bSign;
1833
1834     aSign = extractFloat32Sign( a );
1835     bSign = extractFloat32Sign( b );
1836     if ( aSign == bSign ) {
1837         return subFloat32Sigs( a, b, aSign );
1838     }
1839     else {
1840         return addFloat32Sigs( a, b, aSign );
1841     }
1842
1843 }
1844
1845 /*
1846 -------------------------------------------------------------------------------
1847 Returns the result of multiplying the single-precision floating-point values
1848 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1849 for Binary Floating-Point Arithmetic.
1850 -------------------------------------------------------------------------------
1851 */
1852 float32 float32_mul( float32 a, float32 b )
1853 {
1854     flag aSign, bSign, zSign;
1855     int16 aExp, bExp, zExp;
1856     bits32 aSig, bSig;
1857     bits64 zSig64;
1858     bits32 zSig;
1859
1860     aSig = extractFloat32Frac( a );
1861     aExp = extractFloat32Exp( a );
1862     aSign = extractFloat32Sign( a );
1863     bSig = extractFloat32Frac( b );
1864     bExp = extractFloat32Exp( b );
1865     bSign = extractFloat32Sign( b );
1866     zSign = aSign ^ bSign;
1867     if ( aExp == 0xFF ) {
1868         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1869             return propagateFloat32NaN( a, b );
1870         }
1871         if ( ( bExp | bSig ) == 0 ) {
1872             float_raise( float_flag_invalid );
1873             return float32_default_nan;
1874         }
1875         return packFloat32( zSign, 0xFF, 0 );
1876     }
1877     if ( bExp == 0xFF ) {
1878         if ( bSig ) return propagateFloat32NaN( a, b );
1879         if ( ( aExp | aSig ) == 0 ) {
1880             float_raise( float_flag_invalid );
1881             return float32_default_nan;
1882         }
1883         return packFloat32( zSign, 0xFF, 0 );
1884     }
1885     if ( aExp == 0 ) {
1886         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1887         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1888     }
1889     if ( bExp == 0 ) {
1890         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1891         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1892     }
1893     zExp = aExp + bExp - 0x7F;
1894     aSig = ( aSig | 0x00800000 )<<7;
1895     bSig = ( bSig | 0x00800000 )<<8;
1896     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1897     zSig = zSig64;
1898     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1899         zSig <<= 1;
1900         --zExp;
1901     }
1902     return roundAndPackFloat32( zSign, zExp, zSig );
1903
1904 }
1905
1906 /*
1907 -------------------------------------------------------------------------------
1908 Returns the result of dividing the single-precision floating-point value `a'
1909 by the corresponding value `b'.  The operation is performed according to the
1910 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1911 -------------------------------------------------------------------------------
1912 */
1913 float32 float32_div( float32 a, float32 b )
1914 {
1915     flag aSign, bSign, zSign;
1916     int16 aExp, bExp, zExp;
1917     bits32 aSig, bSig, zSig;
1918
1919     aSig = extractFloat32Frac( a );
1920     aExp = extractFloat32Exp( a );
1921     aSign = extractFloat32Sign( a );
1922     bSig = extractFloat32Frac( b );
1923     bExp = extractFloat32Exp( b );
1924     bSign = extractFloat32Sign( b );
1925     zSign = aSign ^ bSign;
1926     if ( aExp == 0xFF ) {
1927         if ( aSig ) return propagateFloat32NaN( a, b );
1928         if ( bExp == 0xFF ) {
1929             if ( bSig ) return propagateFloat32NaN( a, b );
1930             float_raise( float_flag_invalid );
1931             return float32_default_nan;
1932         }
1933         return packFloat32( zSign, 0xFF, 0 );
1934     }
1935     if ( bExp == 0xFF ) {
1936         if ( bSig ) return propagateFloat32NaN( a, b );
1937         return packFloat32( zSign, 0, 0 );
1938     }
1939     if ( bExp == 0 ) {
1940         if ( bSig == 0 ) {
1941             if ( ( aExp | aSig ) == 0 ) {
1942                 float_raise( float_flag_invalid );
1943                 return float32_default_nan;
1944             }
1945             float_raise( float_flag_divbyzero );
1946             return packFloat32( zSign, 0xFF, 0 );
1947         }
1948         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1949     }
1950     if ( aExp == 0 ) {
1951         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1953     }
1954     zExp = aExp - bExp + 0x7D;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     if ( bSig <= ( aSig + aSig ) ) {
1958         aSig >>= 1;
1959         ++zExp;
1960     }
1961     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1962     if ( ( zSig & 0x3F ) == 0 ) {
1963         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1964     }
1965     return roundAndPackFloat32( zSign, zExp, zSig );
1966
1967 }
1968
1969 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1970 /*
1971 -------------------------------------------------------------------------------
1972 Returns the remainder of the single-precision floating-point value `a'
1973 with respect to the corresponding value `b'.  The operation is performed
1974 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1975 -------------------------------------------------------------------------------
1976 */
1977 float32 float32_rem( float32 a, float32 b )
1978 {
1979     flag aSign, bSign, zSign;
1980     int16 aExp, bExp, expDiff;
1981     bits32 aSig, bSig;
1982     bits32 q;
1983     bits64 aSig64, bSig64, q64;
1984     bits32 alternateASig;
1985     sbits32 sigMean;
1986
1987     aSig = extractFloat32Frac( a );
1988     aExp = extractFloat32Exp( a );
1989     aSign = extractFloat32Sign( a );
1990     bSig = extractFloat32Frac( b );
1991     bExp = extractFloat32Exp( b );
1992     bSign = extractFloat32Sign( b );
1993     if ( aExp == 0xFF ) {
1994         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1995             return propagateFloat32NaN( a, b );
1996         }
1997         float_raise( float_flag_invalid );
1998         return float32_default_nan;
1999     }
2000     if ( bExp == 0xFF ) {
2001         if ( bSig ) return propagateFloat32NaN( a, b );
2002         return a;
2003     }
2004     if ( bExp == 0 ) {
2005         if ( bSig == 0 ) {
2006             float_raise( float_flag_invalid );
2007             return float32_default_nan;
2008         }
2009         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2010     }
2011     if ( aExp == 0 ) {
2012         if ( aSig == 0 ) return a;
2013         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2014     }
2015     expDiff = aExp - bExp;
2016     aSig |= 0x00800000;
2017     bSig |= 0x00800000;
2018     if ( expDiff < 32 ) {
2019         aSig <<= 8;
2020         bSig <<= 8;
2021         if ( expDiff < 0 ) {
2022             if ( expDiff < -1 ) return a;
2023             aSig >>= 1;
2024         }
2025         q = ( bSig <= aSig );
2026         if ( q ) aSig -= bSig;
2027         if ( 0 < expDiff ) {
2028             q = ( ( (bits64) aSig )<<32 ) / bSig;
2029             q >>= 32 - expDiff;
2030             bSig >>= 2;
2031             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2032         }
2033         else {
2034             aSig >>= 2;
2035             bSig >>= 2;
2036         }
2037     }
2038     else {
2039         if ( bSig <= aSig ) aSig -= bSig;
2040         aSig64 = ( (bits64) aSig )<<40;
2041         bSig64 = ( (bits64) bSig )<<40;
2042         expDiff -= 64;
2043         while ( 0 < expDiff ) {
2044             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2045             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2046             aSig64 = - ( ( bSig * q64 )<<38 );
2047             expDiff -= 62;
2048         }
2049         expDiff += 64;
2050         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2051         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2052         q = q64>>( 64 - expDiff );
2053         bSig <<= 6;
2054         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2055     }
2056     do {
2057         alternateASig = aSig;
2058         ++q;
2059         aSig -= bSig;
2060     } while ( 0 <= (sbits32) aSig );
2061     sigMean = aSig + alternateASig;
2062     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2063         aSig = alternateASig;
2064     }
2065     zSign = ( (sbits32) aSig < 0 );
2066     if ( zSign ) aSig = - aSig;
2067     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2068
2069 }
2070 #endif /* !SOFTFLOAT_FOR_GCC */
2071
2072 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2073 /*
2074 -------------------------------------------------------------------------------
2075 Returns the square root of the single-precision floating-point value `a'.
2076 The operation is performed according to the IEC/IEEE Standard for Binary
2077 Floating-Point Arithmetic.
2078 -------------------------------------------------------------------------------
2079 */
2080 float32 float32_sqrt( float32 a )
2081 {
2082     flag aSign;
2083     int16 aExp, zExp;
2084     bits32 aSig, zSig;
2085     bits64 rem, term;
2086
2087     aSig = extractFloat32Frac( a );
2088     aExp = extractFloat32Exp( a );
2089     aSign = extractFloat32Sign( a );
2090     if ( aExp == 0xFF ) {
2091         if ( aSig ) return propagateFloat32NaN( a, 0 );
2092         if ( ! aSign ) return a;
2093         float_raise( float_flag_invalid );
2094         return float32_default_nan;
2095     }
2096     if ( aSign ) {
2097         if ( ( aExp | aSig ) == 0 ) return a;
2098         float_raise( float_flag_invalid );
2099         return float32_default_nan;
2100     }
2101     if ( aExp == 0 ) {
2102         if ( aSig == 0 ) return 0;
2103         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2104     }
2105     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2106     aSig = ( aSig | 0x00800000 )<<8;
2107     zSig = estimateSqrt32( aExp, aSig ) + 2;
2108     if ( ( zSig & 0x7F ) <= 5 ) {
2109         if ( zSig < 2 ) {
2110             zSig = 0x7FFFFFFF;
2111             goto roundAndPack;
2112         }
2113         aSig >>= aExp & 1;
2114         term = ( (bits64) zSig ) * zSig;
2115         rem = ( ( (bits64) aSig )<<32 ) - term;
2116         while ( (sbits64) rem < 0 ) {
2117             --zSig;
2118             rem += ( ( (bits64) zSig )<<1 ) | 1;
2119         }
2120         zSig |= ( rem != 0 );
2121     }
2122     shift32RightJamming( zSig, 1, &zSig );
2123  roundAndPack:
2124     return roundAndPackFloat32( 0, zExp, zSig );
2125
2126 }
2127 #endif /* !SOFTFLOAT_FOR_GCC */
2128
2129 /*
2130 -------------------------------------------------------------------------------
2131 Returns 1 if the single-precision floating-point value `a' is equal to
2132 the corresponding value `b', and 0 otherwise.  The comparison is performed
2133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134 -------------------------------------------------------------------------------
2135 */
2136 flag float32_eq( float32 a, float32 b )
2137 {
2138
2139     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2140          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2141        ) {
2142         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2143             float_raise( float_flag_invalid );
2144         }
2145         return 0;
2146     }
2147     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2148
2149 }
2150
2151 /*
2152 -------------------------------------------------------------------------------
2153 Returns 1 if the single-precision floating-point value `a' is less than
2154 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2156 Arithmetic.
2157 -------------------------------------------------------------------------------
2158 */
2159 flag float32_le( float32 a, float32 b )
2160 {
2161     flag aSign, bSign;
2162
2163     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2164          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2165        ) {
2166         float_raise( float_flag_invalid );
2167         return 0;
2168     }
2169     aSign = extractFloat32Sign( a );
2170     bSign = extractFloat32Sign( b );
2171     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2172     return ( a == b ) || ( aSign ^ ( a < b ) );
2173
2174 }
2175
2176 /*
2177 -------------------------------------------------------------------------------
2178 Returns 1 if the single-precision floating-point value `a' is less than
2179 the corresponding value `b', and 0 otherwise.  The comparison is performed
2180 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2181 -------------------------------------------------------------------------------
2182 */
2183 flag float32_lt( float32 a, float32 b )
2184 {
2185     flag aSign, bSign;
2186
2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189        ) {
2190         float_raise( float_flag_invalid );
2191         return 0;
2192     }
2193     aSign = extractFloat32Sign( a );
2194     bSign = extractFloat32Sign( b );
2195     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2196     return ( a != b ) && ( aSign ^ ( a < b ) );
2197
2198 }
2199
2200 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2201 /*
2202 -------------------------------------------------------------------------------
2203 Returns 1 if the single-precision floating-point value `a' is equal to
2204 the corresponding value `b', and 0 otherwise.  The invalid exception is
2205 raised if either operand is a NaN.  Otherwise, the comparison is performed
2206 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2207 -------------------------------------------------------------------------------
2208 */
2209 flag float32_eq_signaling( float32 a, float32 b )
2210 {
2211
2212     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2213          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2214        ) {
2215         float_raise( float_flag_invalid );
2216         return 0;
2217     }
2218     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2219
2220 }
2221
2222 /*
2223 -------------------------------------------------------------------------------
2224 Returns 1 if the single-precision floating-point value `a' is less than or
2225 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2226 cause an exception.  Otherwise, the comparison is performed according to the
2227 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2228 -------------------------------------------------------------------------------
2229 */
2230 flag float32_le_quiet( float32 a, float32 b )
2231 {
2232     flag aSign, bSign;
2233
2234     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2235          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2236        ) {
2237         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2238             float_raise( float_flag_invalid );
2239         }
2240         return 0;
2241     }
2242     aSign = extractFloat32Sign( a );
2243     bSign = extractFloat32Sign( b );
2244     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2245     return ( a == b ) || ( aSign ^ ( a < b ) );
2246
2247 }
2248
2249 /*
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is less than
2252 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2253 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2254 Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
2257 flag float32_lt_quiet( float32 a, float32 b )
2258 {
2259     flag aSign, bSign;
2260
2261     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2262          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2263        ) {
2264         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2265             float_raise( float_flag_invalid );
2266         }
2267         return 0;
2268     }
2269     aSign = extractFloat32Sign( a );
2270     bSign = extractFloat32Sign( b );
2271     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2272     return ( a != b ) && ( aSign ^ ( a < b ) );
2273
2274 }
2275 #endif /* !SOFTFLOAT_FOR_GCC */
2276
2277 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2278 /*
2279 -------------------------------------------------------------------------------
2280 Returns the result of converting the double-precision floating-point value
2281 `a' to the 32-bit two's complement integer format.  The conversion is
2282 performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 Arithmetic---which means in particular that the conversion is rounded
2284 according to the current rounding mode.  If `a' is a NaN, the largest
2285 positive integer is returned.  Otherwise, if the conversion overflows, the
2286 largest integer with the same sign as `a' is returned.
2287 -------------------------------------------------------------------------------
2288 */
2289 int32 float64_to_int32( float64 a )
2290 {
2291     flag aSign;
2292     int16 aExp, shiftCount;
2293     bits64 aSig;
2294
2295     aSig = extractFloat64Frac( a );
2296     aExp = extractFloat64Exp( a );
2297     aSign = extractFloat64Sign( a );
2298     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2299     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2300     shiftCount = 0x42C - aExp;
2301     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2302     return roundAndPackInt32( aSign, aSig );
2303
2304 }
2305 #endif /* !SOFTFLOAT_FOR_GCC */
2306
2307 /*
2308 -------------------------------------------------------------------------------
2309 Returns the result of converting the double-precision floating-point value
2310 `a' to the 32-bit two's complement integer format.  The conversion is
2311 performed according to the IEC/IEEE Standard for Binary Floating-Point
2312 Arithmetic, except that the conversion is always rounded toward zero.
2313 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2314 the conversion overflows, the largest integer with the same sign as `a' is
2315 returned.
2316 -------------------------------------------------------------------------------
2317 */
2318 int32 float64_to_int32_round_to_zero( float64 a )
2319 {
2320     flag aSign;
2321     int16 aExp, shiftCount;
2322     bits64 aSig, savedASig;
2323     int32 z;
2324
2325     aSig = extractFloat64Frac( a );
2326     aExp = extractFloat64Exp( a );
2327     aSign = extractFloat64Sign( a );
2328     if ( 0x41E < aExp ) {
2329         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2330         goto invalid;
2331     }
2332     else if ( aExp < 0x3FF ) {
2333         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2334         return 0;
2335     }
2336     aSig |= LIT64( 0x0010000000000000 );
2337     shiftCount = 0x433 - aExp;
2338     savedASig = aSig;
2339     aSig >>= shiftCount;
2340     z = aSig;
2341     if ( aSign ) z = - z;
2342     if ( ( z < 0 ) ^ aSign ) {
2343  invalid:
2344         float_raise( float_flag_invalid );
2345         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2346     }
2347     if ( ( aSig<<shiftCount ) != savedASig ) {
2348         float_exception_flags |= float_flag_inexact;
2349     }
2350     return z;
2351
2352 }
2353
2354 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 64-bit two's complement integer format.  The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic---which means in particular that the conversion is rounded
2361 according to the current rounding mode.  If `a' is a NaN, the largest
2362 positive integer is returned.  Otherwise, if the conversion overflows, the
2363 largest integer with the same sign as `a' is returned.
2364 -------------------------------------------------------------------------------
2365 */
2366 int64 float64_to_int64( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, aSigExtra;
2371
2372     aSig = extractFloat64Frac( a );
2373     aExp = extractFloat64Exp( a );
2374     aSign = extractFloat64Sign( a );
2375     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2376     shiftCount = 0x433 - aExp;
2377     if ( shiftCount <= 0 ) {
2378         if ( 0x43E < aExp ) {
2379             float_raise( float_flag_invalid );
2380             if (    ! aSign
2381                  || (    ( aExp == 0x7FF )
2382                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2383                ) {
2384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2385             }
2386             return (sbits64) LIT64( 0x8000000000000000 );
2387         }
2388         aSigExtra = 0;
2389         aSig <<= - shiftCount;
2390     }
2391     else {
2392         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2393     }
2394     return roundAndPackInt64( aSign, aSig, aSigExtra );
2395
2396 }
2397
2398 /*
2399 -------------------------------------------------------------------------------
2400 Returns the result of converting the double-precision floating-point value
2401 `a' to the 64-bit two's complement integer format.  The conversion is
2402 performed according to the IEC/IEEE Standard for Binary Floating-Point
2403 Arithmetic, except that the conversion is always rounded toward zero.
2404 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2405 the conversion overflows, the largest integer with the same sign as `a' is
2406 returned.
2407 -------------------------------------------------------------------------------
2408 */
2409 int64 float64_to_int64_round_to_zero( float64 a )
2410 {
2411     flag aSign;
2412     int16 aExp, shiftCount;
2413     bits64 aSig;
2414     int64 z;
2415
2416     aSig = extractFloat64Frac( a );
2417     aExp = extractFloat64Exp( a );
2418     aSign = extractFloat64Sign( a );
2419     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2420     shiftCount = aExp - 0x433;
2421     if ( 0 <= shiftCount ) {
2422         if ( 0x43E <= aExp ) {
2423             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2424                 float_raise( float_flag_invalid );
2425                 if (    ! aSign
2426                      || (    ( aExp == 0x7FF )
2427                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2428                    ) {
2429                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2430                 }
2431             }
2432             return (sbits64) LIT64( 0x8000000000000000 );
2433         }
2434         z = aSig<<shiftCount;
2435     }
2436     else {
2437         if ( aExp < 0x3FE ) {
2438             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2439             return 0;
2440         }
2441         z = aSig>>( - shiftCount );
2442         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2443             float_exception_flags |= float_flag_inexact;
2444         }
2445     }
2446     if ( aSign ) z = - z;
2447     return z;
2448
2449 }
2450 #endif /* !SOFTFLOAT_FOR_GCC */
2451
2452 /*
2453 -------------------------------------------------------------------------------
2454 Returns the result of converting the double-precision floating-point value
2455 `a' to the single-precision floating-point format.  The conversion is
2456 performed according to the IEC/IEEE Standard for Binary Floating-Point
2457 Arithmetic.
2458 -------------------------------------------------------------------------------
2459 */
2460 float32 float64_to_float32( float64 a )
2461 {
2462     flag aSign;
2463     int16 aExp;
2464     bits64 aSig;
2465     bits32 zSig;
2466
2467     aSig = extractFloat64Frac( a );
2468     aExp = extractFloat64Exp( a );
2469     aSign = extractFloat64Sign( a );
2470     if ( aExp == 0x7FF ) {
2471         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2472         return packFloat32( aSign, 0xFF, 0 );
2473     }
2474     shift64RightJamming( aSig, 22, &aSig );
2475     zSig = aSig;
2476     if ( aExp || zSig ) {
2477         zSig |= 0x40000000;
2478         aExp -= 0x381;
2479     }
2480     return roundAndPackFloat32( aSign, aExp, zSig );
2481
2482 }
2483
2484 #ifdef FLOATX80
2485
2486 /*
2487 -------------------------------------------------------------------------------
2488 Returns the result of converting the double-precision floating-point value
2489 `a' to the extended double-precision floating-point format.  The conversion
2490 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2491 Arithmetic.
2492 -------------------------------------------------------------------------------
2493 */
2494 floatx80 float64_to_floatx80( float64 a )
2495 {
2496     flag aSign;
2497     int16 aExp;
2498     bits64 aSig;
2499
2500     aSig = extractFloat64Frac( a );
2501     aExp = extractFloat64Exp( a );
2502     aSign = extractFloat64Sign( a );
2503     if ( aExp == 0x7FF ) {
2504         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2505         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2506     }
2507     if ( aExp == 0 ) {
2508         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2509         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2510     }
2511     return
2512         packFloatx80(
2513             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2514
2515 }
2516
2517 #endif
2518
2519 #ifdef FLOAT128
2520
2521 /*
2522 -------------------------------------------------------------------------------
2523 Returns the result of converting the double-precision floating-point value
2524 `a' to the quadruple-precision floating-point format.  The conversion is
2525 performed according to the IEC/IEEE Standard for Binary Floating-Point
2526 Arithmetic.
2527 -------------------------------------------------------------------------------
2528 */
2529 float128 float64_to_float128( float64 a )
2530 {
2531     flag aSign;
2532     int16 aExp;
2533     bits64 aSig, zSig0, zSig1;
2534
2535     aSig = extractFloat64Frac( a );
2536     aExp = extractFloat64Exp( a );
2537     aSign = extractFloat64Sign( a );
2538     if ( aExp == 0x7FF ) {
2539         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2540         return packFloat128( aSign, 0x7FFF, 0, 0 );
2541     }
2542     if ( aExp == 0 ) {
2543         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2544         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2545         --aExp;
2546     }
2547     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2548     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2549
2550 }
2551
2552 #endif
2553
2554 #ifndef SOFTFLOAT_FOR_GCC
2555 /*
2556 -------------------------------------------------------------------------------
2557 Rounds the double-precision floating-point value `a' to an integer, and
2558 returns the result as a double-precision floating-point value.  The
2559 operation is performed according to the IEC/IEEE Standard for Binary
2560 Floating-Point Arithmetic.
2561 -------------------------------------------------------------------------------
2562 */
2563 float64 float64_round_to_int( float64 a )
2564 {
2565     flag aSign;
2566     int16 aExp;
2567     bits64 lastBitMask, roundBitsMask;
2568     int8 roundingMode;
2569     float64 z;
2570
2571     aExp = extractFloat64Exp( a );
2572     if ( 0x433 <= aExp ) {
2573         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2574             return propagateFloat64NaN( a, a );
2575         }
2576         return a;
2577     }
2578     if ( aExp < 0x3FF ) {
2579         if ( (bits64) ( a<<1 ) == 0 ) return a;
2580         float_exception_flags |= float_flag_inexact;
2581         aSign = extractFloat64Sign( a );
2582         switch ( float_rounding_mode ) {
2583          case float_round_nearest_even:
2584             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2585                 return packFloat64( aSign, 0x3FF, 0 );
2586             }
2587             break;
2588          case float_round_to_zero:
2589             break;
2590          case float_round_down:
2591             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2592          case float_round_up:
2593             return
2594             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2595         }
2596         return packFloat64( aSign, 0, 0 );
2597     }
2598     lastBitMask = 1;
2599     lastBitMask <<= 0x433 - aExp;
2600     roundBitsMask = lastBitMask - 1;
2601     z = a;
2602     roundingMode = float_rounding_mode;
2603     if ( roundingMode == float_round_nearest_even ) {
2604         z += lastBitMask>>1;
2605         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2606     }
2607     else if ( roundingMode != float_round_to_zero ) {
2608         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2609             z += roundBitsMask;
2610         }
2611     }
2612     z &= ~ roundBitsMask;
2613     if ( z != a ) float_exception_flags |= float_flag_inexact;
2614     return z;
2615
2616 }
2617 #endif
2618
2619 /*
2620 -------------------------------------------------------------------------------
2621 Returns the result of adding the absolute values of the double-precision
2622 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2623 before being returned.  `zSign' is ignored if the result is a NaN.
2624 The addition is performed according to the IEC/IEEE Standard for Binary
2625 Floating-Point Arithmetic.
2626 -------------------------------------------------------------------------------
2627 */
2628 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2629 {
2630     int16 aExp, bExp, zExp;
2631     bits64 aSig, bSig, zSig;
2632     int16 expDiff;
2633
2634     aSig = extractFloat64Frac( a );
2635     aExp = extractFloat64Exp( a );
2636     bSig = extractFloat64Frac( b );
2637     bExp = extractFloat64Exp( b );
2638     expDiff = aExp - bExp;
2639     aSig <<= 9;
2640     bSig <<= 9;
2641     if ( 0 < expDiff ) {
2642         if ( aExp == 0x7FF ) {
2643             if ( aSig ) return propagateFloat64NaN( a, b );
2644             return a;
2645         }
2646         if ( bExp == 0 ) {
2647             --expDiff;
2648         }
2649         else {
2650             bSig |= LIT64( 0x2000000000000000 );
2651         }
2652         shift64RightJamming( bSig, expDiff, &bSig );
2653         zExp = aExp;
2654     }
2655     else if ( expDiff < 0 ) {
2656         if ( bExp == 0x7FF ) {
2657             if ( bSig ) return propagateFloat64NaN( a, b );
2658             return packFloat64( zSign, 0x7FF, 0 );
2659         }
2660         if ( aExp == 0 ) {
2661             ++expDiff;
2662         }
2663         else {
2664             aSig |= LIT64( 0x2000000000000000 );
2665         }
2666         shift64RightJamming( aSig, - expDiff, &aSig );
2667         zExp = bExp;
2668     }
2669     else {
2670         if ( aExp == 0x7FF ) {
2671             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2672             return a;
2673         }
2674         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2675         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2676         zExp = aExp;
2677         goto roundAndPack;
2678     }
2679     aSig |= LIT64( 0x2000000000000000 );
2680     zSig = ( aSig + bSig )<<1;
2681     --zExp;
2682     if ( (sbits64) zSig < 0 ) {
2683         zSig = aSig + bSig;
2684         ++zExp;
2685     }
2686  roundAndPack:
2687     return roundAndPackFloat64( zSign, zExp, zSig );
2688
2689 }
2690
2691 /*
2692 -------------------------------------------------------------------------------
2693 Returns the result of subtracting the absolute values of the double-
2694 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2695 difference is negated before being returned.  `zSign' is ignored if the
2696 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2697 Standard for Binary Floating-Point Arithmetic.
2698 -------------------------------------------------------------------------------
2699 */
2700 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2701 {
2702     int16 aExp, bExp, zExp;
2703     bits64 aSig, bSig, zSig;
2704     int16 expDiff;
2705
2706     aSig = extractFloat64Frac( a );
2707     aExp = extractFloat64Exp( a );
2708     bSig = extractFloat64Frac( b );
2709     bExp = extractFloat64Exp( b );
2710     expDiff = aExp - bExp;
2711     aSig <<= 10;
2712     bSig <<= 10;
2713     if ( 0 < expDiff ) goto aExpBigger;
2714     if ( expDiff < 0 ) goto bExpBigger;
2715     if ( aExp == 0x7FF ) {
2716         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2717         float_raise( float_flag_invalid );
2718         return float64_default_nan;
2719     }
2720     if ( aExp == 0 ) {
2721         aExp = 1;
2722         bExp = 1;
2723     }
2724     if ( bSig < aSig ) goto aBigger;
2725     if ( aSig < bSig ) goto bBigger;
2726     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2727  bExpBigger:
2728     if ( bExp == 0x7FF ) {
2729         if ( bSig ) return propagateFloat64NaN( a, b );
2730         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2731     }
2732     if ( aExp == 0 ) {
2733         ++expDiff;
2734     }
2735     else {
2736         aSig |= LIT64( 0x4000000000000000 );
2737     }
2738     shift64RightJamming( aSig, - expDiff, &aSig );
2739     bSig |= LIT64( 0x4000000000000000 );
2740  bBigger:
2741     zSig = bSig - aSig;
2742     zExp = bExp;
2743     zSign ^= 1;
2744     goto normalizeRoundAndPack;
2745  aExpBigger:
2746     if ( aExp == 0x7FF ) {
2747         if ( aSig ) return propagateFloat64NaN( a, b );
2748         return a;
2749     }
2750     if ( bExp == 0 ) {
2751         --expDiff;
2752     }
2753     else {
2754         bSig |= LIT64( 0x4000000000000000 );
2755     }
2756     shift64RightJamming( bSig, expDiff, &bSig );
2757     aSig |= LIT64( 0x4000000000000000 );
2758  aBigger:
2759     zSig = aSig - bSig;
2760     zExp = aExp;
2761  normalizeRoundAndPack:
2762     --zExp;
2763     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2764
2765 }
2766
2767 /*
2768 -------------------------------------------------------------------------------
2769 Returns the result of adding the double-precision floating-point values `a'
2770 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2771 Binary Floating-Point Arithmetic.
2772 -------------------------------------------------------------------------------
2773 */
2774 float64 float64_add( float64 a, float64 b )
2775 {
2776     flag aSign, bSign;
2777
2778     aSign = extractFloat64Sign( a );
2779     bSign = extractFloat64Sign( b );
2780     if ( aSign == bSign ) {
2781         return addFloat64Sigs( a, b, aSign );
2782     }
2783     else {
2784         return subFloat64Sigs( a, b, aSign );
2785     }
2786
2787 }
2788
2789 /*
2790 -------------------------------------------------------------------------------
2791 Returns the result of subtracting the double-precision floating-point values
2792 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2793 for Binary Floating-Point Arithmetic.
2794 -------------------------------------------------------------------------------
2795 */
2796 float64 float64_sub( float64 a, float64 b )
2797 {
2798     flag aSign, bSign;
2799
2800     aSign = extractFloat64Sign( a );
2801     bSign = extractFloat64Sign( b );
2802     if ( aSign == bSign ) {
2803         return subFloat64Sigs( a, b, aSign );
2804     }
2805     else {
2806         return addFloat64Sigs( a, b, aSign );
2807     }
2808
2809 }
2810
2811 /*
2812 -------------------------------------------------------------------------------
2813 Returns the result of multiplying the double-precision floating-point values
2814 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2815 for Binary Floating-Point Arithmetic.
2816 -------------------------------------------------------------------------------
2817 */
2818 float64 float64_mul( float64 a, float64 b )
2819 {
2820     flag aSign, bSign, zSign;
2821     int16 aExp, bExp, zExp;
2822     bits64 aSig, bSig, zSig0, zSig1;
2823
2824     aSig = extractFloat64Frac( a );
2825     aExp = extractFloat64Exp( a );
2826     aSign = extractFloat64Sign( a );
2827     bSig = extractFloat64Frac( b );
2828     bExp = extractFloat64Exp( b );
2829     bSign = extractFloat64Sign( b );
2830     zSign = aSign ^ bSign;
2831     if ( aExp == 0x7FF ) {
2832         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2833             return propagateFloat64NaN( a, b );
2834         }
2835         if ( ( bExp | bSig ) == 0 ) {
2836             float_raise( float_flag_invalid );
2837             return float64_default_nan;
2838         }
2839         return packFloat64( zSign, 0x7FF, 0 );
2840     }
2841     if ( bExp == 0x7FF ) {
2842         if ( bSig ) return propagateFloat64NaN( a, b );
2843         if ( ( aExp | aSig ) == 0 ) {
2844             float_raise( float_flag_invalid );
2845             return float64_default_nan;
2846         }
2847         return packFloat64( zSign, 0x7FF, 0 );
2848     }
2849     if ( aExp == 0 ) {
2850         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2851         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2852     }
2853     if ( bExp == 0 ) {
2854         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2855         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2856     }
2857     zExp = aExp + bExp - 0x3FF;
2858     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2859     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2860     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2861     zSig0 |= ( zSig1 != 0 );
2862     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2863         zSig0 <<= 1;
2864         --zExp;
2865     }
2866     return roundAndPackFloat64( zSign, zExp, zSig0 );
2867
2868 }
2869
2870 /*
2871 -------------------------------------------------------------------------------
2872 Returns the result of dividing the double-precision floating-point value `a'
2873 by the corresponding value `b'.  The operation is performed according to
2874 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2875 -------------------------------------------------------------------------------
2876 */
2877 float64 float64_div( float64 a, float64 b )
2878 {
2879     flag aSign, bSign, zSign;
2880     int16 aExp, bExp, zExp;
2881     bits64 aSig, bSig, zSig;
2882     bits64 rem0, rem1;
2883     bits64 term0, term1;
2884
2885     aSig = extractFloat64Frac( a );
2886     aExp = extractFloat64Exp( a );
2887     aSign = extractFloat64Sign( a );
2888     bSig = extractFloat64Frac( b );
2889     bExp = extractFloat64Exp( b );
2890     bSign = extractFloat64Sign( b );
2891     zSign = aSign ^ bSign;
2892     if ( aExp == 0x7FF ) {
2893         if ( aSig ) return propagateFloat64NaN( a, b );
2894         if ( bExp == 0x7FF ) {
2895             if ( bSig ) return propagateFloat64NaN( a, b );
2896             float_raise( float_flag_invalid );
2897             return float64_default_nan;
2898         }
2899         return packFloat64( zSign, 0x7FF, 0 );
2900     }
2901     if ( bExp == 0x7FF ) {
2902         if ( bSig ) return propagateFloat64NaN( a, b );
2903         return packFloat64( zSign, 0, 0 );
2904     }
2905     if ( bExp == 0 ) {
2906         if ( bSig == 0 ) {
2907             if ( ( aExp | aSig ) == 0 ) {
2908                 float_raise( float_flag_invalid );
2909                 return float64_default_nan;
2910             }
2911             float_raise( float_flag_divbyzero );
2912             return packFloat64( zSign, 0x7FF, 0 );
2913         }
2914         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2915     }
2916     if ( aExp == 0 ) {
2917         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2918         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2919     }
2920     zExp = aExp - bExp + 0x3FD;
2921     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2922     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2923     if ( bSig <= ( aSig + aSig ) ) {
2924         aSig >>= 1;
2925         ++zExp;
2926     }
2927     zSig = estimateDiv128To64( aSig, 0, bSig );
2928     if ( ( zSig & 0x1FF ) <= 2 ) {
2929         mul64To128( bSig, zSig, &term0, &term1 );
2930         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2931         while ( (sbits64) rem0 < 0 ) {
2932             --zSig;
2933             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2934         }
2935         zSig |= ( rem1 != 0 );
2936     }
2937     return roundAndPackFloat64( zSign, zExp, zSig );
2938
2939 }
2940
2941 #ifndef SOFTFLOAT_FOR_GCC
2942 /*
2943 -------------------------------------------------------------------------------
2944 Returns the remainder of the double-precision floating-point value `a'
2945 with respect to the corresponding value `b'.  The operation is performed
2946 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947 -------------------------------------------------------------------------------
2948 */
2949 float64 float64_rem( float64 a, float64 b )
2950 {
2951     flag aSign, bSign, zSign;
2952     int16 aExp, bExp, expDiff;
2953     bits64 aSig, bSig;
2954     bits64 q, alternateASig;
2955     sbits64 sigMean;
2956
2957     aSig = extractFloat64Frac( a );
2958     aExp = extractFloat64Exp( a );
2959     aSign = extractFloat64Sign( a );
2960     bSig = extractFloat64Frac( b );
2961     bExp = extractFloat64Exp( b );
2962     bSign = extractFloat64Sign( b );
2963     if ( aExp == 0x7FF ) {
2964         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2965             return propagateFloat64NaN( a, b );
2966         }
2967         float_raise( float_flag_invalid );
2968         return float64_default_nan;
2969     }
2970     if ( bExp == 0x7FF ) {
2971         if ( bSig ) return propagateFloat64NaN( a, b );
2972         return a;
2973     }
2974     if ( bExp == 0 ) {
2975         if ( bSig == 0 ) {
2976             float_raise( float_flag_invalid );
2977             return float64_default_nan;
2978         }
2979         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2980     }
2981     if ( aExp == 0 ) {
2982         if ( aSig == 0 ) return a;
2983         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2984     }
2985     expDiff = aExp - bExp;
2986     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2987     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2988     if ( expDiff < 0 ) {
2989         if ( expDiff < -1 ) return a;
2990         aSig >>= 1;
2991     }
2992     q = ( bSig <= aSig );
2993     if ( q ) aSig -= bSig;
2994     expDiff -= 64;
2995     while ( 0 < expDiff ) {
2996         q = estimateDiv128To64( aSig, 0, bSig );
2997         q = ( 2 < q ) ? q - 2 : 0;
2998         aSig = - ( ( bSig>>2 ) * q );
2999         expDiff -= 62;
3000     }
3001     expDiff += 64;
3002     if ( 0 < expDiff ) {
3003         q = estimateDiv128To64( aSig, 0, bSig );
3004         q = ( 2 < q ) ? q - 2 : 0;
3005         q >>= 64 - expDiff;
3006         bSig >>= 2;
3007         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3008     }
3009     else {
3010         aSig >>= 2;
3011         bSig >>= 2;
3012     }
3013     do {
3014         alternateASig = aSig;
3015         ++q;
3016         aSig -= bSig;
3017     } while ( 0 <= (sbits64) aSig );
3018     sigMean = aSig + alternateASig;
3019     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3020         aSig = alternateASig;
3021     }
3022     zSign = ( (sbits64) aSig < 0 );
3023     if ( zSign ) aSig = - aSig;
3024     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3025
3026 }
3027
3028 /*
3029 -------------------------------------------------------------------------------
3030 Returns the square root of the double-precision floating-point value `a'.
3031 The operation is performed according to the IEC/IEEE Standard for Binary
3032 Floating-Point Arithmetic.
3033 -------------------------------------------------------------------------------
3034 */
3035 float64 float64_sqrt( float64 a )
3036 {
3037     flag aSign;
3038     int16 aExp, zExp;
3039     bits64 aSig, zSig, doubleZSig;
3040     bits64 rem0, rem1, term0, term1;
3041
3042     aSig = extractFloat64Frac( a );
3043     aExp = extractFloat64Exp( a );
3044     aSign = extractFloat64Sign( a );
3045     if ( aExp == 0x7FF ) {
3046         if ( aSig ) return propagateFloat64NaN( a, a );
3047         if ( ! aSign ) return a;
3048         float_raise( float_flag_invalid );
3049         return float64_default_nan;
3050     }
3051     if ( aSign ) {
3052         if ( ( aExp | aSig ) == 0 ) return a;
3053         float_raise( float_flag_invalid );
3054         return float64_default_nan;
3055     }
3056     if ( aExp == 0 ) {
3057         if ( aSig == 0 ) return 0;
3058         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3059     }
3060     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3061     aSig |= LIT64( 0x0010000000000000 );
3062     zSig = estimateSqrt32( aExp, aSig>>21 );
3063     aSig <<= 9 - ( aExp & 1 );
3064     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3065     if ( ( zSig & 0x1FF ) <= 5 ) {
3066         doubleZSig = zSig<<1;
3067         mul64To128( zSig, zSig, &term0, &term1 );
3068         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3069         while ( (sbits64) rem0 < 0 ) {
3070             --zSig;
3071             doubleZSig -= 2;
3072             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3073         }
3074         zSig |= ( ( rem0 | rem1 ) != 0 );
3075     }
3076     return roundAndPackFloat64( 0, zExp, zSig );
3077
3078 }
3079 #endif
3080
3081 /*
3082 -------------------------------------------------------------------------------
3083 Returns 1 if the double-precision floating-point value `a' is equal to the
3084 corresponding value `b', and 0 otherwise.  The comparison is performed
3085 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3086 -------------------------------------------------------------------------------
3087 */
3088 flag float64_eq( float64 a, float64 b )
3089 {
3090
3091     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3092          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3093        ) {
3094         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3095             float_raise( float_flag_invalid );
3096         }
3097         return 0;
3098     }
3099     return ( a == b ) ||
3100         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3101
3102 }
3103
3104 /*
3105 -------------------------------------------------------------------------------
3106 Returns 1 if the double-precision floating-point value `a' is less than or
3107 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3108 performed according to the IEC/IEEE Standard for Binary Floating-Point
3109 Arithmetic.
3110 -------------------------------------------------------------------------------
3111 */
3112 flag float64_le( float64 a, float64 b )
3113 {
3114     flag aSign, bSign;
3115
3116     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3117          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3118        ) {
3119         float_raise( float_flag_invalid );
3120         return 0;
3121     }
3122     aSign = extractFloat64Sign( a );
3123     bSign = extractFloat64Sign( b );
3124     if ( aSign != bSign )
3125         return aSign ||
3126             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3127               0 );
3128     return ( a == b ) ||
3129         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3130
3131 }
3132
3133 /*
3134 -------------------------------------------------------------------------------
3135 Returns 1 if the double-precision floating-point value `a' is less than
3136 the corresponding value `b', and 0 otherwise.  The comparison is performed
3137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3138 -------------------------------------------------------------------------------
3139 */
3140 flag float64_lt( float64 a, float64 b )
3141 {
3142     flag aSign, bSign;
3143
3144     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3145          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3146        ) {
3147         float_raise( float_flag_invalid );
3148         return 0;
3149     }
3150     aSign = extractFloat64Sign( a );
3151     bSign = extractFloat64Sign( b );
3152     if ( aSign != bSign )
3153         return aSign &&
3154             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3155               0 );
3156     return ( a != b ) &&
3157         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3158
3159 }
3160
3161 #ifndef SOFTFLOAT_FOR_GCC
3162 /*
3163 -------------------------------------------------------------------------------
3164 Returns 1 if the double-precision floating-point value `a' is equal to the
3165 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3166 if either operand is a NaN.  Otherwise, the comparison is performed
3167 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3168 -------------------------------------------------------------------------------
3169 */
3170 flag float64_eq_signaling( float64 a, float64 b )
3171 {
3172
3173     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3174          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3175        ) {
3176         float_raise( float_flag_invalid );
3177         return 0;
3178     }
3179     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3180
3181 }
3182
3183 /*
3184 -------------------------------------------------------------------------------
3185 Returns 1 if the double-precision floating-point value `a' is less than or
3186 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3187 cause an exception.  Otherwise, the comparison is performed according to the
3188 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3189 -------------------------------------------------------------------------------
3190 */
3191 flag float64_le_quiet( float64 a, float64 b )
3192 {
3193     flag aSign, bSign;
3194
3195     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3196          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3197        ) {
3198         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3199             float_raise( float_flag_invalid );
3200         }
3201         return 0;
3202     }
3203     aSign = extractFloat64Sign( a );
3204     bSign = extractFloat64Sign( b );
3205     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3206     return ( a == b ) || ( aSign ^ ( a < b ) );
3207
3208 }
3209
3210 /*
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is less than
3213 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3214 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3215 Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3217 */
3218 flag float64_lt_quiet( float64 a, float64 b )
3219 {
3220     flag aSign, bSign;
3221
3222     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3223          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3224        ) {
3225         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3226             float_raise( float_flag_invalid );
3227         }
3228         return 0;
3229     }
3230     aSign = extractFloat64Sign( a );
3231     bSign = extractFloat64Sign( b );
3232     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3233     return ( a != b ) && ( aSign ^ ( a < b ) );
3234
3235 }
3236 #endif
3237
3238 #ifdef FLOATX80
3239
3240 /*
3241 -------------------------------------------------------------------------------
3242 Returns the result of converting the extended double-precision floating-
3243 point value `a' to the 32-bit two's complement integer format.  The
3244 conversion is performed according to the IEC/IEEE Standard for Binary
3245 Floating-Point Arithmetic---which means in particular that the conversion
3246 is rounded according to the current rounding mode.  If `a' is a NaN, the
3247 largest positive integer is returned.  Otherwise, if the conversion
3248 overflows, the largest integer with the same sign as `a' is returned.
3249 -------------------------------------------------------------------------------
3250 */
3251 int32 floatx80_to_int32( floatx80 a )
3252 {
3253     flag aSign;
3254     int32 aExp, shiftCount;
3255     bits64 aSig;
3256
3257     aSig = extractFloatx80Frac( a );
3258     aExp = extractFloatx80Exp( a );
3259     aSign = extractFloatx80Sign( a );
3260     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3261     shiftCount = 0x4037 - aExp;
3262     if ( shiftCount <= 0 ) shiftCount = 1;
3263     shift64RightJamming( aSig, shiftCount, &aSig );
3264     return roundAndPackInt32( aSign, aSig );
3265
3266 }
3267
3268 /*
3269 -------------------------------------------------------------------------------
3270 Returns the result of converting the extended double-precision floating-
3271 point value `a' to the 32-bit two's complement integer format.  The
3272 conversion is performed according to the IEC/IEEE Standard for Binary
3273 Floating-Point Arithmetic, except that the conversion is always rounded
3274 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3275 Otherwise, if the conversion overflows, the largest integer with the same
3276 sign as `a' is returned.
3277 -------------------------------------------------------------------------------
3278 */
3279 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3280 {
3281     flag aSign;
3282     int32 aExp, shiftCount;
3283     bits64 aSig, savedASig;
3284     int32 z;
3285
3286     aSig = extractFloatx80Frac( a );
3287     aExp = extractFloatx80Exp( a );
3288     aSign = extractFloatx80Sign( a );
3289     if ( 0x401E < aExp ) {
3290         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3291         goto invalid;
3292     }
3293     else if ( aExp < 0x3FFF ) {
3294         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3295         return 0;
3296     }
3297     shiftCount = 0x403E - aExp;
3298     savedASig = aSig;
3299     aSig >>= shiftCount;
3300     z = aSig;
3301     if ( aSign ) z = - z;
3302     if ( ( z < 0 ) ^ aSign ) {
3303  invalid:
3304         float_raise( float_flag_invalid );
3305         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3306     }
3307     if ( ( aSig<<shiftCount ) != savedASig ) {
3308         float_exception_flags |= float_flag_inexact;
3309     }
3310     return z;
3311
3312 }
3313
3314 /*
3315 -------------------------------------------------------------------------------
3316 Returns the result of converting the extended double-precision floating-
3317 point value `a' to the 64-bit two's complement integer format.  The
3318 conversion is performed according to the IEC/IEEE Standard for Binary
3319 Floating-Point Arithmetic---which means in particular that the conversion
3320 is rounded according to the current rounding mode.  If `a' is a NaN,
3321 the largest positive integer is returned.  Otherwise, if the conversion
3322 overflows, the largest integer with the same sign as `a' is returned.
3323 -------------------------------------------------------------------------------
3324 */
3325 int64 floatx80_to_int64( floatx80 a )
3326 {
3327     flag aSign;
3328     int32 aExp, shiftCount;
3329     bits64 aSig, aSigExtra;
3330
3331     aSig = extractFloatx80Frac( a );
3332     aExp = extractFloatx80Exp( a );
3333     aSign = extractFloatx80Sign( a );
3334     shiftCount = 0x403E - aExp;
3335     if ( shiftCount <= 0 ) {
3336         if ( shiftCount ) {
3337             float_raise( float_flag_invalid );
3338             if (    ! aSign
3339                  || (    ( aExp == 0x7FFF )
3340                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3341                ) {
3342                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3343             }
3344             return (sbits64) LIT64( 0x8000000000000000 );
3345         }
3346         aSigExtra = 0;
3347     }
3348     else {
3349         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3350     }
3351     return roundAndPackInt64( aSign, aSig, aSigExtra );
3352
3353 }
3354
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format.  The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic, except that the conversion is always rounded
3361 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3362 Otherwise, if the conversion overflows, the largest integer with the same
3363 sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
3366 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3367 {
3368     flag aSign;
3369     int32 aExp, shiftCount;
3370     bits64 aSig;
3371     int64 z;
3372
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     shiftCount = aExp - 0x403E;
3377     if ( 0 <= shiftCount ) {
3378         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3379         if ( ( a.high != 0xC03E ) || aSig ) {
3380             float_raise( float_flag_invalid );
3381             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3383             }
3384         }
3385         return (sbits64) LIT64( 0x8000000000000000 );
3386     }
3387     else if ( aExp < 0x3FFF ) {
3388         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3389         return 0;
3390     }
3391     z = aSig>>( - shiftCount );
3392     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3393         float_exception_flags |= float_flag_inexact;
3394     }
3395     if ( aSign ) z = - z;
3396     return z;
3397
3398 }
3399
3400 /*
3401 -------------------------------------------------------------------------------
3402 Returns the result of converting the extended double-precision floating-
3403 point value `a' to the single-precision floating-point format.  The
3404 conversion is performed according to the IEC/IEEE Standard for Binary
3405 Floating-Point Arithmetic.
3406 -------------------------------------------------------------------------------
3407 */
3408 float32 floatx80_to_float32( floatx80 a )
3409 {
3410     flag aSign;
3411     int32 aExp;
3412     bits64 aSig;
3413
3414     aSig = extractFloatx80Frac( a );
3415     aExp = extractFloatx80Exp( a );
3416     aSign = extractFloatx80Sign( a );
3417     if ( aExp == 0x7FFF ) {
3418         if ( (bits64) ( aSig<<1 ) ) {
3419             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3420         }
3421         return packFloat32( aSign, 0xFF, 0 );
3422     }
3423     shift64RightJamming( aSig, 33, &aSig );
3424     if ( aExp || aSig ) aExp -= 0x3F81;
3425     return roundAndPackFloat32( aSign, aExp, aSig );
3426
3427 }
3428
3429 /*
3430 -------------------------------------------------------------------------------
3431 Returns the result of converting the extended double-precision floating-
3432 point value `a' to the double-precision floating-point format.  The
3433 conversion is performed according to the IEC/IEEE Standard for Binary
3434 Floating-Point Arithmetic.
3435 -------------------------------------------------------------------------------
3436 */
3437 float64 floatx80_to_float64( floatx80 a )
3438 {
3439     flag aSign;
3440     int32 aExp;
3441     bits64 aSig, zSig;
3442
3443     aSig = extractFloatx80Frac( a );
3444     aExp = extractFloatx80Exp( a );
3445     aSign = extractFloatx80Sign( a );
3446     if ( aExp == 0x7FFF ) {
3447         if ( (bits64) ( aSig<<1 ) ) {
3448             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3449         }
3450         return packFloat64( aSign, 0x7FF, 0 );
3451     }
3452     shift64RightJamming( aSig, 1, &zSig );
3453     if ( aExp || aSig ) aExp -= 0x3C01;
3454     return roundAndPackFloat64( aSign, aExp, zSig );
3455
3456 }
3457
3458 #ifdef FLOAT128
3459
3460 /*
3461 -------------------------------------------------------------------------------
3462 Returns the result of converting the extended double-precision floating-
3463 point value `a' to the quadruple-precision floating-point format.  The
3464 conversion is performed according to the IEC/IEEE Standard for Binary
3465 Floating-Point Arithmetic.
3466 -------------------------------------------------------------------------------
3467 */
3468 float128 floatx80_to_float128( floatx80 a )
3469 {
3470     flag aSign;
3471     int16 aExp;
3472     bits64 aSig, zSig0, zSig1;
3473
3474     aSig = extractFloatx80Frac( a );
3475     aExp = extractFloatx80Exp( a );
3476     aSign = extractFloatx80Sign( a );
3477     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3478         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3479     }
3480     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3481     return packFloat128( aSign, aExp, zSig0, zSig1 );
3482
3483 }
3484
3485 #endif
3486
3487 /*
3488 -------------------------------------------------------------------------------
3489 Rounds the extended double-precision floating-point value `a' to an integer,
3490 and returns the result as an extended quadruple-precision floating-point
3491 value.  The operation is performed according to the IEC/IEEE Standard for
3492 Binary Floating-Point Arithmetic.
3493 -------------------------------------------------------------------------------
3494 */
3495 floatx80 floatx80_round_to_int( floatx80 a )
3496 {
3497     flag aSign;
3498     int32 aExp;
3499     bits64 lastBitMask, roundBitsMask;
3500     int8 roundingMode;
3501     floatx80 z;
3502
3503     aExp = extractFloatx80Exp( a );
3504     if ( 0x403E <= aExp ) {
3505         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3506             return propagateFloatx80NaN( a, a );
3507         }
3508         return a;
3509     }
3510     if ( aExp < 0x3FFF ) {
3511         if (    ( aExp == 0 )
3512              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3513             return a;
3514         }
3515         float_exception_flags |= float_flag_inexact;
3516         aSign = extractFloatx80Sign( a );
3517         switch ( float_rounding_mode ) {
3518          case float_round_nearest_even:
3519             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3520                ) {
3521                 return
3522                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3523             }
3524             break;
3525          case float_round_to_zero:
3526             break;
3527          case float_round_down:
3528             return
3529                   aSign ?
3530                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531                 : packFloatx80( 0, 0, 0 );
3532          case float_round_up:
3533             return
3534                   aSign ? packFloatx80( 1, 0, 0 )
3535                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536         }
3537         return packFloatx80( aSign, 0, 0 );
3538     }
3539     lastBitMask = 1;
3540     lastBitMask <<= 0x403E - aExp;
3541     roundBitsMask = lastBitMask - 1;
3542     z = a;
3543     roundingMode = float_rounding_mode;
3544     if ( roundingMode == float_round_nearest_even ) {
3545         z.low += lastBitMask>>1;
3546         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3547     }
3548     else if ( roundingMode != float_round_to_zero ) {
3549         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3550             z.low += roundBitsMask;
3551         }
3552     }
3553     z.low &= ~ roundBitsMask;
3554     if ( z.low == 0 ) {
3555         ++z.high;
3556         z.low = LIT64( 0x8000000000000000 );
3557     }
3558     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3559     return z;
3560
3561 }
3562
3563 /*
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3567 negated before being returned.  `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3571 */
3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3573 {
3574     int32 aExp, bExp, zExp;
3575     bits64 aSig, bSig, zSig0, zSig1;
3576     int32 expDiff;
3577
3578     aSig = extractFloatx80Frac( a );
3579     aExp = extractFloatx80Exp( a );
3580     bSig = extractFloatx80Frac( b );
3581     bExp = extractFloatx80Exp( b );
3582     expDiff = aExp - bExp;
3583     if ( 0 < expDiff ) {
3584         if ( aExp == 0x7FFF ) {
3585             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586             return a;
3587         }
3588         if ( bExp == 0 ) --expDiff;
3589         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590         zExp = aExp;
3591     }
3592     else if ( expDiff < 0 ) {
3593         if ( bExp == 0x7FFF ) {
3594             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3595             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596         }
3597         if ( aExp == 0 ) ++expDiff;
3598         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3599         zExp = bExp;
3600     }
3601     else {
3602         if ( aExp == 0x7FFF ) {
3603             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3604                 return propagateFloatx80NaN( a, b );
3605             }
3606             return a;
3607         }
3608         zSig1 = 0;
3609         zSig0 = aSig + bSig;
3610         if ( aExp == 0 ) {
3611             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3612             goto roundAndPack;
3613         }
3614         zExp = aExp;
3615         goto shiftRight1;
3616     }
3617     zSig0 = aSig + bSig;
3618     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3619  shiftRight1:
3620     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3621     zSig0 |= LIT64( 0x8000000000000000 );
3622     ++zExp;
3623  roundAndPack:
3624     return
3625         roundAndPackFloatx80(
3626             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3627
3628 }
3629
3630 /*
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3634 difference is negated before being returned.  `zSign' is ignored if the
3635 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3638 */
3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3640 {
3641     int32 aExp, bExp, zExp;
3642     bits64 aSig, bSig, zSig0, zSig1;
3643     int32 expDiff;
3644     floatx80 z;
3645
3646     aSig = extractFloatx80Frac( a );
3647     aExp = extractFloatx80Exp( a );
3648     bSig = extractFloatx80Frac( b );
3649     bExp = extractFloatx80Exp( b );
3650     expDiff = aExp - bExp;
3651     if ( 0 < expDiff ) goto aExpBigger;
3652     if ( expDiff < 0 ) goto bExpBigger;
3653     if ( aExp == 0x7FFF ) {
3654         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3655             return propagateFloatx80NaN( a, b );
3656         }
3657         float_raise( float_flag_invalid );
3658         z.low = floatx80_default_nan_low;
3659         z.high = floatx80_default_nan_high;
3660         return z;
3661     }
3662     if ( aExp == 0 ) {
3663         aExp = 1;
3664         bExp = 1;
3665     }
3666     zSig1 = 0;
3667     if ( bSig < aSig ) goto aBigger;
3668     if ( aSig < bSig ) goto bBigger;
3669     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3670  bExpBigger:
3671     if ( bExp == 0x7FFF ) {
3672         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3674     }
3675     if ( aExp == 0 ) ++expDiff;
3676     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3677  bBigger:
3678     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679     zExp = bExp;
3680     zSign ^= 1;
3681     goto normalizeRoundAndPack;
3682  aExpBigger:
3683     if ( aExp == 0x7FFF ) {
3684         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685         return a;
3686     }
3687     if ( bExp == 0 ) --expDiff;
3688     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3689  aBigger:
3690     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3691     zExp = aExp;
3692  normalizeRoundAndPack:
3693     return
3694         normalizeRoundAndPackFloatx80(
3695             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3696
3697 }
3698
3699 /*
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3705 */
3706 floatx80 floatx80_add( floatx80 a, floatx80 b )
3707 {
3708     flag aSign, bSign;
3709
3710     aSign = extractFloatx80Sign( a );
3711     bSign = extractFloatx80Sign( b );
3712     if ( aSign == bSign ) {
3713         return addFloatx80Sigs( a, b, aSign );
3714     }
3715     else {
3716         return subFloatx80Sigs( a, b, aSign );
3717     }
3718
3719 }
3720
3721 /*
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'.  The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3727 */
3728 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3729 {
3730     flag aSign, bSign;
3731
3732     aSign = extractFloatx80Sign( a );
3733     bSign = extractFloatx80Sign( b );
3734     if ( aSign == bSign ) {
3735         return subFloatx80Sigs( a, b, aSign );
3736     }
3737     else {
3738         return addFloatx80Sigs( a, b, aSign );
3739     }
3740
3741 }
3742
3743 /*
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'.  The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3749 */
3750 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3751 {
3752     flag aSign, bSign, zSign;
3753     int32 aExp, bExp, zExp;
3754     bits64 aSig, bSig, zSig0, zSig1;
3755     floatx80 z;
3756
3757     aSig = extractFloatx80Frac( a );
3758     aExp = extractFloatx80Exp( a );
3759     aSign = extractFloatx80Sign( a );
3760     bSig = extractFloatx80Frac( b );
3761     bExp = extractFloatx80Exp( b );
3762     bSign = extractFloatx80Sign( b );
3763     zSign = aSign ^ bSign;
3764     if ( aExp == 0x7FFF ) {
3765         if (    (bits64) ( aSig<<1 )
3766              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3767             return propagateFloatx80NaN( a, b );
3768         }
3769         if ( ( bExp | bSig ) == 0 ) goto invalid;
3770         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3771     }
3772     if ( bExp == 0x7FFF ) {
3773         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3774         if ( ( aExp | aSig ) == 0 ) {
3775  invalid:
3776             float_raise( float_flag_invalid );
3777             z.low = floatx80_default_nan_low;
3778             z.high = floatx80_default_nan_high;
3779             return z;
3780         }
3781         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3782     }
3783     if ( aExp == 0 ) {
3784         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3785         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3786     }
3787     if ( bExp == 0 ) {
3788         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3790     }
3791     zExp = aExp + bExp - 0x3FFE;
3792     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3793     if ( 0 < (sbits64) zSig0 ) {
3794         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3795         --zExp;
3796     }
3797     return
3798         roundAndPackFloatx80(
3799             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3800
3801 }
3802
3803 /*
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'.  The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3809 */
3810 floatx80 floatx80_div( floatx80 a, floatx80 b )
3811 {
3812     flag aSign, bSign, zSign;
3813     int32 aExp, bExp, zExp;
3814     bits64 aSig, bSig, zSig0, zSig1;
3815     bits64 rem0, rem1, rem2, term0, term1, term2;
3816     floatx80 z;
3817
3818     aSig = extractFloatx80Frac( a );
3819     aExp = extractFloatx80Exp( a );
3820     aSign = extractFloatx80Sign( a );
3821     bSig = extractFloatx80Frac( b );
3822     bExp = extractFloatx80Exp( b );
3823     bSign = extractFloatx80Sign( b );
3824     zSign = aSign ^ bSign;
3825     if ( aExp == 0x7FFF ) {
3826         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827         if ( bExp == 0x7FFF ) {
3828             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829             goto invalid;
3830         }
3831         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3832     }
3833     if ( bExp == 0x7FFF ) {
3834         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3835         return packFloatx80( zSign, 0, 0 );
3836     }
3837     if ( bExp == 0 ) {
3838         if ( bSig == 0 ) {
3839             if ( ( aExp | aSig ) == 0 ) {
3840  invalid:
3841                 float_raise( float_flag_invalid );
3842                 z.low = floatx80_default_nan_low;
3843                 z.high = floatx80_default_nan_high;
3844                 return z;
3845             }
3846             float_raise( float_flag_divbyzero );
3847             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3848         }
3849         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3850     }
3851     if ( aExp == 0 ) {
3852         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3853         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3854     }
3855     zExp = aExp - bExp + 0x3FFE;
3856     rem1 = 0;
3857     if ( bSig <= aSig ) {
3858         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859         ++zExp;
3860     }
3861     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3862     mul64To128( bSig, zSig0, &term0, &term1 );
3863     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3864     while ( (sbits64) rem0 < 0 ) {
3865         --zSig0;
3866         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3867     }
3868     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3869     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3870         mul64To128( bSig, zSig1, &term1, &term2 );
3871         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3872         while ( (sbits64) rem1 < 0 ) {
3873             --zSig1;
3874             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3875         }
3876         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3877     }
3878     return
3879         roundAndPackFloatx80(
3880             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3881
3882 }
3883
3884 /*
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'.  The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3890 */
3891 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3892 {
3893     flag aSign, bSign, zSign;
3894     int32 aExp, bExp, expDiff;
3895     bits64 aSig0, aSig1, bSig;
3896     bits64 q, term0, term1, alternateASig0, alternateASig1;
3897     floatx80 z;
3898
3899     aSig0 = extractFloatx80Frac( a );
3900     aExp = extractFloatx80Exp( a );
3901     aSign = extractFloatx80Sign( a );
3902     bSig = extractFloatx80Frac( b );
3903     bExp = extractFloatx80Exp( b );
3904     bSign = extractFloatx80Sign( b );
3905     if ( aExp == 0x7FFF ) {
3906         if (    (bits64) ( aSig0<<1 )
3907              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3908             return propagateFloatx80NaN( a, b );
3909         }
3910         goto invalid;
3911     }
3912     if ( bExp == 0x7FFF ) {
3913         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914         return a;
3915     }
3916     if ( bExp == 0 ) {
3917         if ( bSig == 0 ) {
3918  invalid:
3919             float_raise( float_flag_invalid );
3920             z.low = floatx80_default_nan_low;
3921             z.high = floatx80_default_nan_high;
3922             return z;
3923         }
3924         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3925     }
3926     if ( aExp == 0 ) {
3927         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3928         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3929     }
3930     bSig |= LIT64( 0x8000000000000000 );
3931     zSign = aSign;
3932     expDiff = aExp - bExp;
3933     aSig1 = 0;
3934     if ( expDiff < 0 ) {
3935         if ( expDiff < -1 ) return a;
3936         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937         expDiff = 0;
3938     }
3939     q = ( bSig <= aSig0 );
3940     if ( q ) aSig0 -= bSig;
3941     expDiff -= 64;
3942     while ( 0 < expDiff ) {
3943         q = estimateDiv128To64( aSig0, aSig1, bSig );
3944         q = ( 2 < q ) ? q - 2 : 0;
3945         mul64To128( bSig, q, &term0, &term1 );
3946         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3947         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3948         expDiff -= 62;
3949     }
3950     expDiff += 64;
3951     if ( 0 < expDiff ) {
3952         q = estimateDiv128To64( aSig0, aSig1, bSig );
3953         q = ( 2 < q ) ? q - 2 : 0;
3954         q >>= 64 - expDiff;
3955         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3956         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3957         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3958         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3959             ++q;
3960             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961         }
3962     }
3963     else {
3964         term1 = 0;
3965         term0 = bSig;
3966     }
3967     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3968     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3969          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970               && ( q & 1 ) )
3971        ) {
3972         aSig0 = alternateASig0;
3973         aSig1 = alternateASig1;
3974         zSign = ! zSign;
3975     }
3976     return
3977         normalizeRoundAndPackFloatx80(
3978             80, zSign, bExp + expDiff, aSig0, aSig1 );
3979
3980 }
3981
3982 /*
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'.  The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3988 */
3989 floatx80 floatx80_sqrt( floatx80 a )
3990 {
3991     flag aSign;
3992     int32 aExp, zExp;
3993     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3994     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995     floatx80 z;
3996
3997     aSig0 = extractFloatx80Frac( a );
3998     aExp = extractFloatx80Exp( a );
3999     aSign = extractFloatx80Sign( a );
4000     if ( aExp == 0x7FFF ) {
4001         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4002         if ( ! aSign ) return a;
4003         goto invalid;
4004     }
4005     if ( aSign ) {
4006         if ( ( aExp | aSig0 ) == 0 ) return a;
4007  invalid:
4008         float_raise( float_flag_invalid );
4009         z.low = floatx80_default_nan_low;
4010         z.high = floatx80_default_nan_high;
4011         return z;
4012     }
4013     if ( aExp == 0 ) {
4014         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4015         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4016     }
4017     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4018     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4019     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4020     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4021     doubleZSig0 = zSig0<<1;
4022     mul64To128( zSig0, zSig0, &term0, &term1 );
4023     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4024     while ( (sbits64) rem0 < 0 ) {
4025         --zSig0;
4026         doubleZSig0 -= 2;
4027         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4028     }
4029     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4030     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031         if ( zSig1 == 0 ) zSig1 = 1;
4032         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4033         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4034         mul64To128( zSig1, zSig1, &term2, &term3 );
4035         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4036         while ( (sbits64) rem1 < 0 ) {
4037             --zSig1;
4038             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4039             term3 |= 1;
4040             term2 |= doubleZSig0;
4041             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4042         }
4043         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4044     }
4045     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4046     zSig0 |= doubleZSig0;
4047     return
4048         roundAndPackFloatx80(
4049             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4050
4051 }
4052
4053 /*
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4058 Arithmetic.
4059 -------------------------------------------------------------------------------
4060 */
4061 flag floatx80_eq( floatx80 a, floatx80 b )
4062 {
4063
4064     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4065               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4066          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4067               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4068        ) {
4069         if (    floatx80_is_signaling_nan( a )
4070              || floatx80_is_signaling_nan( b ) ) {
4071             float_raise( float_flag_invalid );
4072         }
4073         return 0;
4074     }
4075     return
4076            ( a.low == b.low )
4077         && (    ( a.high == b.high )
4078              || (    ( a.low == 0 )
4079                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4080            );
4081
4082 }
4083
4084 /*
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise.  The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4091 */
4092 flag floatx80_le( floatx80 a, floatx80 b )
4093 {
4094     flag aSign, bSign;
4095
4096     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4097               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4098          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4099               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4100        ) {
4101         float_raise( float_flag_invalid );
4102         return 0;
4103     }
4104     aSign = extractFloatx80Sign( a );
4105     bSign = extractFloatx80Sign( b );
4106     if ( aSign != bSign ) {
4107         return
4108                aSign
4109             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4110                  == 0 );
4111     }
4112     return
4113           aSign ? le128( b.high, b.low, a.high, a.low )
4114         : le128( a.high, a.low, b.high, b.low );
4115
4116 }
4117
4118 /*
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise.  The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123 Arithmetic.
4124 -------------------------------------------------------------------------------
4125 */
4126 flag floatx80_lt( floatx80 a, floatx80 b )
4127 {
4128     flag aSign, bSign;
4129
4130     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4131               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4133               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134        ) {
4135         float_raise( float_flag_invalid );
4136         return 0;
4137     }
4138     aSign = extractFloatx80Sign( a );
4139     bSign = extractFloatx80Sign( b );
4140     if ( aSign != bSign ) {
4141         return
4142                aSign
4143             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144                  != 0 );
4145     }
4146     return
4147           aSign ? lt128( b.high, b.low, a.high, a.low )
4148         : lt128( a.high, a.low, b.high, b.low );
4149
4150 }
4151
4152 /*
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4156 raised if either operand is a NaN.  Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4159 */
4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4161 {
4162
4163     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4164               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4165          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4166               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4167        ) {
4168         float_raise( float_flag_invalid );
4169         return 0;
4170     }
4171     return
4172            ( a.low == b.low )
4173         && (    ( a.high == b.high )
4174              || (    ( a.low == 0 )
4175                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4176            );
4177
4178 }
4179
4180 /*
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4184 do not cause an exception.  Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4187 */
4188 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4189 {
4190     flag aSign, bSign;
4191
4192     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4193               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4194          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4195               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4196        ) {
4197         if (    floatx80_is_signaling_nan( a )
4198              || floatx80_is_signaling_nan( b ) ) {
4199             float_raise( float_flag_invalid );
4200         }
4201         return 0;
4202     }
4203     aSign = extractFloatx80Sign( a );
4204     bSign = extractFloatx80Sign( b );
4205     if ( aSign != bSign ) {
4206         return
4207                aSign
4208             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4209                  == 0 );
4210     }
4211     return
4212           aSign ? le128( b.high, b.low, a.high, a.low )
4213         : le128( a.high, a.low, b.high, b.low );
4214
4215 }
4216
4217 /*
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4221 an exception.  Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4224 */
4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4226 {
4227     flag aSign, bSign;
4228
4229     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4230               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4231          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4232               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4233        ) {
4234         if (    floatx80_is_signaling_nan( a )
4235              || floatx80_is_signaling_nan( b ) ) {
4236             float_raise( float_flag_invalid );
4237         }
4238         return 0;
4239     }
4240     aSign = extractFloatx80Sign( a );
4241     bSign = extractFloatx80Sign( b );
4242     if ( aSign != bSign ) {
4243         return
4244                aSign
4245             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246                  != 0 );
4247     }
4248     return
4249           aSign ? lt128( b.high, b.low, a.high, a.low )
4250         : lt128( a.high, a.low, b.high, b.low );
4251
4252 }
4253
4254 #endif
4255
4256 #ifdef FLOAT128
4257
4258 /*
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format.  The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode.  If `a' is a NaN, the largest
4265 positive integer is returned.  Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4268 */
4269 int32 float128_to_int32( float128 a )
4270 {
4271     flag aSign;
4272     int32 aExp, shiftCount;
4273     bits64 aSig0, aSig1;
4274
4275     aSig1 = extractFloat128Frac1( a );
4276     aSig0 = extractFloat128Frac0( a );
4277     aExp = extractFloat128Exp( a );
4278     aSign = extractFloat128Sign( a );
4279     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4280     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4281     aSig0 |= ( aSig1 != 0 );
4282     shiftCount = 0x4028 - aExp;
4283     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4284     return roundAndPackInt32( aSign, aSig0 );
4285
4286 }
4287
4288 /*
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format.  The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero.  If
4294 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4296 returned.
4297 -------------------------------------------------------------------------------
4298 */
4299 int32 float128_to_int32_round_to_zero( float128 a )
4300 {
4301     flag aSign;
4302     int32 aExp, shiftCount;
4303     bits64 aSig0, aSig1, savedASig;
4304     int32 z;
4305
4306     aSig1 = extractFloat128Frac1( a );
4307     aSig0 = extractFloat128Frac0( a );
4308     aExp = extractFloat128Exp( a );
4309     aSign = extractFloat128Sign( a );
4310     aSig0 |= ( aSig1 != 0 );
4311     if ( 0x401E < aExp ) {
4312         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313         goto invalid;
4314     }
4315     else if ( aExp < 0x3FFF ) {
4316         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4317         return 0;
4318     }
4319     aSig0 |= LIT64( 0x0001000000000000 );
4320     shiftCount = 0x402F - aExp;
4321     savedASig = aSig0;
4322     aSig0 >>= shiftCount;
4323     z = aSig0;
4324     if ( aSign ) z = - z;
4325     if ( ( z < 0 ) ^ aSign ) {
4326  invalid:
4327         float_raise( float_flag_invalid );
4328         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4329     }
4330     if ( ( aSig0<<shiftCount ) != savedASig ) {
4331         float_exception_flags |= float_flag_inexact;
4332     }
4333     return z;
4334
4335 }
4336
4337 /*
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format.  The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode.  If `a' is a NaN, the largest
4344 positive integer is returned.  Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4347 */
4348 int64 float128_to_int64( float128 a )
4349 {
4350     flag aSign;
4351     int32 aExp, shiftCount;
4352     bits64 aSig0, aSig1;
4353
4354     aSig1 = extractFloat128Frac1( a );
4355     aSig0 = extractFloat128Frac0( a );
4356     aExp = extractFloat128Exp( a );
4357     aSign = extractFloat128Sign( a );
4358     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4359     shiftCount = 0x402F - aExp;
4360     if ( shiftCount <= 0 ) {
4361         if ( 0x403E < aExp ) {
4362             float_raise( float_flag_invalid );
4363             if (    ! aSign
4364                  || (    ( aExp == 0x7FFF )
4365                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4366                     )
4367                ) {
4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4369             }
4370             return (sbits64) LIT64( 0x8000000000000000 );
4371         }
4372         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4373     }
4374     else {
4375         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4376     }
4377     return roundAndPackInt64( aSign, aSig0, aSig1 );
4378
4379 }
4380
4381 /*
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format.  The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4389 returned.
4390 -------------------------------------------------------------------------------
4391 */
4392 int64 float128_to_int64_round_to_zero( float128 a )
4393 {
4394     flag aSign;
4395     int32 aExp, shiftCount;
4396     bits64 aSig0, aSig1;
4397     int64 z;
4398
4399     aSig1 = extractFloat128Frac1( a );
4400     aSig0 = extractFloat128Frac0( a );
4401     aExp = extractFloat128Exp( a );
4402     aSign = extractFloat128Sign( a );
4403     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4404     shiftCount = aExp - 0x402F;
4405     if ( 0 < shiftCount ) {
4406         if ( 0x403E <= aExp ) {
4407             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4408             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4409                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4410                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4411             }
4412             else {
4413                 float_raise( float_flag_invalid );
4414                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4415                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4416                 }
4417             }
4418             return (sbits64) LIT64( 0x8000000000000000 );
4419         }
4420         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4421         if ( (bits64) ( aSig1<<shiftCount ) ) {
4422             float_exception_flags |= float_flag_inexact;
4423         }
4424     }
4425     else {
4426         if ( aExp < 0x3FFF ) {
4427             if ( aExp | aSig0 | aSig1 ) {
4428                 float_exception_flags |= float_flag_inexact;
4429             }
4430             return 0;
4431         }
4432         z = aSig0>>( - shiftCount );
4433         if (    aSig1
4434              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4435             float_exception_flags |= float_flag_inexact;
4436         }
4437     }
4438     if ( aSign ) z = - z;
4439     return z;
4440
4441 }
4442
4443 /*
4444  * just like above - but do not care for overflow of signed results
4445  */
4446 uint64 float128_to_uint64_round_to_zero( float128 a )
4447 {
4448     flag aSign;
4449     int32 aExp, shiftCount;
4450     bits64 aSig0, aSig1;
4451     uint64 z;
4452
4453     aSig1 = extractFloat128Frac1( a );
4454     aSig0 = extractFloat128Frac0( a );
4455     aExp = extractFloat128Exp( a );
4456     aSign = extractFloat128Sign( a );
4457     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4458     shiftCount = aExp - 0x402F;
4459     if ( 0 < shiftCount ) {
4460         if ( 0x403F <= aExp ) {
4461             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4462             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4463                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4464                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4465             }
4466             else {
4467                 float_raise( float_flag_invalid );
4468             }
4469             return LIT64( 0xFFFFFFFFFFFFFFFF );
4470         }
4471         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4472         if ( (bits64) ( aSig1<<shiftCount ) ) {
4473             float_exception_flags |= float_flag_inexact;
4474         }
4475     }
4476     else {
4477         if ( aExp < 0x3FFF ) {
4478             if ( aExp | aSig0 | aSig1 ) {
4479                 float_exception_flags |= float_flag_inexact;
4480             }
4481             return 0;
4482         }
4483         z = aSig0>>( - shiftCount );
4484         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4485             float_exception_flags |= float_flag_inexact;
4486         }
4487     }
4488     if ( aSign ) z = - z;
4489     return z;
4490
4491 }
4492
4493 /*
4494 -------------------------------------------------------------------------------
4495 Returns the result of converting the quadruple-precision floating-point
4496 value `a' to the single-precision floating-point format.  The conversion
4497 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4498 Arithmetic.
4499 -------------------------------------------------------------------------------
4500 */
4501 float32 float128_to_float32( float128 a )
4502 {
4503     flag aSign;
4504     int32 aExp;
4505     bits64 aSig0, aSig1;
4506     bits32 zSig;
4507
4508     aSig1 = extractFloat128Frac1( a );
4509     aSig0 = extractFloat128Frac0( a );
4510     aExp = extractFloat128Exp( a );
4511     aSign = extractFloat128Sign( a );
4512     if ( aExp == 0x7FFF ) {
4513         if ( aSig0 | aSig1 ) {
4514             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4515         }
4516         return packFloat32( aSign, 0xFF, 0 );
4517     }
4518     aSig0 |= ( aSig1 != 0 );
4519     shift64RightJamming( aSig0, 18, &aSig0 );
4520     zSig = aSig0;
4521     if ( aExp || zSig ) {
4522         zSig |= 0x40000000;
4523         aExp -= 0x3F81;
4524     }
4525     return roundAndPackFloat32( aSign, aExp, zSig );
4526
4527 }
4528
4529 /*
4530 -------------------------------------------------------------------------------
4531 Returns the result of converting the quadruple-precision floating-point
4532 value `a' to the double-precision floating-point format.  The conversion
4533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4534 Arithmetic.
4535 -------------------------------------------------------------------------------
4536 */
4537 float64 float128_to_float64( float128 a )
4538 {
4539     flag aSign;
4540     int32 aExp;
4541     bits64 aSig0, aSig1;
4542
4543     aSig1 = extractFloat128Frac1( a );
4544     aSig0 = extractFloat128Frac0( a );
4545     aExp = extractFloat128Exp( a );
4546     aSign = extractFloat128Sign( a );
4547     if ( aExp == 0x7FFF ) {
4548         if ( aSig0 | aSig1 ) {
4549             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4550         }
4551         return packFloat64( aSign, 0x7FF, 0 );
4552     }
4553     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4554     aSig0 |= ( aSig1 != 0 );
4555     if ( aExp || aSig0 ) {
4556         aSig0 |= LIT64( 0x4000000000000000 );
4557         aExp -= 0x3C01;
4558     }
4559     return roundAndPackFloat64( aSign, aExp, aSig0 );
4560
4561 }
4562
4563 #ifdef FLOATX80
4564
4565 /*
4566 -------------------------------------------------------------------------------
4567 Returns the result of converting the quadruple-precision floating-point
4568 value `a' to the extended double-precision floating-point format.  The
4569 conversion is performed according to the IEC/IEEE Standard for Binary
4570 Floating-Point Arithmetic.
4571 -------------------------------------------------------------------------------
4572 */
4573 floatx80 float128_to_floatx80( float128 a )
4574 {
4575     flag aSign;
4576     int32 aExp;
4577     bits64 aSig0, aSig1;
4578
4579     aSig1 = extractFloat128Frac1( a );
4580     aSig0 = extractFloat128Frac0( a );
4581     aExp = extractFloat128Exp( a );
4582     aSign = extractFloat128Sign( a );
4583     if ( aExp == 0x7FFF ) {
4584         if ( aSig0 | aSig1 ) {
4585             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4586         }
4587         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4588     }
4589     if ( aExp == 0 ) {
4590         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4591         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4592     }
4593     else {
4594         aSig0 |= LIT64( 0x0001000000000000 );
4595     }
4596     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4597     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4598
4599 }
4600
4601 #endif
4602
4603 /*
4604 -------------------------------------------------------------------------------
4605 Rounds the quadruple-precision floating-point value `a' to an integer, and
4606 returns the result as a quadruple-precision floating-point value.  The
4607 operation is performed according to the IEC/IEEE Standard for Binary
4608 Floating-Point Arithmetic.
4609 -------------------------------------------------------------------------------
4610 */
4611 float128 float128_round_to_int( float128 a )
4612 {
4613     flag aSign;
4614     int32 aExp;
4615     bits64 lastBitMask, roundBitsMask;
4616     int8 roundingMode;
4617     float128 z;
4618
4619     aExp = extractFloat128Exp( a );
4620     if ( 0x402F <= aExp ) {
4621         if ( 0x406F <= aExp ) {
4622             if (    ( aExp == 0x7FFF )
4623                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4624                ) {
4625                 return propagateFloat128NaN( a, a );
4626             }
4627             return a;
4628         }
4629         lastBitMask = 1;
4630         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4631         roundBitsMask = lastBitMask - 1;
4632         z = a;
4633         roundingMode = float_rounding_mode;
4634         if ( roundingMode == float_round_nearest_even ) {
4635             if ( lastBitMask ) {
4636                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4637                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4638             }
4639             else {
4640                 if ( (sbits64) z.low < 0 ) {
4641                     ++z.high;
4642                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4643                 }
4644             }
4645         }
4646         else if ( roundingMode != float_round_to_zero ) {
4647             if (   extractFloat128Sign( z )
4648                  ^ ( roundingMode == float_round_up ) ) {
4649                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4650             }
4651         }
4652         z.low &= ~ roundBitsMask;
4653     }
4654     else {
4655         if ( aExp < 0x3FFF ) {
4656             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4657             float_exception_flags |= float_flag_inexact;
4658             aSign = extractFloat128Sign( a );
4659             switch ( float_rounding_mode ) {
4660              case float_round_nearest_even:
4661                 if (    ( aExp == 0x3FFE )
4662                      && (   extractFloat128Frac0( a )
4663                           | extractFloat128Frac1( a ) )
4664                    ) {
4665                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4666                 }
4667                 break;
4668              case float_round_to_zero:
4669                 break;
4670              case float_round_down:
4671                 return
4672                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4673                     : packFloat128( 0, 0, 0, 0 );
4674              case float_round_up:
4675                 return
4676                       aSign ? packFloat128( 1, 0, 0, 0 )
4677                     : packFloat128( 0, 0x3FFF, 0, 0 );
4678             }
4679             return packFloat128( aSign, 0, 0, 0 );
4680         }
4681         lastBitMask = 1;
4682         lastBitMask <<= 0x402F - aExp;
4683         roundBitsMask = lastBitMask - 1;
4684         z.low = 0;
4685         z.high = a.high;
4686         roundingMode = float_rounding_mode;
4687         if ( roundingMode == float_round_nearest_even ) {
4688             z.high += lastBitMask>>1;
4689             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4690                 z.high &= ~ lastBitMask;
4691             }
4692         }
4693         else if ( roundingMode != float_round_to_zero ) {
4694             if (   extractFloat128Sign( z )
4695                  ^ ( roundingMode == float_round_up ) ) {
4696                 z.high |= ( a.low != 0 );
4697                 z.high += roundBitsMask;
4698             }
4699         }
4700         z.high &= ~ roundBitsMask;
4701     }
4702     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4703         float_exception_flags |= float_flag_inexact;
4704     }
4705     return z;
4706
4707 }
4708
4709 /*
4710 -------------------------------------------------------------------------------
4711 Returns the result of adding the absolute values of the quadruple-precision
4712 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4713 before being returned.  `zSign' is ignored if the result is a NaN.
4714 The addition is performed according to the IEC/IEEE Standard for Binary
4715 Floating-Point Arithmetic.
4716 -------------------------------------------------------------------------------
4717 */
4718 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4719 {
4720     int32 aExp, bExp, zExp;
4721     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4722     int32 expDiff;
4723
4724     aSig1 = extractFloat128Frac1( a );
4725     aSig0 = extractFloat128Frac0( a );
4726     aExp = extractFloat128Exp( a );
4727     bSig1 = extractFloat128Frac1( b );
4728     bSig0 = extractFloat128Frac0( b );
4729     bExp = extractFloat128Exp( b );
4730     expDiff = aExp - bExp;
4731     if ( 0 < expDiff ) {
4732         if ( aExp == 0x7FFF ) {
4733             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4734             return a;
4735         }
4736         if ( bExp == 0 ) {
4737             --expDiff;
4738         }
4739         else {
4740             bSig0 |= LIT64( 0x0001000000000000 );
4741         }
4742         shift128ExtraRightJamming(
4743             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4744         zExp = aExp;
4745     }
4746     else if ( expDiff < 0 ) {
4747         if ( bExp == 0x7FFF ) {
4748             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4749             return packFloat128( zSign, 0x7FFF, 0, 0 );
4750         }
4751         if ( aExp == 0 ) {
4752             ++expDiff;
4753         }
4754         else {
4755             aSig0 |= LIT64( 0x0001000000000000 );
4756         }
4757         shift128ExtraRightJamming(
4758             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4759         zExp = bExp;
4760     }
4761     else {
4762         if ( aExp == 0x7FFF ) {
4763             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4764                 return propagateFloat128NaN( a, b );
4765             }
4766             return a;
4767         }
4768         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4769         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4770         zSig2 = 0;
4771         zSig0 |= LIT64( 0x0002000000000000 );
4772         zExp = aExp;
4773         goto shiftRight1;
4774     }
4775     aSig0 |= LIT64( 0x0001000000000000 );
4776     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4777     --zExp;
4778     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4779     ++zExp;
4780  shiftRight1:
4781     shift128ExtraRightJamming(
4782         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4783  roundAndPack:
4784     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4785
4786 }
4787
4788 /*
4789 -------------------------------------------------------------------------------
4790 Returns the result of subtracting the absolute values of the quadruple-
4791 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4792 difference is negated before being returned.  `zSign' is ignored if the
4793 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4794 Standard for Binary Floating-Point Arithmetic.
4795 -------------------------------------------------------------------------------
4796 */
4797 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4798 {
4799     int32 aExp, bExp, zExp;
4800     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4801     int32 expDiff;
4802     float128 z;
4803
4804     aSig1 = extractFloat128Frac1( a );
4805     aSig0 = extractFloat128Frac0( a );
4806     aExp = extractFloat128Exp( a );
4807     bSig1 = extractFloat128Frac1( b );
4808     bSig0 = extractFloat128Frac0( b );
4809     bExp = extractFloat128Exp( b );
4810     expDiff = aExp - bExp;
4811     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4812     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4813     if ( 0 < expDiff ) goto aExpBigger;
4814     if ( expDiff < 0 ) goto bExpBigger;
4815     if ( aExp == 0x7FFF ) {
4816         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4817             return propagateFloat128NaN( a, b );
4818         }
4819         float_raise( float_flag_invalid );
4820         z.low = float128_default_nan_low;
4821         z.high = float128_default_nan_high;
4822         return z;
4823     }
4824     if ( aExp == 0 ) {
4825         aExp = 1;
4826         bExp = 1;
4827     }
4828     if ( bSig0 < aSig0 ) goto aBigger;
4829     if ( aSig0 < bSig0 ) goto bBigger;
4830     if ( bSig1 < aSig1 ) goto aBigger;
4831     if ( aSig1 < bSig1 ) goto bBigger;
4832     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4833  bExpBigger:
4834     if ( bExp == 0x7FFF ) {
4835         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4836         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4837     }
4838     if ( aExp == 0 ) {
4839         ++expDiff;
4840     }
4841     else {
4842         aSig0 |= LIT64( 0x4000000000000000 );
4843     }
4844     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4845     bSig0 |= LIT64( 0x4000000000000000 );
4846  bBigger:
4847     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4848     zExp = bExp;
4849     zSign ^= 1;
4850     goto normalizeRoundAndPack;
4851  aExpBigger:
4852     if ( aExp == 0x7FFF ) {
4853         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4854         return a;
4855     }
4856     if ( bExp == 0 ) {
4857         --expDiff;
4858     }
4859     else {
4860         bSig0 |= LIT64( 0x4000000000000000 );
4861     }
4862     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4863     aSig0 |= LIT64( 0x4000000000000000 );
4864  aBigger:
4865     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4866     zExp = aExp;
4867  normalizeRoundAndPack:
4868     --zExp;
4869     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4870
4871 }
4872
4873 /*
4874 -------------------------------------------------------------------------------
4875 Returns the result of adding the quadruple-precision floating-point values
4876 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4877 for Binary Floating-Point Arithmetic.
4878 -------------------------------------------------------------------------------
4879 */
4880 float128 float128_add( float128 a, float128 b )
4881 {
4882     flag aSign, bSign;
4883
4884     aSign = extractFloat128Sign( a );
4885     bSign = extractFloat128Sign( b );
4886     if ( aSign == bSign ) {
4887         return addFloat128Sigs( a, b, aSign );
4888     }
4889     else {
4890         return subFloat128Sigs( a, b, aSign );
4891     }
4892
4893 }
4894
4895 /*
4896 -------------------------------------------------------------------------------
4897 Returns the result of subtracting the quadruple-precision floating-point
4898 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4899 Standard for Binary Floating-Point Arithmetic.
4900 -------------------------------------------------------------------------------
4901 */
4902 float128 float128_sub( float128 a, float128 b )
4903 {
4904     flag aSign, bSign;
4905
4906     aSign = extractFloat128Sign( a );
4907     bSign = extractFloat128Sign( b );
4908     if ( aSign == bSign ) {
4909         return subFloat128Sigs( a, b, aSign );
4910     }
4911     else {
4912         return addFloat128Sigs( a, b, aSign );
4913     }
4914
4915 }
4916
4917 /*
4918 -------------------------------------------------------------------------------
4919 Returns the result of multiplying the quadruple-precision floating-point
4920 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4921 Standard for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4923 */
4924 float128 float128_mul( float128 a, float128 b )
4925 {
4926     flag aSign, bSign, zSign;
4927     int32 aExp, bExp, zExp;
4928     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4929     float128 z;
4930
4931     aSig1 = extractFloat128Frac1( a );
4932     aSig0 = extractFloat128Frac0( a );
4933     aExp = extractFloat128Exp( a );
4934     aSign = extractFloat128Sign( a );
4935     bSig1 = extractFloat128Frac1( b );
4936     bSig0 = extractFloat128Frac0( b );
4937     bExp = extractFloat128Exp( b );
4938     bSign = extractFloat128Sign( b );
4939     zSign = aSign ^ bSign;
4940     if ( aExp == 0x7FFF ) {
4941         if (    ( aSig0 | aSig1 )
4942              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4943             return propagateFloat128NaN( a, b );
4944         }
4945         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4946         return packFloat128( zSign, 0x7FFF, 0, 0 );
4947     }
4948     if ( bExp == 0x7FFF ) {
4949         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4950         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4951  invalid:
4952             float_raise( float_flag_invalid );
4953             z.low = float128_default_nan_low;
4954             z.high = float128_default_nan_high;
4955             return z;
4956         }
4957         return packFloat128( zSign, 0x7FFF, 0, 0 );
4958     }
4959     if ( aExp == 0 ) {
4960         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4961         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4962     }
4963     if ( bExp == 0 ) {
4964         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4965         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4966     }
4967     zExp = aExp + bExp - 0x4000;
4968     aSig0 |= LIT64( 0x0001000000000000 );
4969     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4970     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4971     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4972     zSig2 |= ( zSig3 != 0 );
4973     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4974         shift128ExtraRightJamming(
4975             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4976         ++zExp;
4977     }
4978     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4979
4980 }
4981
4982 /*
4983 -------------------------------------------------------------------------------
4984 Returns the result of dividing the quadruple-precision floating-point value
4985 `a' by the corresponding value `b'.  The operation is performed according to
4986 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4987 -------------------------------------------------------------------------------
4988 */
4989 float128 float128_div( float128 a, float128 b )
4990 {
4991     flag aSign, bSign, zSign;
4992     int32 aExp, bExp, zExp;
4993     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4994     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4995     float128 z;
4996
4997     aSig1 = extractFloat128Frac1( a );
4998     aSig0 = extractFloat128Frac0( a );
4999     aExp = extractFloat128Exp( a );
5000     aSign = extractFloat128Sign( a );
5001     bSig1 = extractFloat128Frac1( b );
5002     bSig0 = extractFloat128Frac0( b );
5003     bExp = extractFloat128Exp( b );
5004     bSign = extractFloat128Sign( b );
5005     zSign = aSign ^ bSign;
5006     if ( aExp == 0x7FFF ) {
5007         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5008         if ( bExp == 0x7FFF ) {
5009             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5010             goto invalid;
5011         }
5012         return packFloat128( zSign, 0x7FFF, 0, 0 );
5013     }
5014     if ( bExp == 0x7FFF ) {
5015         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5016         return packFloat128( zSign, 0, 0, 0 );
5017     }
5018     if ( bExp == 0 ) {
5019         if ( ( bSig0 | bSig1 ) == 0 ) {
5020             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5021  invalid:
5022                 float_raise( float_flag_invalid );
5023                 z.low = float128_default_nan_low;
5024                 z.high = float128_default_nan_high;
5025                 return z;
5026             }
5027             float_raise( float_flag_divbyzero );
5028             return packFloat128( zSign, 0x7FFF, 0, 0 );
5029         }
5030         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5031     }
5032     if ( aExp == 0 ) {
5033         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5034         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5035     }
5036     zExp = aExp - bExp + 0x3FFD;
5037     shortShift128Left(
5038         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5039     shortShift128Left(
5040         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5041     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5042         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5043         ++zExp;
5044     }
5045     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5046     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5047     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5048     while ( (sbits64) rem0 < 0 ) {
5049         --zSig0;
5050         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5051     }
5052     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5053     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5054         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5055         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5056         while ( (sbits64) rem1 < 0 ) {
5057             --zSig1;
5058             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5059         }
5060         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5061     }
5062     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5063     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5064
5065 }
5066
5067 /*
5068 -------------------------------------------------------------------------------
5069 Returns the remainder of the quadruple-precision floating-point value `a'
5070 with respect to the corresponding value `b'.  The operation is performed
5071 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5072 -------------------------------------------------------------------------------
5073 */
5074 float128 float128_rem( float128 a, float128 b )
5075 {
5076     flag aSign, bSign, zSign;
5077     int32 aExp, bExp, expDiff;
5078     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5079     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5080     sbits64 sigMean0;
5081     float128 z;
5082
5083     aSig1 = extractFloat128Frac1( a );
5084     aSig0 = extractFloat128Frac0( a );
5085     aExp = extractFloat128Exp( a );
5086     aSign = extractFloat128Sign( a );
5087     bSig1 = extractFloat128Frac1( b );
5088     bSig0 = extractFloat128Frac0( b );
5089     bExp = extractFloat128Exp( b );
5090     bSign = extractFloat128Sign( b );
5091     if ( aExp == 0x7FFF ) {
5092         if (    ( aSig0 | aSig1 )
5093              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5094             return propagateFloat128NaN( a, b );
5095         }
5096         goto invalid;
5097     }
5098     if ( bExp == 0x7FFF ) {
5099         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5100         return a;
5101     }
5102     if ( bExp == 0 ) {
5103         if ( ( bSig0 | bSig1 ) == 0 ) {
5104  invalid:
5105             float_raise( float_flag_invalid );
5106             z.low = float128_default_nan_low;
5107             z.high = float128_default_nan_high;
5108             return z;
5109         }
5110         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5111     }
5112     if ( aExp == 0 ) {
5113         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5114         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5115     }
5116     expDiff = aExp - bExp;
5117     if ( expDiff < -1 ) return a;
5118     shortShift128Left(
5119         aSig0 | LIT64( 0x0001000000000000 ),
5120         aSig1,
5121         15 - ( expDiff < 0 ),
5122         &aSig0,
5123         &aSig1
5124     );
5125     shortShift128Left(
5126         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5127     q = le128( bSig0, bSig1, aSig0, aSig1 );
5128     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5129     expDiff -= 64;
5130     while ( 0 < expDiff ) {
5131         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5132         q = ( 4 < q ) ? q - 4 : 0;
5133         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5134         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5135         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5136         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5137         expDiff -= 61;
5138     }
5139     if ( -64 < expDiff ) {
5140         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5141         q = ( 4 < q ) ? q - 4 : 0;
5142         q >>= - expDiff;
5143         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5144         expDiff += 52;
5145         if ( expDiff < 0 ) {
5146             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5147         }
5148         else {
5149             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5150         }
5151         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5152         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5153     }
5154     else {
5155         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5156         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5157     }
5158     do {
5159         alternateASig0 = aSig0;
5160         alternateASig1 = aSig1;
5161         ++q;
5162         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5163     } while ( 0 <= (sbits64) aSig0 );
5164     add128(
5165         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5166     if (    ( sigMean0 < 0 )
5167          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5168         aSig0 = alternateASig0;
5169         aSig1 = alternateASig1;
5170     }
5171     zSign = ( (sbits64) aSig0 < 0 );
5172     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5173     return
5174         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5175
5176 }
5177
5178 /*
5179 -------------------------------------------------------------------------------
5180 Returns the square root of the quadruple-precision floating-point value `a'.
5181 The operation is performed according to the IEC/IEEE Standard for Binary
5182 Floating-Point Arithmetic.
5183 -------------------------------------------------------------------------------
5184 */
5185 float128 float128_sqrt( float128 a )
5186 {
5187     flag aSign;
5188     int32 aExp, zExp;
5189     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5190     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5191     float128 z;
5192
5193     aSig1 = extractFloat128Frac1( a );
5194     aSig0 = extractFloat128Frac0( a );
5195     aExp = extractFloat128Exp( a );
5196     aSign = extractFloat128Sign( a );
5197     if ( aExp == 0x7FFF ) {
5198         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5199         if ( ! aSign ) return a;
5200         goto invalid;
5201     }
5202     if ( aSign ) {
5203         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5204  invalid:
5205         float_raise( float_flag_invalid );
5206         z.low = float128_default_nan_low;
5207         z.high = float128_default_nan_high;
5208         return z;
5209     }
5210     if ( aExp == 0 ) {
5211         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5212         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5213     }
5214     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5215     aSig0 |= LIT64( 0x0001000000000000 );
5216     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5217     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5218     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5219     doubleZSig0 = zSig0<<1;
5220     mul64To128( zSig0, zSig0, &term0, &term1 );
5221     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5222     while ( (sbits64) rem0 < 0 ) {
5223         --zSig0;
5224         doubleZSig0 -= 2;
5225         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5226     }
5227     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5228     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5229         if ( zSig1 == 0 ) zSig1 = 1;
5230         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5231         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5232         mul64To128( zSig1, zSig1, &term2, &term3 );
5233         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5234         while ( (sbits64) rem1 < 0 ) {
5235             --zSig1;
5236             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5237             term3 |= 1;
5238             term2 |= doubleZSig0;
5239             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5240         }
5241         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5242     }
5243     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5244     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5245
5246 }
5247
5248 /*
5249 -------------------------------------------------------------------------------
5250 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5251 the corresponding value `b', and 0 otherwise.  The comparison is performed
5252 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5253 -------------------------------------------------------------------------------
5254 */
5255 flag float128_eq( float128 a, float128 b )
5256 {
5257
5258     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5259               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5260          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5261               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5262        ) {
5263         if (    float128_is_signaling_nan( a )
5264              || float128_is_signaling_nan( b ) ) {
5265             float_raise( float_flag_invalid );
5266         }
5267         return 0;
5268     }
5269     return
5270            ( a.low == b.low )
5271         && (    ( a.high == b.high )
5272              || (    ( a.low == 0 )
5273                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5274            );
5275
5276 }
5277
5278 /*
5279 -------------------------------------------------------------------------------
5280 Returns 1 if the quadruple-precision floating-point value `a' is less than
5281 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5282 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5283 Arithmetic.
5284 -------------------------------------------------------------------------------
5285 */
5286 flag float128_le( float128 a, float128 b )
5287 {
5288     flag aSign, bSign;
5289
5290     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5291               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5292          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5293               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5294        ) {
5295         float_raise( float_flag_invalid );
5296         return 0;
5297     }
5298     aSign = extractFloat128Sign( a );
5299     bSign = extractFloat128Sign( b );
5300     if ( aSign != bSign ) {
5301         return
5302                aSign
5303             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5304                  == 0 );
5305     }
5306     return
5307           aSign ? le128( b.high, b.low, a.high, a.low )
5308         : le128( a.high, a.low, b.high, b.low );
5309
5310 }
5311
5312 /*
5313 -------------------------------------------------------------------------------
5314 Returns 1 if the quadruple-precision floating-point value `a' is less than
5315 the corresponding value `b', and 0 otherwise.  The comparison is performed
5316 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5317 -------------------------------------------------------------------------------
5318 */
5319 flag float128_lt( float128 a, float128 b )
5320 {
5321     flag aSign, bSign;
5322
5323     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5324               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5326               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5327        ) {
5328         float_raise( float_flag_invalid );
5329         return 0;
5330     }
5331     aSign = extractFloat128Sign( a );
5332     bSign = extractFloat128Sign( b );
5333     if ( aSign != bSign ) {
5334         return
5335                aSign
5336             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5337                  != 0 );
5338     }
5339     return
5340           aSign ? lt128( b.high, b.low, a.high, a.low )
5341         : lt128( a.high, a.low, b.high, b.low );
5342
5343 }
5344
5345 /*
5346 -------------------------------------------------------------------------------
5347 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5348 the corresponding value `b', and 0 otherwise.  The invalid exception is
5349 raised if either operand is a NaN.  Otherwise, the comparison is performed
5350 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5351 -------------------------------------------------------------------------------
5352 */
5353 flag float128_eq_signaling( float128 a, float128 b )
5354 {
5355
5356     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5357               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5358          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5359               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5360        ) {
5361         float_raise( float_flag_invalid );
5362         return 0;
5363     }
5364     return
5365            ( a.low == b.low )
5366         && (    ( a.high == b.high )
5367              || (    ( a.low == 0 )
5368                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5369            );
5370
5371 }
5372
5373 /*
5374 -------------------------------------------------------------------------------
5375 Returns 1 if the quadruple-precision floating-point value `a' is less than
5376 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5377 cause an exception.  Otherwise, the comparison is performed according to the
5378 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5379 -------------------------------------------------------------------------------
5380 */
5381 flag float128_le_quiet( float128 a, float128 b )
5382 {
5383     flag aSign, bSign;
5384
5385     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5386               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5387          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5388               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5389        ) {
5390         if (    float128_is_signaling_nan( a )
5391              || float128_is_signaling_nan( b ) ) {
5392             float_raise( float_flag_invalid );
5393         }
5394         return 0;
5395     }
5396     aSign = extractFloat128Sign( a );
5397     bSign = extractFloat128Sign( b );
5398     if ( aSign != bSign ) {
5399         return
5400                aSign
5401             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5402                  == 0 );
5403     }
5404     return
5405           aSign ? le128( b.high, b.low, a.high, a.low )
5406         : le128( a.high, a.low, b.high, b.low );
5407
5408 }
5409
5410 /*
5411 -------------------------------------------------------------------------------
5412 Returns 1 if the quadruple-precision floating-point value `a' is less than
5413 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5414 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5415 Standard for Binary Floating-Point Arithmetic.
5416 -------------------------------------------------------------------------------
5417 */
5418 flag float128_lt_quiet( float128 a, float128 b )
5419 {
5420     flag aSign, bSign;
5421
5422     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5423               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5424          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5425               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5426        ) {
5427         if (    float128_is_signaling_nan( a )
5428              || float128_is_signaling_nan( b ) ) {
5429             float_raise( float_flag_invalid );
5430         }
5431         return 0;
5432     }
5433     aSign = extractFloat128Sign( a );
5434     bSign = extractFloat128Sign( b );
5435     if ( aSign != bSign ) {
5436         return
5437                aSign
5438             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5439                  != 0 );
5440     }
5441     return
5442           aSign ? lt128( b.high, b.low, a.high, a.low )
5443         : lt128( a.high, a.low, b.high, b.low );
5444
5445 }
5446
5447 #endif
5448
5449
5450 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5451
5452 /*
5453  * These two routines are not part of the original softfloat distribution.
5454  *
5455  * They are based on the corresponding conversions to integer but return
5456  * unsigned numbers instead since these functions are required by GCC.
5457  *
5458  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5459  *
5460  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5461  */
5462
5463 /*
5464 -------------------------------------------------------------------------------
5465 Returns the result of converting the double-precision floating-point value
5466 `a' to the 32-bit unsigned integer format.  The conversion is
5467 performed according to the IEC/IEEE Standard for Binary Floating-point
5468 Arithmetic, except that the conversion is always rounded toward zero.  If
5469 `a' is a NaN, the largest positive integer is returned.  If the conversion
5470 overflows, the largest integer positive is returned.
5471 -------------------------------------------------------------------------------
5472 */
5473 uint32 float64_to_uint32_round_to_zero( float64 a )
5474 {
5475     flag aSign;
5476     int16 aExp, shiftCount;
5477     bits64 aSig, savedASig;
5478     uint32 z;
5479
5480     aSig = extractFloat64Frac( a );
5481     aExp = extractFloat64Exp( a );
5482     aSign = extractFloat64Sign( a );
5483
5484     if (aSign) {
5485         float_raise( float_flag_invalid );
5486         return(0);
5487     }
5488
5489     if ( 0x41E < aExp ) {
5490         float_raise( float_flag_invalid );
5491         return 0xffffffff;
5492     }
5493     else if ( aExp < 0x3FF ) {
5494         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5495         return 0;
5496     }
5497     aSig |= LIT64( 0x0010000000000000 );
5498     shiftCount = 0x433 - aExp;
5499     savedASig = aSig;
5500     aSig >>= shiftCount;
5501     z = aSig;
5502     if ( ( aSig<<shiftCount ) != savedASig ) {
5503         float_exception_flags |= float_flag_inexact;
5504     }
5505     return z;
5506
5507 }
5508
5509 /*
5510 -------------------------------------------------------------------------------
5511 Returns the result of converting the single-precision floating-point value
5512 `a' to the 32-bit unsigned integer format.  The conversion is
5513 performed according to the IEC/IEEE Standard for Binary Floating-point
5514 Arithmetic, except that the conversion is always rounded toward zero.  If
5515 `a' is a NaN, the largest positive integer is returned.  If the conversion
5516 overflows, the largest positive integer is returned.
5517 -------------------------------------------------------------------------------
5518 */
5519 uint32 float32_to_uint32_round_to_zero( float32 a )
5520 {
5521     flag aSign;
5522     int16 aExp, shiftCount;
5523     bits32 aSig;
5524     uint32 z;
5525
5526     aSig = extractFloat32Frac( a );
5527     aExp = extractFloat32Exp( a );
5528     aSign = extractFloat32Sign( a );
5529     shiftCount = aExp - 0x9E;
5530
5531     if (aSign) {
5532         float_raise( float_flag_invalid );
5533         return(0);
5534     }
5535     if ( 0 < shiftCount ) {
5536         float_raise( float_flag_invalid );
5537         return 0xFFFFFFFF;
5538     }
5539     else if ( aExp <= 0x7E ) {
5540         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5541         return 0;
5542     }
5543     aSig = ( aSig | 0x800000 )<<8;
5544     z = aSig>>( - shiftCount );
5545     if ( aSig<<( shiftCount & 31 ) ) {
5546         float_exception_flags |= float_flag_inexact;
5547     }
5548     return z;
5549
5550 }
5551
5552 #endif