sys/lib/libkern/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* If you need this in a boot program, you have bigger problems... */
  48 #ifndef _STANDALONE
  49
  50 #include <sys/cdefs.h>
  51 #if defined(LIBC_SCCS) && !defined(lint)
  52 __RCSID("$NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $");
  53 #endif /* LIBC_SCCS and not lint */
  54
  55 #ifdef SOFTFLOAT_FOR_GCC
  56 #include "softfloat-for-gcc.h"
  57 #endif
  58
  59 #include "milieu.h"
  60 #include "softfloat.h"
  61
  62 /*
  63  * Conversions between floats as stored in memory and floats as
  64  * SoftFloat uses them
  65  */
  66 #ifndef FLOAT64_DEMANGLE
  67 #define FLOAT64_DEMANGLE(a)     (a)
  68 #endif
  69 #ifndef FLOAT64_MANGLE
  70 #define FLOAT64_MANGLE(a)       (a)
  71 #endif
  72
  73 /*
  74 -------------------------------------------------------------------------------
  75 Floating-point rounding mode, extended double-precision rounding precision,
  76 and exception flags.
  77 -------------------------------------------------------------------------------
  78 */
  79
  80 /*
  81  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
  82  *      Right now, it does not.  I've removed all other dynamic global
  83  *      variables. [ross]
  84  */
  85 #ifdef FLOATX80
  86 int8 floatx80_rounding_precision = 80;
  87 #endif
  88
  89 /*
  90 -------------------------------------------------------------------------------
  91 Primitive arithmetic functions, including multi-word arithmetic, and
  92 division and square root approximations.  (Can be specialized to target if
  93 desired.)
  94 -------------------------------------------------------------------------------
  95 */
  96 #include "softfloat-macros.h"
  97
  98 /*
  99 -------------------------------------------------------------------------------
 100 Functions and definitions to determine:  (1) whether tininess for underflow
 101 is detected before or after rounding by default, (2) what (if anything)
 102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
 103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 104 are propagated from function inputs to output.  These details are target-
 105 specific.
 106 -------------------------------------------------------------------------------
 107 */
 108 #include "softfloat-specialize.h"
 109
 110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
 111 /*
 112 -------------------------------------------------------------------------------
 113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 114 and 7, and returns the properly rounded 32-bit integer corresponding to the
 115 input.  If `zSign' is 1, the input is negated before being converted to an
 116 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 117 is simply rounded to an integer, with the inexact exception raised if the
 118 input cannot be represented exactly as an integer.  However, if the fixed-
 119 point input is too large, the invalid exception is raised and the largest
 120 positive or negative integer is returned.
 121 -------------------------------------------------------------------------------
 122 */
 123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 124 {
 125     int8 roundingMode;
 126     flag roundNearestEven;
 127     int8 roundIncrement, roundBits;
 128     int32 z;
 129
 130     roundingMode = float_rounding_mode();
 131     roundNearestEven = ( roundingMode == float_round_nearest_even );
 132     roundIncrement = 0x40;
 133     if ( ! roundNearestEven ) {
 134         if ( roundingMode == float_round_to_zero ) {
 135             roundIncrement = 0;
 136         }
 137         else {
 138             roundIncrement = 0x7F;
 139             if ( zSign ) {
 140                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 141             }
 142             else {
 143                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 144             }
 145         }
 146     }
 147     roundBits = absZ & 0x7F;
 148     absZ = ( absZ + roundIncrement )>>7;
 149     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 150     z = absZ;
 151     if ( zSign ) z = - z;
 152     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 153         float_raise( float_flag_invalid );
 154         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 155     }
 156     if ( roundBits ) float_set_inexact();
 157     return z;
 158
 159 }
 160
 161 /*
 162 -------------------------------------------------------------------------------
 163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 164 `absZ1', with binary point between bits 63 and 64 (between the input words),
 165 and returns the properly rounded 64-bit integer corresponding to the input.
 166 If `zSign' is 1, the input is negated before being converted to an integer.
 167 Ordinarily, the fixed-point input is simply rounded to an integer, with
 168 the inexact exception raised if the input cannot be represented exactly as
 169 an integer.  However, if the fixed-point input is too large, the invalid
 170 exception is raised and the largest positive or negative integer is
 171 returned.
 172 -------------------------------------------------------------------------------
 173 */
 174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 175 {
 176     int8 roundingMode;
 177     flag roundNearestEven, increment;
 178     int64 z;
 179
 180     roundingMode = float_rounding_mode();
 181     roundNearestEven = ( roundingMode == float_round_nearest_even );
 182     increment = ( (sbits64) absZ1 < 0 );
 183     if ( ! roundNearestEven ) {
 184         if ( roundingMode == float_round_to_zero ) {
 185             increment = 0;
 186         }
 187         else {
 188             if ( zSign ) {
 189                 increment = ( roundingMode == float_round_down ) && absZ1;
 190             }
 191             else {
 192                 increment = ( roundingMode == float_round_up ) && absZ1;
 193             }
 194         }
 195     }
 196     if ( increment ) {
 197         ++absZ0;
 198         if ( absZ0 == 0 ) goto overflow;
 199         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 200     }
 201     z = absZ0;
 202     if ( zSign ) z = - z;
 203     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 204  overflow:
 205         float_raise( float_flag_invalid );
 206         return
 207               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 208             : LIT64( 0x7FFFFFFFFFFFFFFF );
 209     }
 210     if ( absZ1 ) float_set_inexact();
 211     return z;
 212
 213 }
 214 #endif
 215
 216 /*
 217 -------------------------------------------------------------------------------
 218 Returns the fraction bits of the single-precision floating-point value `a'.
 219 -------------------------------------------------------------------------------
 220 */
 221 INLINE bits32 extractFloat32Frac( float32 a )
 222 {
 223
 224     return a & 0x007FFFFF;
 225
 226 }
 227
 228 /*
 229 -------------------------------------------------------------------------------
 230 Returns the exponent bits of the single-precision floating-point value `a'.
 231 -------------------------------------------------------------------------------
 232 */
 233 INLINE int16 extractFloat32Exp( float32 a )
 234 {
 235
 236     return ( a>>23 ) & 0xFF;
 237
 238 }
 239
 240 /*
 241 -------------------------------------------------------------------------------
 242 Returns the sign bit of the single-precision floating-point value `a'.
 243 -------------------------------------------------------------------------------
 244 */
 245 INLINE flag extractFloat32Sign( float32 a )
 246 {
 247
 248     return a>>31;
 249
 250 }
 251
 252 /*
 253 -------------------------------------------------------------------------------
 254 Normalizes the subnormal single-precision floating-point value represented
 255 by the denormalized significand `aSig'.  The normalized exponent and
 256 significand are stored at the locations pointed to by `zExpPtr' and
 257 `zSigPtr', respectively.
 258 -------------------------------------------------------------------------------
 259 */
 260 static void
 261  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 262 {
 263     int8 shiftCount;
 264
 265     shiftCount = countLeadingZeros32( aSig ) - 8;
 266     *zSigPtr = aSig<<shiftCount;
 267     *zExpPtr = 1 - shiftCount;
 268
 269 }
 270
 271 /*
 272 -------------------------------------------------------------------------------
 273 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274 single-precision floating-point value, returning the result.  After being
 275 shifted into the proper positions, the three fields are simply added
 276 together to form the result.  This means that any integer portion of `zSig'
 277 will be added into the exponent.  Since a properly normalized significand
 278 will have an integer portion equal to 1, the `zExp' input should be 1 less
 279 than the desired result exponent whenever `zSig' is a complete, normalized
 280 significand.
 281 -------------------------------------------------------------------------------
 282 */
 283 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 284 {
 285
 286     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 287
 288 }
 289
 290 /*
 291 -------------------------------------------------------------------------------
 292 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293 and significand `zSig', and returns the proper single-precision floating-
 294 point value corresponding to the abstract input.  Ordinarily, the abstract
 295 value is simply rounded and packed into the single-precision format, with
 296 the inexact exception raised if the abstract input cannot be represented
 297 exactly.  However, if the abstract value is too large, the overflow and
 298 inexact exceptions are raised and an infinity or maximal finite value is
 299 returned.  If the abstract value is too small, the input value is rounded to
 300 a subnormal number, and the underflow and inexact exceptions are raised if
 301 the abstract input cannot be represented exactly as a subnormal single-
 302 precision floating-point number.
 303     The input significand `zSig' has its binary point between bits 30
 304 and 29, which is 7 bits to the left of the usual location.  This shifted
 305 significand must be normalized or smaller.  If `zSig' is not normalized,
 306 `zExp' must be 0; in that case, the result returned is a subnormal number,
 307 and it must not require rounding.  In the usual case that `zSig' is
 308 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309 The handling of underflow and overflow follows the IEC/IEEE Standard for
 310 Binary Floating-Point Arithmetic.
 311 -------------------------------------------------------------------------------
 312 */
 313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 314 {
 315     int8 roundingMode;
 316     flag roundNearestEven;
 317     int8 roundIncrement, roundBits;
 318     flag isTiny;
 319
 320     roundingMode = float_rounding_mode();
 321     roundNearestEven = ( roundingMode == float_round_nearest_even );
 322     roundIncrement = 0x40;
 323     if ( ! roundNearestEven ) {
 324         if ( roundingMode == float_round_to_zero ) {
 325             roundIncrement = 0;
 326         }
 327         else {
 328             roundIncrement = 0x7F;
 329             if ( zSign ) {
 330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 331             }
 332             else {
 333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 334             }
 335         }
 336     }
 337     roundBits = zSig & 0x7F;
 338     if ( 0xFD <= (bits16) zExp ) {
 339         if (    ( 0xFD < zExp )
 340              || (    ( zExp == 0xFD )
 341                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 342            ) {
 343             float_raise( float_flag_overflow | float_flag_inexact );
 344             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 345         }
 346         if ( zExp < 0 ) {
 347             isTiny =
 348                    ( float_detect_tininess == float_tininess_before_rounding )
 349                 || ( zExp < -1 )
 350                 || ( zSig + roundIncrement < 0x80000000 );
 351             shift32RightJamming( zSig, - zExp, &zSig );
 352             zExp = 0;
 353             roundBits = zSig & 0x7F;
 354             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 355         }
 356     }
 357     if ( roundBits ) float_set_inexact();
 358     zSig = ( zSig + roundIncrement )>>7;
 359     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 360     if ( zSig == 0 ) zExp = 0;
 361     return packFloat32( zSign, zExp, zSig );
 362
 363 }
 364
 365 /*
 366 -------------------------------------------------------------------------------
 367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 368 and significand `zSig', and returns the proper single-precision floating-
 369 point value corresponding to the abstract input.  This routine is just like
 370 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 371 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 372 floating-point exponent.
 373 -------------------------------------------------------------------------------
 374 */
 375 static float32
 376  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 377 {
 378     int8 shiftCount;
 379
 380     shiftCount = countLeadingZeros32( zSig ) - 1;
 381     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 382
 383 }
 384
 385 /*
 386 -------------------------------------------------------------------------------
 387 Returns the fraction bits of the double-precision floating-point value `a'.
 388 -------------------------------------------------------------------------------
 389 */
 390 INLINE bits64 extractFloat64Frac( float64 a )
 391 {
 392
 393     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 394
 395 }
 396
 397 /*
 398 -------------------------------------------------------------------------------
 399 Returns the exponent bits of the double-precision floating-point value `a'.
 400 -------------------------------------------------------------------------------
 401 */
 402 INLINE int16 extractFloat64Exp( float64 a )
 403 {
 404
 405     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
 406
 407 }
 408
 409 /*
 410 -------------------------------------------------------------------------------
 411 Returns the sign bit of the double-precision floating-point value `a'.
 412 -------------------------------------------------------------------------------
 413 */
 414 INLINE flag extractFloat64Sign( float64 a )
 415 {
 416
 417     return FLOAT64_DEMANGLE(a)>>63;
 418
 419 }
 420
 421 /*
 422 -------------------------------------------------------------------------------
 423 Normalizes the subnormal double-precision floating-point value represented
 424 by the denormalized significand `aSig'.  The normalized exponent and
 425 significand are stored at the locations pointed to by `zExpPtr' and
 426 `zSigPtr', respectively.
 427 -------------------------------------------------------------------------------
 428 */
 429 static void
 430  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 431 {
 432     int8 shiftCount;
 433
 434     shiftCount = countLeadingZeros64( aSig ) - 11;
 435     *zSigPtr = aSig<<shiftCount;
 436     *zExpPtr = 1 - shiftCount;
 437
 438 }
 439
 440 /*
 441 -------------------------------------------------------------------------------
 442 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 443 double-precision floating-point value, returning the result.  After being
 444 shifted into the proper positions, the three fields are simply added
 445 together to form the result.  This means that any integer portion of `zSig'
 446 will be added into the exponent.  Since a properly normalized significand
 447 will have an integer portion equal to 1, the `zExp' input should be 1 less
 448 than the desired result exponent whenever `zSig' is a complete, normalized
 449 significand.
 450 -------------------------------------------------------------------------------
 451 */
 452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 453 {
 454
 455     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 456                            ( ( (bits64) zExp )<<52 ) + zSig );
 457
 458 }
 459
 460 /*
 461 -------------------------------------------------------------------------------
 462 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 463 and significand `zSig', and returns the proper double-precision floating-
 464 point value corresponding to the abstract input.  Ordinarily, the abstract
 465 value is simply rounded and packed into the double-precision format, with
 466 the inexact exception raised if the abstract input cannot be represented
 467 exactly.  However, if the abstract value is too large, the overflow and
 468 inexact exceptions are raised and an infinity or maximal finite value is
 469 returned.  If the abstract value is too small, the input value is rounded to
 470 a subnormal number, and the underflow and inexact exceptions are raised if
 471 the abstract input cannot be represented exactly as a subnormal double-
 472 precision floating-point number.
 473     The input significand `zSig' has its binary point between bits 62
 474 and 61, which is 10 bits to the left of the usual location.  This shifted
 475 significand must be normalized or smaller.  If `zSig' is not normalized,
 476 `zExp' must be 0; in that case, the result returned is a subnormal number,
 477 and it must not require rounding.  In the usual case that `zSig' is
 478 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 479 The handling of underflow and overflow follows the IEC/IEEE Standard for
 480 Binary Floating-Point Arithmetic.
 481 -------------------------------------------------------------------------------
 482 */
 483 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 484 {
 485     int8 roundingMode;
 486     flag roundNearestEven;
 487     int16 roundIncrement, roundBits;
 488     flag isTiny;
 489
 490     roundingMode = float_rounding_mode();
 491     roundNearestEven = ( roundingMode == float_round_nearest_even );
 492     roundIncrement = 0x200;
 493     if ( ! roundNearestEven ) {
 494         if ( roundingMode == float_round_to_zero ) {
 495             roundIncrement = 0;
 496         }
 497         else {
 498             roundIncrement = 0x3FF;
 499             if ( zSign ) {
 500                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 501             }
 502             else {
 503                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 504             }
 505         }
 506     }
 507     roundBits = zSig & 0x3FF;
 508     if ( 0x7FD <= (bits16) zExp ) {
 509         if (    ( 0x7FD < zExp )
 510              || (    ( zExp == 0x7FD )
 511                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 512            ) {
 513             float_raise( float_flag_overflow | float_flag_inexact );
 514             return FLOAT64_MANGLE(
 515                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 516                 ( roundIncrement == 0 ));
 517         }
 518         if ( zExp < 0 ) {
 519             isTiny =
 520                    ( float_detect_tininess == float_tininess_before_rounding )
 521                 || ( zExp < -1 )
 522                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 523             shift64RightJamming( zSig, - zExp, &zSig );
 524             zExp = 0;
 525             roundBits = zSig & 0x3FF;
 526             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 527         }
 528     }
 529     if ( roundBits ) float_set_inexact();
 530     zSig = ( zSig + roundIncrement )>>10;
 531     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 532     if ( zSig == 0 ) zExp = 0;
 533     return packFloat64( zSign, zExp, zSig );
 534
 535 }
 536
 537 /*
 538 -------------------------------------------------------------------------------
 539 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 540 and significand `zSig', and returns the proper double-precision floating-
 541 point value corresponding to the abstract input.  This routine is just like
 542 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 543 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 544 floating-point exponent.
 545 -------------------------------------------------------------------------------
 546 */
 547 static float64
 548  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 549 {
 550     int8 shiftCount;
 551
 552     shiftCount = countLeadingZeros64( zSig ) - 1;
 553     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 554
 555 }
 556
 557 #ifdef FLOATX80
 558
 559 /*
 560 -------------------------------------------------------------------------------
 561 Returns the fraction bits of the extended double-precision floating-point
 562 value `a'.
 563 -------------------------------------------------------------------------------
 564 */
 565 INLINE bits64 extractFloatx80Frac( floatx80 a )
 566 {
 567
 568     return a.low;
 569
 570 }
 571
 572 /*
 573 -------------------------------------------------------------------------------
 574 Returns the exponent bits of the extended double-precision floating-point
 575 value `a'.
 576 -------------------------------------------------------------------------------
 577 */
 578 INLINE int32 extractFloatx80Exp( floatx80 a )
 579 {
 580
 581     return a.high & 0x7FFF;
 582
 583 }
 584
 585 /*
 586 -------------------------------------------------------------------------------
 587 Returns the sign bit of the extended double-precision floating-point value
 588 `a'.
 589 -------------------------------------------------------------------------------
 590 */
 591 INLINE flag extractFloatx80Sign( floatx80 a )
 592 {
 593
 594     return a.high>>15;
 595
 596 }
 597
 598 /*
 599 -------------------------------------------------------------------------------
 600 Normalizes the subnormal extended double-precision floating-point value
 601 represented by the denormalized significand `aSig'.  The normalized exponent
 602 and significand are stored at the locations pointed to by `zExpPtr' and
 603 `zSigPtr', respectively.
 604 -------------------------------------------------------------------------------
 605 */
 606 static void
 607  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 608 {
 609     int8 shiftCount;
 610
 611     shiftCount = countLeadingZeros64( aSig );
 612     *zSigPtr = aSig<<shiftCount;
 613     *zExpPtr = 1 - shiftCount;
 614
 615 }
 616
 617 /*
 618 -------------------------------------------------------------------------------
 619 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 620 extended double-precision floating-point value, returning the result.
 621 -------------------------------------------------------------------------------
 622 */
 623 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 624 {
 625     floatx80 z;
 626
 627     z.low = zSig;
 628     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 629     return z;
 630
 631 }
 632
 633 /*
 634 -------------------------------------------------------------------------------
 635 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 636 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 637 and returns the proper extended double-precision floating-point value
 638 corresponding to the abstract input.  Ordinarily, the abstract value is
 639 rounded and packed into the extended double-precision format, with the
 640 inexact exception raised if the abstract input cannot be represented
 641 exactly.  However, if the abstract value is too large, the overflow and
 642 inexact exceptions are raised and an infinity or maximal finite value is
 643 returned.  If the abstract value is too small, the input value is rounded to
 644 a subnormal number, and the underflow and inexact exceptions are raised if
 645 the abstract input cannot be represented exactly as a subnormal extended
 646 double-precision floating-point number.
 647     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 648 number of bits as single or double precision, respectively.  Otherwise, the
 649 result is rounded to the full precision of the extended double-precision
 650 format.
 651     The input significand must be normalized or smaller.  If the input
 652 significand is not normalized, `zExp' must be 0; in that case, the result
 653 returned is a subnormal number, and it must not require rounding.  The
 654 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 655 Floating-Point Arithmetic.
 656 -------------------------------------------------------------------------------
 657 */
 658 static floatx80
 659  roundAndPackFloatx80(
 660      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 661  )
 662 {
 663     int8 roundingMode;
 664     flag roundNearestEven, increment, isTiny;
 665     int64 roundIncrement, roundMask, roundBits;
 666
 667     roundingMode = float_rounding_mode();
 668     roundNearestEven = ( roundingMode == float_round_nearest_even );
 669     if ( roundingPrecision == 80 ) goto precision80;
 670     if ( roundingPrecision == 64 ) {
 671         roundIncrement = LIT64( 0x0000000000000400 );
 672         roundMask = LIT64( 0x00000000000007FF );
 673     }
 674     else if ( roundingPrecision == 32 ) {
 675         roundIncrement = LIT64( 0x0000008000000000 );
 676         roundMask = LIT64( 0x000000FFFFFFFFFF );
 677     }
 678     else {
 679         goto precision80;
 680     }
 681     zSig0 |= ( zSig1 != 0 );
 682     if ( ! roundNearestEven ) {
 683         if ( roundingMode == float_round_to_zero ) {
 684             roundIncrement = 0;
 685         }
 686         else {
 687             roundIncrement = roundMask;
 688             if ( zSign ) {
 689                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 690             }
 691             else {
 692                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 693             }
 694         }
 695     }
 696     roundBits = zSig0 & roundMask;
 697     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 698         if (    ( 0x7FFE < zExp )
 699              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 700            ) {
 701             goto overflow;
 702         }
 703         if ( zExp <= 0 ) {
 704             isTiny =
 705                    ( float_detect_tininess == float_tininess_before_rounding )
 706                 || ( zExp < 0 )
 707                 || ( zSig0 <= zSig0 + roundIncrement );
 708             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 709             zExp = 0;
 710             roundBits = zSig0 & roundMask;
 711             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 712             if ( roundBits ) float_set_inexact();
 713             zSig0 += roundIncrement;
 714             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 715             roundIncrement = roundMask + 1;
 716             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 717                 roundMask |= roundIncrement;
 718             }
 719             zSig0 &= ~ roundMask;
 720             return packFloatx80( zSign, zExp, zSig0 );
 721         }
 722     }
 723     if ( roundBits ) float_set_inexact();
 724     zSig0 += roundIncrement;
 725     if ( zSig0 < roundIncrement ) {
 726         ++zExp;
 727         zSig0 = LIT64( 0x8000000000000000 );
 728     }
 729     roundIncrement = roundMask + 1;
 730     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 731         roundMask |= roundIncrement;
 732     }
 733     zSig0 &= ~ roundMask;
 734     if ( zSig0 == 0 ) zExp = 0;
 735     return packFloatx80( zSign, zExp, zSig0 );
 736  precision80:
 737     increment = ( (sbits64) zSig1 < 0 );
 738     if ( ! roundNearestEven ) {
 739         if ( roundingMode == float_round_to_zero ) {
 740             increment = 0;
 741         }
 742         else {
 743             if ( zSign ) {
 744                 increment = ( roundingMode == float_round_down ) && zSig1;
 745             }
 746             else {
 747                 increment = ( roundingMode == float_round_up ) && zSig1;
 748             }
 749         }
 750     }
 751     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 752         if (    ( 0x7FFE < zExp )
 753              || (    ( zExp == 0x7FFE )
 754                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 755                   && increment
 756                 )
 757            ) {
 758             roundMask = 0;
 759  overflow:
 760             float_raise( float_flag_overflow | float_flag_inexact );
 761             if (    ( roundingMode == float_round_to_zero )
 762                  || ( zSign && ( roundingMode == float_round_up ) )
 763                  || ( ! zSign && ( roundingMode == float_round_down ) )
 764                ) {
 765                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 766             }
 767             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 768         }
 769         if ( zExp <= 0 ) {
 770             isTiny =
 771                    ( float_detect_tininess == float_tininess_before_rounding )
 772                 || ( zExp < 0 )
 773                 || ! increment
 774                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 775             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 776             zExp = 0;
 777             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 778             if ( zSig1 ) float_set_inexact();
 779             if ( roundNearestEven ) {
 780                 increment = ( (sbits64) zSig1 < 0 );
 781             }
 782             else {
 783                 if ( zSign ) {
 784                     increment = ( roundingMode == float_round_down ) && zSig1;
 785                 }
 786                 else {
 787                     increment = ( roundingMode == float_round_up ) && zSig1;
 788                 }
 789             }
 790             if ( increment ) {
 791                 ++zSig0;
 792                 zSig0 &=
 793                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 794                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 795             }
 796             return packFloatx80( zSign, zExp, zSig0 );
 797         }
 798     }
 799     if ( zSig1 ) float_set_inexact();
 800     if ( increment ) {
 801         ++zSig0;
 802         if ( zSig0 == 0 ) {
 803             ++zExp;
 804             zSig0 = LIT64( 0x8000000000000000 );
 805         }
 806         else {
 807             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 808         }
 809     }
 810     else {
 811         if ( zSig0 == 0 ) zExp = 0;
 812     }
 813     return packFloatx80( zSign, zExp, zSig0 );
 814
 815 }
 816
 817 /*
 818 -------------------------------------------------------------------------------
 819 Takes an abstract floating-point value having sign `zSign', exponent
 820 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 821 and returns the proper extended double-precision floating-point value
 822 corresponding to the abstract input.  This routine is just like
 823 `roundAndPackFloatx80' except that the input significand does not have to be
 824 normalized.
 825 -------------------------------------------------------------------------------
 826 */
 827 static floatx80
 828  normalizeRoundAndPackFloatx80(
 829      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 830  )
 831 {
 832     int8 shiftCount;
 833
 834     if ( zSig0 == 0 ) {
 835         zSig0 = zSig1;
 836         zSig1 = 0;
 837         zExp -= 64;
 838     }
 839     shiftCount = countLeadingZeros64( zSig0 );
 840     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 841     zExp -= shiftCount;
 842     return
 843         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 844
 845 }
 846
 847 #endif
 848
 849 #ifdef FLOAT128
 850
 851 /*
 852 -------------------------------------------------------------------------------
 853 Returns the least-significant 64 fraction bits of the quadruple-precision
 854 floating-point value `a'.
 855 -------------------------------------------------------------------------------
 856 */
 857 INLINE bits64 extractFloat128Frac1( float128 a )
 858 {
 859
 860     return a.low;
 861
 862 }
 863
 864 /*
 865 -------------------------------------------------------------------------------
 866 Returns the most-significant 48 fraction bits of the quadruple-precision
 867 floating-point value `a'.
 868 -------------------------------------------------------------------------------
 869 */
 870 INLINE bits64 extractFloat128Frac0( float128 a )
 871 {
 872
 873     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 874
 875 }
 876
 877 /*
 878 -------------------------------------------------------------------------------
 879 Returns the exponent bits of the quadruple-precision floating-point value
 880 `a'.
 881 -------------------------------------------------------------------------------
 882 */
 883 INLINE int32 extractFloat128Exp( float128 a )
 884 {
 885
 886     return ( a.high>>48 ) & 0x7FFF;
 887
 888 }
 889
 890 /*
 891 -------------------------------------------------------------------------------
 892 Returns the sign bit of the quadruple-precision floating-point value `a'.
 893 -------------------------------------------------------------------------------
 894 */
 895 INLINE flag extractFloat128Sign( float128 a )
 896 {
 897
 898     return a.high>>63;
 899
 900 }
 901
 902 /*
 903 -------------------------------------------------------------------------------
 904 Normalizes the subnormal quadruple-precision floating-point value
 905 represented by the denormalized significand formed by the concatenation of
 906 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 907 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 908 significand are stored at the location pointed to by `zSig0Ptr', and the
 909 least significant 64 bits of the normalized significand are stored at the
 910 location pointed to by `zSig1Ptr'.
 911 -------------------------------------------------------------------------------
 912 */
 913 static void
 914  normalizeFloat128Subnormal(
 915      bits64 aSig0,
 916      bits64 aSig1,
 917      int32 *zExpPtr,
 918      bits64 *zSig0Ptr,
 919      bits64 *zSig1Ptr
 920  )
 921 {
 922     int8 shiftCount;
 923
 924     if ( aSig0 == 0 ) {
 925         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 926         if ( shiftCount < 0 ) {
 927             *zSig0Ptr = aSig1>>( - shiftCount );
 928             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 929         }
 930         else {
 931             *zSig0Ptr = aSig1<<shiftCount;
 932             *zSig1Ptr = 0;
 933         }
 934         *zExpPtr = - shiftCount - 63;
 935     }
 936     else {
 937         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 938         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 939         *zExpPtr = 1 - shiftCount;
 940     }
 941
 942 }
 943
 944 /*
 945 -------------------------------------------------------------------------------
 946 Packs the sign `zSign', the exponent `zExp', and the significand formed
 947 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 948 floating-point value, returning the result.  After being shifted into the
 949 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 950 added together to form the most significant 32 bits of the result.  This
 951 means that any integer portion of `zSig0' will be added into the exponent.
 952 Since a properly normalized significand will have an integer portion equal
 953 to 1, the `zExp' input should be 1 less than the desired result exponent
 954 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 955 significand.
 956 -------------------------------------------------------------------------------
 957 */
 958 INLINE float128
 959  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 960 {
 961     float128 z;
 962
 963     z.low = zSig1;
 964     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 965     return z;
 966
 967 }
 968
 969 /*
 970 -------------------------------------------------------------------------------
 971 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 972 and extended significand formed by the concatenation of `zSig0', `zSig1',
 973 and `zSig2', and returns the proper quadruple-precision floating-point value
 974 corresponding to the abstract input.  Ordinarily, the abstract value is
 975 simply rounded and packed into the quadruple-precision format, with the
 976 inexact exception raised if the abstract input cannot be represented
 977 exactly.  However, if the abstract value is too large, the overflow and
 978 inexact exceptions are raised and an infinity or maximal finite value is
 979 returned.  If the abstract value is too small, the input value is rounded to
 980 a subnormal number, and the underflow and inexact exceptions are raised if
 981 the abstract input cannot be represented exactly as a subnormal quadruple-
 982 precision floating-point number.
 983     The input significand must be normalized or smaller.  If the input
 984 significand is not normalized, `zExp' must be 0; in that case, the result
 985 returned is a subnormal number, and it must not require rounding.  In the
 986 usual case that the input significand is normalized, `zExp' must be 1 less
 987 than the ``true'' floating-point exponent.  The handling of underflow and
 988 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 989 -------------------------------------------------------------------------------
 990 */
 991 static float128
 992  roundAndPackFloat128(
 993      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 994 {
 995     int8 roundingMode;
 996     flag roundNearestEven, increment, isTiny;
 997
 998     roundingMode = float_rounding_mode();
 999     roundNearestEven = ( roundingMode == float_round_nearest_even );
1000     increment = ( (sbits64) zSig2 < 0 );
1001     if ( ! roundNearestEven ) {
1002         if ( roundingMode == float_round_to_zero ) {
1003             increment = 0;
1004         }
1005         else {
1006             if ( zSign ) {
1007                 increment = ( roundingMode == float_round_down ) && zSig2;
1008             }
1009             else {
1010                 increment = ( roundingMode == float_round_up ) && zSig2;
1011             }
1012         }
1013     }
1014     if ( 0x7FFD <= (bits32) zExp ) {
1015         if (    ( 0x7FFD < zExp )
1016              || (    ( zExp == 0x7FFD )
1017                   && eq128(
1018                          LIT64( 0x0001FFFFFFFFFFFF ),
1019                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1020                          zSig0,
1021                          zSig1
1022                      )
1023                   && increment
1024                 )
1025            ) {
1026             float_raise( float_flag_overflow | float_flag_inexact );
1027             if (    ( roundingMode == float_round_to_zero )
1028                  || ( zSign && ( roundingMode == float_round_up ) )
1029                  || ( ! zSign && ( roundingMode == float_round_down ) )
1030                ) {
1031                 return
1032                     packFloat128(
1033                         zSign,
1034                         0x7FFE,
1035                         LIT64( 0x0000FFFFFFFFFFFF ),
1036                         LIT64( 0xFFFFFFFFFFFFFFFF )
1037                     );
1038             }
1039             return packFloat128( zSign, 0x7FFF, 0, 0 );
1040         }
1041         if ( zExp < 0 ) {
1042             isTiny =
1043                    ( float_detect_tininess == float_tininess_before_rounding )
1044                 || ( zExp < -1 )
1045                 || ! increment
1046                 || lt128(
1047                        zSig0,
1048                        zSig1,
1049                        LIT64( 0x0001FFFFFFFFFFFF ),
1050                        LIT64( 0xFFFFFFFFFFFFFFFF )
1051                    );
1052             shift128ExtraRightJamming(
1053                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1054             zExp = 0;
1055             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1056             if ( roundNearestEven ) {
1057                 increment = ( (sbits64) zSig2 < 0 );
1058             }
1059             else {
1060                 if ( zSign ) {
1061                     increment = ( roundingMode == float_round_down ) && zSig2;
1062                 }
1063                 else {
1064                     increment = ( roundingMode == float_round_up ) && zSig2;
1065                 }
1066             }
1067         }
1068     }
1069     if ( zSig2 ) float_set_inexact();
1070     if ( increment ) {
1071         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1072         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1073     }
1074     else {
1075         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1076     }
1077     return packFloat128( zSign, zExp, zSig0, zSig1 );
1078
1079 }
1080
1081 /*
1082 -------------------------------------------------------------------------------
1083 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 returns the proper quadruple-precision floating-point value corresponding
1086 to the abstract input.  This routine is just like `roundAndPackFloat128'
1087 except that the input significand has fewer bits and does not have to be
1088 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 point exponent.
1090 -------------------------------------------------------------------------------
1091 */
1092 static float128
1093  normalizeRoundAndPackFloat128(
1094      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1095 {
1096     int8 shiftCount;
1097     bits64 zSig2;
1098
1099     if ( zSig0 == 0 ) {
1100         zSig0 = zSig1;
1101         zSig1 = 0;
1102         zExp -= 64;
1103     }
1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105     if ( 0 <= shiftCount ) {
1106         zSig2 = 0;
1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108     }
1109     else {
1110         shift128ExtraRightJamming(
1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112     }
1113     zExp -= shiftCount;
1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1115
1116 }
1117
1118 #endif
1119
1120 /*
1121 -------------------------------------------------------------------------------
1122 Returns the result of converting the 32-bit two's complement integer `a'
1123 to the single-precision floating-point format.  The conversion is performed
1124 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1125 -------------------------------------------------------------------------------
1126 */
1127 float32 int32_to_float32( int32 a )
1128 {
1129     flag zSign;
1130
1131     if ( a == 0 ) return 0;
1132     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1133     zSign = ( a < 0 );
1134     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1135
1136 }
1137
1138 /*
1139 -------------------------------------------------------------------------------
1140 Returns the result of converting the 32-bit two's complement integer `a'
1141 to the double-precision floating-point format.  The conversion is performed
1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 -------------------------------------------------------------------------------
1144 */
1145 float64 int32_to_float64( int32 a )
1146 {
1147     flag zSign;
1148     uint32 absA;
1149     int8 shiftCount;
1150     bits64 zSig;
1151
1152     if ( a == 0 ) return 0;
1153     zSign = ( a < 0 );
1154     absA = zSign ? - a : a;
1155     shiftCount = countLeadingZeros32( absA ) + 21;
1156     zSig = absA;
1157     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1158
1159 }
1160
1161 #ifdef FLOATX80
1162
1163 /*
1164 -------------------------------------------------------------------------------
1165 Returns the result of converting the 32-bit two's complement integer `a'
1166 to the extended double-precision floating-point format.  The conversion
1167 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168 Arithmetic.
1169 -------------------------------------------------------------------------------
1170 */
1171 floatx80 int32_to_floatx80( int32 a )
1172 {
1173     flag zSign;
1174     uint32 absA;
1175     int8 shiftCount;
1176     bits64 zSig;
1177
1178     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179     zSign = ( a < 0 );
1180     absA = zSign ? - a : a;
1181     shiftCount = countLeadingZeros32( absA ) + 32;
1182     zSig = absA;
1183     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1184
1185 }
1186
1187 #endif
1188
1189 #ifdef FLOAT128
1190
1191 /*
1192 -------------------------------------------------------------------------------
1193 Returns the result of converting the 32-bit two's complement integer `a' to
1194 the quadruple-precision floating-point format.  The conversion is performed
1195 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1196 -------------------------------------------------------------------------------
1197 */
1198 float128 int32_to_float128( int32 a )
1199 {
1200     flag zSign;
1201     uint32 absA;
1202     int8 shiftCount;
1203     bits64 zSig0;
1204
1205     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1206     zSign = ( a < 0 );
1207     absA = zSign ? - a : a;
1208     shiftCount = countLeadingZeros32( absA ) + 17;
1209     zSig0 = absA;
1210     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1211
1212 }
1213
1214 #endif
1215
1216 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1217 /*
1218 -------------------------------------------------------------------------------
1219 Returns the result of converting the 64-bit two's complement integer `a'
1220 to the single-precision floating-point format.  The conversion is performed
1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 -------------------------------------------------------------------------------
1223 */
1224 float32 int64_to_float32( int64 a )
1225 {
1226     flag zSign;
1227     uint64 absA;
1228     int8 shiftCount;
1229
1230     if ( a == 0 ) return 0;
1231     zSign = ( a < 0 );
1232     absA = zSign ? - a : a;
1233     shiftCount = countLeadingZeros64( absA ) - 40;
1234     if ( 0 <= shiftCount ) {
1235         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1236     }
1237     else {
1238         shiftCount += 7;
1239         if ( shiftCount < 0 ) {
1240             shift64RightJamming( absA, - shiftCount, &absA );
1241         }
1242         else {
1243             absA <<= shiftCount;
1244         }
1245         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1246     }
1247
1248 }
1249
1250 /*
1251 -------------------------------------------------------------------------------
1252 Returns the result of converting the 64-bit two's complement integer `a'
1253 to the double-precision floating-point format.  The conversion is performed
1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 -------------------------------------------------------------------------------
1256 */
1257 float64 int64_to_float64( int64 a )
1258 {
1259     flag zSign;
1260
1261     if ( a == 0 ) return 0;
1262     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263         return packFloat64( 1, 0x43E, 0 );
1264     }
1265     zSign = ( a < 0 );
1266     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1267
1268 }
1269
1270 #ifdef FLOATX80
1271
1272 /*
1273 -------------------------------------------------------------------------------
1274 Returns the result of converting the 64-bit two's complement integer `a'
1275 to the extended double-precision floating-point format.  The conversion
1276 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 Arithmetic.
1278 -------------------------------------------------------------------------------
1279 */
1280 floatx80 int64_to_floatx80( int64 a )
1281 {
1282     flag zSign;
1283     uint64 absA;
1284     int8 shiftCount;
1285
1286     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1287     zSign = ( a < 0 );
1288     absA = zSign ? - a : a;
1289     shiftCount = countLeadingZeros64( absA );
1290     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1291
1292 }
1293
1294 #endif
1295
1296 #ifdef FLOAT128
1297
1298 /*
1299 -------------------------------------------------------------------------------
1300 Returns the result of converting the 64-bit two's complement integer `a' to
1301 the quadruple-precision floating-point format.  The conversion is performed
1302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1303 -------------------------------------------------------------------------------
1304 */
1305 float128 int64_to_float128( int64 a )
1306 {
1307     flag zSign;
1308     uint64 absA;
1309     int8 shiftCount;
1310     int32 zExp;
1311     bits64 zSig0, zSig1;
1312
1313     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1314     zSign = ( a < 0 );
1315     absA = zSign ? - a : a;
1316     shiftCount = countLeadingZeros64( absA ) + 49;
1317     zExp = 0x406E - shiftCount;
1318     if ( 64 <= shiftCount ) {
1319         zSig1 = 0;
1320         zSig0 = absA;
1321         shiftCount -= 64;
1322     }
1323     else {
1324         zSig1 = absA;
1325         zSig0 = 0;
1326     }
1327     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1328     return packFloat128( zSign, zExp, zSig0, zSig1 );
1329
1330 }
1331
1332 #endif
1333 #endif /* !SOFTFLOAT_FOR_GCC */
1334
1335 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1336 /*
1337 -------------------------------------------------------------------------------
1338 Returns the result of converting the single-precision floating-point value
1339 `a' to the 32-bit two's complement integer format.  The conversion is
1340 performed according to the IEC/IEEE Standard for Binary Floating-Point
1341 Arithmetic---which means in particular that the conversion is rounded
1342 according to the current rounding mode.  If `a' is a NaN, the largest
1343 positive integer is returned.  Otherwise, if the conversion overflows, the
1344 largest integer with the same sign as `a' is returned.
1345 -------------------------------------------------------------------------------
1346 */
1347 int32 float32_to_int32( float32 a )
1348 {
1349     flag aSign;
1350     int16 aExp, shiftCount;
1351     bits32 aSig;
1352     bits64 aSig64;
1353
1354     aSig = extractFloat32Frac( a );
1355     aExp = extractFloat32Exp( a );
1356     aSign = extractFloat32Sign( a );
1357     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1358     if ( aExp ) aSig |= 0x00800000;
1359     shiftCount = 0xAF - aExp;
1360     aSig64 = aSig;
1361     aSig64 <<= 32;
1362     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1363     return roundAndPackInt32( aSign, aSig64 );
1364
1365 }
1366 #endif /* !SOFTFLOAT_FOR_GCC */
1367
1368 /*
1369 -------------------------------------------------------------------------------
1370 Returns the result of converting the single-precision floating-point value
1371 `a' to the 32-bit two's complement integer format.  The conversion is
1372 performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 Arithmetic, except that the conversion is always rounded toward zero.
1374 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1375 the conversion overflows, the largest integer with the same sign as `a' is
1376 returned.
1377 -------------------------------------------------------------------------------
1378 */
1379 int32 float32_to_int32_round_to_zero( float32 a )
1380 {
1381     flag aSign;
1382     int16 aExp, shiftCount;
1383     bits32 aSig;
1384     int32 z;
1385
1386     aSig = extractFloat32Frac( a );
1387     aExp = extractFloat32Exp( a );
1388     aSign = extractFloat32Sign( a );
1389     shiftCount = aExp - 0x9E;
1390     if ( 0 <= shiftCount ) {
1391         if ( a != 0xCF000000 ) {
1392             float_raise( float_flag_invalid );
1393             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1394         }
1395         return (sbits32) 0x80000000;
1396     }
1397     else if ( aExp <= 0x7E ) {
1398         if ( aExp | aSig ) float_set_inexact();
1399         return 0;
1400     }
1401     aSig = ( aSig | 0x00800000 )<<8;
1402     z = aSig>>( - shiftCount );
1403     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1404         float_set_inexact();
1405     }
1406     if ( aSign ) z = - z;
1407     return z;
1408
1409 }
1410
1411 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1412 /*
1413 -------------------------------------------------------------------------------
1414 Returns the result of converting the single-precision floating-point value
1415 `a' to the 64-bit two's complement integer format.  The conversion is
1416 performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 Arithmetic---which means in particular that the conversion is rounded
1418 according to the current rounding mode.  If `a' is a NaN, the largest
1419 positive integer is returned.  Otherwise, if the conversion overflows, the
1420 largest integer with the same sign as `a' is returned.
1421 -------------------------------------------------------------------------------
1422 */
1423 int64 float32_to_int64( float32 a )
1424 {
1425     flag aSign;
1426     int16 aExp, shiftCount;
1427     bits32 aSig;
1428     bits64 aSig64, aSigExtra;
1429
1430     aSig = extractFloat32Frac( a );
1431     aExp = extractFloat32Exp( a );
1432     aSign = extractFloat32Sign( a );
1433     shiftCount = 0xBE - aExp;
1434     if ( shiftCount < 0 ) {
1435         float_raise( float_flag_invalid );
1436         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1437             return LIT64( 0x7FFFFFFFFFFFFFFF );
1438         }
1439         return (sbits64) LIT64( 0x8000000000000000 );
1440     }
1441     if ( aExp ) aSig |= 0x00800000;
1442     aSig64 = aSig;
1443     aSig64 <<= 40;
1444     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1445     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1446
1447 }
1448
1449 /*
1450 -------------------------------------------------------------------------------
1451 Returns the result of converting the single-precision floating-point value
1452 `a' to the 64-bit two's complement integer format.  The conversion is
1453 performed according to the IEC/IEEE Standard for Binary Floating-Point
1454 Arithmetic, except that the conversion is always rounded toward zero.  If
1455 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1456 conversion overflows, the largest integer with the same sign as `a' is
1457 returned.
1458 -------------------------------------------------------------------------------
1459 */
1460 int64 float32_to_int64_round_to_zero( float32 a )
1461 {
1462     flag aSign;
1463     int16 aExp, shiftCount;
1464     bits32 aSig;
1465     bits64 aSig64;
1466     int64 z;
1467
1468     aSig = extractFloat32Frac( a );
1469     aExp = extractFloat32Exp( a );
1470     aSign = extractFloat32Sign( a );
1471     shiftCount = aExp - 0xBE;
1472     if ( 0 <= shiftCount ) {
1473         if ( a != 0xDF000000 ) {
1474             float_raise( float_flag_invalid );
1475             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1476                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1477             }
1478         }
1479         return (sbits64) LIT64( 0x8000000000000000 );
1480     }
1481     else if ( aExp <= 0x7E ) {
1482         if ( aExp | aSig ) float_set_inexact();
1483         return 0;
1484     }
1485     aSig64 = aSig | 0x00800000;
1486     aSig64 <<= 40;
1487     z = aSig64>>( - shiftCount );
1488     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1489         float_set_inexact();
1490     }
1491     if ( aSign ) z = - z;
1492     return z;
1493
1494 }
1495 #endif /* !SOFTFLOAT_FOR_GCC */
1496
1497 /*
1498 -------------------------------------------------------------------------------
1499 Returns the result of converting the single-precision floating-point value
1500 `a' to the double-precision floating-point format.  The conversion is
1501 performed according to the IEC/IEEE Standard for Binary Floating-Point
1502 Arithmetic.
1503 -------------------------------------------------------------------------------
1504 */
1505 float64 float32_to_float64( float32 a )
1506 {
1507     flag aSign;
1508     int16 aExp;
1509     bits32 aSig;
1510
1511     aSig = extractFloat32Frac( a );
1512     aExp = extractFloat32Exp( a );
1513     aSign = extractFloat32Sign( a );
1514     if ( aExp == 0xFF ) {
1515         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1516         return packFloat64( aSign, 0x7FF, 0 );
1517     }
1518     if ( aExp == 0 ) {
1519         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1520         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1521         --aExp;
1522     }
1523     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1524
1525 }
1526
1527 #ifdef FLOATX80
1528
1529 /*
1530 -------------------------------------------------------------------------------
1531 Returns the result of converting the single-precision floating-point value
1532 `a' to the extended double-precision floating-point format.  The conversion
1533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1534 Arithmetic.
1535 -------------------------------------------------------------------------------
1536 */
1537 floatx80 float32_to_floatx80( float32 a )
1538 {
1539     flag aSign;
1540     int16 aExp;
1541     bits32 aSig;
1542
1543     aSig = extractFloat32Frac( a );
1544     aExp = extractFloat32Exp( a );
1545     aSign = extractFloat32Sign( a );
1546     if ( aExp == 0xFF ) {
1547         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1548         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1549     }
1550     if ( aExp == 0 ) {
1551         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1552         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1553     }
1554     aSig |= 0x00800000;
1555     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1556
1557 }
1558
1559 #endif
1560
1561 #ifdef FLOAT128
1562
1563 /*
1564 -------------------------------------------------------------------------------
1565 Returns the result of converting the single-precision floating-point value
1566 `a' to the double-precision floating-point format.  The conversion is
1567 performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 Arithmetic.
1569 -------------------------------------------------------------------------------
1570 */
1571 float128 float32_to_float128( float32 a )
1572 {
1573     flag aSign;
1574     int16 aExp;
1575     bits32 aSig;
1576
1577     aSig = extractFloat32Frac( a );
1578     aExp = extractFloat32Exp( a );
1579     aSign = extractFloat32Sign( a );
1580     if ( aExp == 0xFF ) {
1581         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1582         return packFloat128( aSign, 0x7FFF, 0, 0 );
1583     }
1584     if ( aExp == 0 ) {
1585         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1586         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587         --aExp;
1588     }
1589     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1590
1591 }
1592
1593 #endif
1594
1595 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1596 /*
1597 -------------------------------------------------------------------------------
1598 Rounds the single-precision floating-point value `a' to an integer, and
1599 returns the result as a single-precision floating-point value.  The
1600 operation is performed according to the IEC/IEEE Standard for Binary
1601 Floating-Point Arithmetic.
1602 -------------------------------------------------------------------------------
1603 */
1604 float32 float32_round_to_int( float32 a )
1605 {
1606     flag aSign;
1607     int16 aExp;
1608     bits32 lastBitMask, roundBitsMask;
1609     int8 roundingMode;
1610     float32 z;
1611
1612     aExp = extractFloat32Exp( a );
1613     if ( 0x96 <= aExp ) {
1614         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1615             return propagateFloat32NaN( a, a );
1616         }
1617         return a;
1618     }
1619     if ( aExp <= 0x7E ) {
1620         if ( (bits32) ( a<<1 ) == 0 ) return a;
1621         float_set_inexact();
1622         aSign = extractFloat32Sign( a );
1623         switch ( float_rounding_mode() ) {
1624          case float_round_nearest_even:
1625             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1626                 return packFloat32( aSign, 0x7F, 0 );
1627             }
1628             break;
1629          case float_round_down:
1630             return aSign ? 0xBF800000 : 0;
1631          case float_round_up:
1632             return aSign ? 0x80000000 : 0x3F800000;
1633         }
1634         return packFloat32( aSign, 0, 0 );
1635     }
1636     lastBitMask = 1;
1637     lastBitMask <<= 0x96 - aExp;
1638     roundBitsMask = lastBitMask - 1;
1639     z = a;
1640     roundingMode = float_rounding_mode();
1641     if ( roundingMode == float_round_nearest_even ) {
1642         z += lastBitMask>>1;
1643         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1644     }
1645     else if ( roundingMode != float_round_to_zero ) {
1646         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1647             z += roundBitsMask;
1648         }
1649     }
1650     z &= ~ roundBitsMask;
1651     if ( z != a ) float_set_inexact();
1652     return z;
1653
1654 }
1655 #endif /* !SOFTFLOAT_FOR_GCC */
1656
1657 /*
1658 -------------------------------------------------------------------------------
1659 Returns the result of adding the absolute values of the single-precision
1660 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1661 before being returned.  `zSign' is ignored if the result is a NaN.
1662 The addition is performed according to the IEC/IEEE Standard for Binary
1663 Floating-Point Arithmetic.
1664 -------------------------------------------------------------------------------
1665 */
1666 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1667 {
1668     int16 aExp, bExp, zExp;
1669     bits32 aSig, bSig, zSig;
1670     int16 expDiff;
1671
1672     aSig = extractFloat32Frac( a );
1673     aExp = extractFloat32Exp( a );
1674     bSig = extractFloat32Frac( b );
1675     bExp = extractFloat32Exp( b );
1676     expDiff = aExp - bExp;
1677     aSig <<= 6;
1678     bSig <<= 6;
1679     if ( 0 < expDiff ) {
1680         if ( aExp == 0xFF ) {
1681             if ( aSig ) return propagateFloat32NaN( a, b );
1682             return a;
1683         }
1684         if ( bExp == 0 ) {
1685             --expDiff;
1686         }
1687         else {
1688             bSig |= 0x20000000;
1689         }
1690         shift32RightJamming( bSig, expDiff, &bSig );
1691         zExp = aExp;
1692     }
1693     else if ( expDiff < 0 ) {
1694         if ( bExp == 0xFF ) {
1695             if ( bSig ) return propagateFloat32NaN( a, b );
1696             return packFloat32( zSign, 0xFF, 0 );
1697         }
1698         if ( aExp == 0 ) {
1699             ++expDiff;
1700         }
1701         else {
1702             aSig |= 0x20000000;
1703         }
1704         shift32RightJamming( aSig, - expDiff, &aSig );
1705         zExp = bExp;
1706     }
1707     else {
1708         if ( aExp == 0xFF ) {
1709             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1710             return a;
1711         }
1712         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1713         zSig = 0x40000000 + aSig + bSig;
1714         zExp = aExp;
1715         goto roundAndPack;
1716     }
1717     aSig |= 0x20000000;
1718     zSig = ( aSig + bSig )<<1;
1719     --zExp;
1720     if ( (sbits32) zSig < 0 ) {
1721         zSig = aSig + bSig;
1722         ++zExp;
1723     }
1724  roundAndPack:
1725     return roundAndPackFloat32( zSign, zExp, zSig );
1726
1727 }
1728
1729 /*
1730 -------------------------------------------------------------------------------
1731 Returns the result of subtracting the absolute values of the single-
1732 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1733 difference is negated before being returned.  `zSign' is ignored if the
1734 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1735 Standard for Binary Floating-Point Arithmetic.
1736 -------------------------------------------------------------------------------
1737 */
1738 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1739 {
1740     int16 aExp, bExp, zExp;
1741     bits32 aSig, bSig, zSig;
1742     int16 expDiff;
1743
1744     aSig = extractFloat32Frac( a );
1745     aExp = extractFloat32Exp( a );
1746     bSig = extractFloat32Frac( b );
1747     bExp = extractFloat32Exp( b );
1748     expDiff = aExp - bExp;
1749     aSig <<= 7;
1750     bSig <<= 7;
1751     if ( 0 < expDiff ) goto aExpBigger;
1752     if ( expDiff < 0 ) goto bExpBigger;
1753     if ( aExp == 0xFF ) {
1754         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1755         float_raise( float_flag_invalid );
1756         return float32_default_nan;
1757     }
1758     if ( aExp == 0 ) {
1759         aExp = 1;
1760         bExp = 1;
1761     }
1762     if ( bSig < aSig ) goto aBigger;
1763     if ( aSig < bSig ) goto bBigger;
1764     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1765  bExpBigger:
1766     if ( bExp == 0xFF ) {
1767         if ( bSig ) return propagateFloat32NaN( a, b );
1768         return packFloat32( zSign ^ 1, 0xFF, 0 );
1769     }
1770     if ( aExp == 0 ) {
1771         ++expDiff;
1772     }
1773     else {
1774         aSig |= 0x40000000;
1775     }
1776     shift32RightJamming( aSig, - expDiff, &aSig );
1777     bSig |= 0x40000000;
1778  bBigger:
1779     zSig = bSig - aSig;
1780     zExp = bExp;
1781     zSign ^= 1;
1782     goto normalizeRoundAndPack;
1783  aExpBigger:
1784     if ( aExp == 0xFF ) {
1785         if ( aSig ) return propagateFloat32NaN( a, b );
1786         return a;
1787     }
1788     if ( bExp == 0 ) {
1789         --expDiff;
1790     }
1791     else {
1792         bSig |= 0x40000000;
1793     }
1794     shift32RightJamming( bSig, expDiff, &bSig );
1795     aSig |= 0x40000000;
1796  aBigger:
1797     zSig = aSig - bSig;
1798     zExp = aExp;
1799  normalizeRoundAndPack:
1800     --zExp;
1801     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1802
1803 }
1804
1805 /*
1806 -------------------------------------------------------------------------------
1807 Returns the result of adding the single-precision floating-point values `a'
1808 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1809 Binary Floating-Point Arithmetic.
1810 -------------------------------------------------------------------------------
1811 */
1812 float32 float32_add( float32 a, float32 b )
1813 {
1814     flag aSign, bSign;
1815
1816     aSign = extractFloat32Sign( a );
1817     bSign = extractFloat32Sign( b );
1818     if ( aSign == bSign ) {
1819         return addFloat32Sigs( a, b, aSign );
1820     }
1821     else {
1822         return subFloat32Sigs( a, b, aSign );
1823     }
1824
1825 }
1826
1827 /*
1828 -------------------------------------------------------------------------------
1829 Returns the result of subtracting the single-precision floating-point values
1830 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1831 for Binary Floating-Point Arithmetic.
1832 -------------------------------------------------------------------------------
1833 */
1834 float32 float32_sub( float32 a, float32 b )
1835 {
1836     flag aSign, bSign;
1837
1838     aSign = extractFloat32Sign( a );
1839     bSign = extractFloat32Sign( b );
1840     if ( aSign == bSign ) {
1841         return subFloat32Sigs( a, b, aSign );
1842     }
1843     else {
1844         return addFloat32Sigs( a, b, aSign );
1845     }
1846
1847 }
1848
1849 /*
1850 -------------------------------------------------------------------------------
1851 Returns the result of multiplying the single-precision floating-point values
1852 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1853 for Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1855 */
1856 float32 float32_mul( float32 a, float32 b )
1857 {
1858     flag aSign, bSign, zSign;
1859     int16 aExp, bExp, zExp;
1860     bits32 aSig, bSig;
1861     bits64 zSig64;
1862     bits32 zSig;
1863
1864     aSig = extractFloat32Frac( a );
1865     aExp = extractFloat32Exp( a );
1866     aSign = extractFloat32Sign( a );
1867     bSig = extractFloat32Frac( b );
1868     bExp = extractFloat32Exp( b );
1869     bSign = extractFloat32Sign( b );
1870     zSign = aSign ^ bSign;
1871     if ( aExp == 0xFF ) {
1872         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1873             return propagateFloat32NaN( a, b );
1874         }
1875         if ( ( bExp | bSig ) == 0 ) {
1876             float_raise( float_flag_invalid );
1877             return float32_default_nan;
1878         }
1879         return packFloat32( zSign, 0xFF, 0 );
1880     }
1881     if ( bExp == 0xFF ) {
1882         if ( bSig ) return propagateFloat32NaN( a, b );
1883         if ( ( aExp | aSig ) == 0 ) {
1884             float_raise( float_flag_invalid );
1885             return float32_default_nan;
1886         }
1887         return packFloat32( zSign, 0xFF, 0 );
1888     }
1889     if ( aExp == 0 ) {
1890         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1891         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1892     }
1893     if ( bExp == 0 ) {
1894         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1895         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1896     }
1897     zExp = aExp + bExp - 0x7F;
1898     aSig = ( aSig | 0x00800000 )<<7;
1899     bSig = ( bSig | 0x00800000 )<<8;
1900     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1901     zSig = zSig64;
1902     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1903         zSig <<= 1;
1904         --zExp;
1905     }
1906     return roundAndPackFloat32( zSign, zExp, zSig );
1907
1908 }
1909
1910 /*
1911 -------------------------------------------------------------------------------
1912 Returns the result of dividing the single-precision floating-point value `a'
1913 by the corresponding value `b'.  The operation is performed according to the
1914 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1915 -------------------------------------------------------------------------------
1916 */
1917 float32 float32_div( float32 a, float32 b )
1918 {
1919     flag aSign, bSign, zSign;
1920     int16 aExp, bExp, zExp;
1921     bits32 aSig, bSig, zSig;
1922
1923     aSig = extractFloat32Frac( a );
1924     aExp = extractFloat32Exp( a );
1925     aSign = extractFloat32Sign( a );
1926     bSig = extractFloat32Frac( b );
1927     bExp = extractFloat32Exp( b );
1928     bSign = extractFloat32Sign( b );
1929     zSign = aSign ^ bSign;
1930     if ( aExp == 0xFF ) {
1931         if ( aSig ) return propagateFloat32NaN( a, b );
1932         if ( bExp == 0xFF ) {
1933             if ( bSig ) return propagateFloat32NaN( a, b );
1934             float_raise( float_flag_invalid );
1935             return float32_default_nan;
1936         }
1937         return packFloat32( zSign, 0xFF, 0 );
1938     }
1939     if ( bExp == 0xFF ) {
1940         if ( bSig ) return propagateFloat32NaN( a, b );
1941         return packFloat32( zSign, 0, 0 );
1942     }
1943     if ( bExp == 0 ) {
1944         if ( bSig == 0 ) {
1945             if ( ( aExp | aSig ) == 0 ) {
1946                 float_raise( float_flag_invalid );
1947                 return float32_default_nan;
1948             }
1949             float_raise( float_flag_divbyzero );
1950             return packFloat32( zSign, 0xFF, 0 );
1951         }
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     if ( aExp == 0 ) {
1955         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1956         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1957     }
1958     zExp = aExp - bExp + 0x7D;
1959     aSig = ( aSig | 0x00800000 )<<7;
1960     bSig = ( bSig | 0x00800000 )<<8;
1961     if ( bSig <= ( aSig + aSig ) ) {
1962         aSig >>= 1;
1963         ++zExp;
1964     }
1965     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1966     if ( ( zSig & 0x3F ) == 0 ) {
1967         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1968     }
1969     return roundAndPackFloat32( zSign, zExp, zSig );
1970
1971 }
1972
1973 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1974 /*
1975 -------------------------------------------------------------------------------
1976 Returns the remainder of the single-precision floating-point value `a'
1977 with respect to the corresponding value `b'.  The operation is performed
1978 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1979 -------------------------------------------------------------------------------
1980 */
1981 float32 float32_rem( float32 a, float32 b )
1982 {
1983     flag aSign, bSign, zSign;
1984     int16 aExp, bExp, expDiff;
1985     bits32 aSig, bSig;
1986     bits32 q;
1987     bits64 aSig64, bSig64, q64;
1988     bits32 alternateASig;
1989     sbits32 sigMean;
1990
1991     aSig = extractFloat32Frac( a );
1992     aExp = extractFloat32Exp( a );
1993     aSign = extractFloat32Sign( a );
1994     bSig = extractFloat32Frac( b );
1995     bExp = extractFloat32Exp( b );
1996     bSign = extractFloat32Sign( b );
1997     if ( aExp == 0xFF ) {
1998         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1999             return propagateFloat32NaN( a, b );
2000         }
2001         float_raise( float_flag_invalid );
2002         return float32_default_nan;
2003     }
2004     if ( bExp == 0xFF ) {
2005         if ( bSig ) return propagateFloat32NaN( a, b );
2006         return a;
2007     }
2008     if ( bExp == 0 ) {
2009         if ( bSig == 0 ) {
2010             float_raise( float_flag_invalid );
2011             return float32_default_nan;
2012         }
2013         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014     }
2015     if ( aExp == 0 ) {
2016         if ( aSig == 0 ) return a;
2017         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018     }
2019     expDiff = aExp - bExp;
2020     aSig |= 0x00800000;
2021     bSig |= 0x00800000;
2022     if ( expDiff < 32 ) {
2023         aSig <<= 8;
2024         bSig <<= 8;
2025         if ( expDiff < 0 ) {
2026             if ( expDiff < -1 ) return a;
2027             aSig >>= 1;
2028         }
2029         q = ( bSig <= aSig );
2030         if ( q ) aSig -= bSig;
2031         if ( 0 < expDiff ) {
2032             q = ( ( (bits64) aSig )<<32 ) / bSig;
2033             q >>= 32 - expDiff;
2034             bSig >>= 2;
2035             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2036         }
2037         else {
2038             aSig >>= 2;
2039             bSig >>= 2;
2040         }
2041     }
2042     else {
2043         if ( bSig <= aSig ) aSig -= bSig;
2044         aSig64 = ( (bits64) aSig )<<40;
2045         bSig64 = ( (bits64) bSig )<<40;
2046         expDiff -= 64;
2047         while ( 0 < expDiff ) {
2048             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050             aSig64 = - ( ( bSig * q64 )<<38 );
2051             expDiff -= 62;
2052         }
2053         expDiff += 64;
2054         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2055         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2056         q = q64>>( 64 - expDiff );
2057         bSig <<= 6;
2058         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2059     }
2060     do {
2061         alternateASig = aSig;
2062         ++q;
2063         aSig -= bSig;
2064     } while ( 0 <= (sbits32) aSig );
2065     sigMean = aSig + alternateASig;
2066     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2067         aSig = alternateASig;
2068     }
2069     zSign = ( (sbits32) aSig < 0 );
2070     if ( zSign ) aSig = - aSig;
2071     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2072
2073 }
2074 #endif /* !SOFTFLOAT_FOR_GCC */
2075
2076 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2077 /*
2078 -------------------------------------------------------------------------------
2079 Returns the square root of the single-precision floating-point value `a'.
2080 The operation is performed according to the IEC/IEEE Standard for Binary
2081 Floating-Point Arithmetic.
2082 -------------------------------------------------------------------------------
2083 */
2084 float32 float32_sqrt( float32 a )
2085 {
2086     flag aSign;
2087     int16 aExp, zExp;
2088     bits32 aSig, zSig;
2089     bits64 rem, term;
2090
2091     aSig = extractFloat32Frac( a );
2092     aExp = extractFloat32Exp( a );
2093     aSign = extractFloat32Sign( a );
2094     if ( aExp == 0xFF ) {
2095         if ( aSig ) return propagateFloat32NaN( a, 0 );
2096         if ( ! aSign ) return a;
2097         float_raise( float_flag_invalid );
2098         return float32_default_nan;
2099     }
2100     if ( aSign ) {
2101         if ( ( aExp | aSig ) == 0 ) return a;
2102         float_raise( float_flag_invalid );
2103         return float32_default_nan;
2104     }
2105     if ( aExp == 0 ) {
2106         if ( aSig == 0 ) return 0;
2107         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2108     }
2109     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2110     aSig = ( aSig | 0x00800000 )<<8;
2111     zSig = estimateSqrt32( aExp, aSig ) + 2;
2112     if ( ( zSig & 0x7F ) <= 5 ) {
2113         if ( zSig < 2 ) {
2114             zSig = 0x7FFFFFFF;
2115             goto roundAndPack;
2116         }
2117         aSig >>= aExp & 1;
2118         term = ( (bits64) zSig ) * zSig;
2119         rem = ( ( (bits64) aSig )<<32 ) - term;
2120         while ( (sbits64) rem < 0 ) {
2121             --zSig;
2122             rem += ( ( (bits64) zSig )<<1 ) | 1;
2123         }
2124         zSig |= ( rem != 0 );
2125     }
2126     shift32RightJamming( zSig, 1, &zSig );
2127  roundAndPack:
2128     return roundAndPackFloat32( 0, zExp, zSig );
2129
2130 }
2131 #endif /* !SOFTFLOAT_FOR_GCC */
2132
2133 /*
2134 -------------------------------------------------------------------------------
2135 Returns 1 if the single-precision floating-point value `a' is equal to
2136 the corresponding value `b', and 0 otherwise.  The comparison is performed
2137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2138 -------------------------------------------------------------------------------
2139 */
2140 flag float32_eq( float32 a, float32 b )
2141 {
2142
2143     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2144          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2145        ) {
2146         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2147             float_raise( float_flag_invalid );
2148         }
2149         return 0;
2150     }
2151     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2152
2153 }
2154
2155 /*
2156 -------------------------------------------------------------------------------
2157 Returns 1 if the single-precision floating-point value `a' is less than
2158 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2159 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 Arithmetic.
2161 -------------------------------------------------------------------------------
2162 */
2163 flag float32_le( float32 a, float32 b )
2164 {
2165     flag aSign, bSign;
2166
2167     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2168          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2169        ) {
2170         float_raise( float_flag_invalid );
2171         return 0;
2172     }
2173     aSign = extractFloat32Sign( a );
2174     bSign = extractFloat32Sign( b );
2175     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2176     return ( a == b ) || ( aSign ^ ( a < b ) );
2177
2178 }
2179
2180 /*
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is less than
2183 the corresponding value `b', and 0 otherwise.  The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2186 */
2187 flag float32_lt( float32 a, float32 b )
2188 {
2189     flag aSign, bSign;
2190
2191     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2193        ) {
2194         float_raise( float_flag_invalid );
2195         return 0;
2196     }
2197     aSign = extractFloat32Sign( a );
2198     bSign = extractFloat32Sign( b );
2199     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2200     return ( a != b ) && ( aSign ^ ( a < b ) );
2201
2202 }
2203
2204 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2205 /*
2206 -------------------------------------------------------------------------------
2207 Returns 1 if the single-precision floating-point value `a' is equal to
2208 the corresponding value `b', and 0 otherwise.  The invalid exception is
2209 raised if either operand is a NaN.  Otherwise, the comparison is performed
2210 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2211 -------------------------------------------------------------------------------
2212 */
2213 flag float32_eq_signaling( float32 a, float32 b )
2214 {
2215
2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218        ) {
2219         float_raise( float_flag_invalid );
2220         return 0;
2221     }
2222     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2223
2224 }
2225
2226 /*
2227 -------------------------------------------------------------------------------
2228 Returns 1 if the single-precision floating-point value `a' is less than or
2229 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2230 cause an exception.  Otherwise, the comparison is performed according to the
2231 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2233 */
2234 flag float32_le_quiet( float32 a, float32 b )
2235 {
2236     flag aSign, bSign;
2237
2238     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2240        ) {
2241         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2242             float_raise( float_flag_invalid );
2243         }
2244         return 0;
2245     }
2246     aSign = extractFloat32Sign( a );
2247     bSign = extractFloat32Sign( b );
2248     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2249     return ( a == b ) || ( aSign ^ ( a < b ) );
2250
2251 }
2252
2253 /*
2254 -------------------------------------------------------------------------------
2255 Returns 1 if the single-precision floating-point value `a' is less than
2256 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2257 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2258 Standard for Binary Floating-Point Arithmetic.
2259 -------------------------------------------------------------------------------
2260 */
2261 flag float32_lt_quiet( float32 a, float32 b )
2262 {
2263     flag aSign, bSign;
2264
2265     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2266          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2267        ) {
2268         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2269             float_raise( float_flag_invalid );
2270         }
2271         return 0;
2272     }
2273     aSign = extractFloat32Sign( a );
2274     bSign = extractFloat32Sign( b );
2275     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2276     return ( a != b ) && ( aSign ^ ( a < b ) );
2277
2278 }
2279 #endif /* !SOFTFLOAT_FOR_GCC */
2280
2281 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2282 /*
2283 -------------------------------------------------------------------------------
2284 Returns the result of converting the double-precision floating-point value
2285 `a' to the 32-bit two's complement integer format.  The conversion is
2286 performed according to the IEC/IEEE Standard for Binary Floating-Point
2287 Arithmetic---which means in particular that the conversion is rounded
2288 according to the current rounding mode.  If `a' is a NaN, the largest
2289 positive integer is returned.  Otherwise, if the conversion overflows, the
2290 largest integer with the same sign as `a' is returned.
2291 -------------------------------------------------------------------------------
2292 */
2293 int32 float64_to_int32( float64 a )
2294 {
2295     flag aSign;
2296     int16 aExp, shiftCount;
2297     bits64 aSig;
2298
2299     aSig = extractFloat64Frac( a );
2300     aExp = extractFloat64Exp( a );
2301     aSign = extractFloat64Sign( a );
2302     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2303     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2304     shiftCount = 0x42C - aExp;
2305     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2306     return roundAndPackInt32( aSign, aSig );
2307
2308 }
2309 #endif /* !SOFTFLOAT_FOR_GCC */
2310
2311 /*
2312 -------------------------------------------------------------------------------
2313 Returns the result of converting the double-precision floating-point value
2314 `a' to the 32-bit two's complement integer format.  The conversion is
2315 performed according to the IEC/IEEE Standard for Binary Floating-Point
2316 Arithmetic, except that the conversion is always rounded toward zero.
2317 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2318 the conversion overflows, the largest integer with the same sign as `a' is
2319 returned.
2320 -------------------------------------------------------------------------------
2321 */
2322 int32 float64_to_int32_round_to_zero( float64 a )
2323 {
2324     flag aSign;
2325     int16 aExp, shiftCount;
2326     bits64 aSig, savedASig;
2327     int32 z;
2328
2329     aSig = extractFloat64Frac( a );
2330     aExp = extractFloat64Exp( a );
2331     aSign = extractFloat64Sign( a );
2332     if ( 0x41E < aExp ) {
2333         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2334         goto invalid;
2335     }
2336     else if ( aExp < 0x3FF ) {
2337         if ( aExp || aSig ) float_set_inexact();
2338         return 0;
2339     }
2340     aSig |= LIT64( 0x0010000000000000 );
2341     shiftCount = 0x433 - aExp;
2342     savedASig = aSig;
2343     aSig >>= shiftCount;
2344     z = aSig;
2345     if ( aSign ) z = - z;
2346     if ( ( z < 0 ) ^ aSign ) {
2347  invalid:
2348         float_raise( float_flag_invalid );
2349         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2350     }
2351     if ( ( aSig<<shiftCount ) != savedASig ) {
2352         float_set_inexact();
2353     }
2354     return z;
2355
2356 }
2357
2358 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2359 /*
2360 -------------------------------------------------------------------------------
2361 Returns the result of converting the double-precision floating-point value
2362 `a' to the 64-bit two's complement integer format.  The conversion is
2363 performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 Arithmetic---which means in particular that the conversion is rounded
2365 according to the current rounding mode.  If `a' is a NaN, the largest
2366 positive integer is returned.  Otherwise, if the conversion overflows, the
2367 largest integer with the same sign as `a' is returned.
2368 -------------------------------------------------------------------------------
2369 */
2370 int64 float64_to_int64( float64 a )
2371 {
2372     flag aSign;
2373     int16 aExp, shiftCount;
2374     bits64 aSig, aSigExtra;
2375
2376     aSig = extractFloat64Frac( a );
2377     aExp = extractFloat64Exp( a );
2378     aSign = extractFloat64Sign( a );
2379     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2380     shiftCount = 0x433 - aExp;
2381     if ( shiftCount <= 0 ) {
2382         if ( 0x43E < aExp ) {
2383             float_raise( float_flag_invalid );
2384             if (    ! aSign
2385                  || (    ( aExp == 0x7FF )
2386                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2387                ) {
2388                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2389             }
2390             return (sbits64) LIT64( 0x8000000000000000 );
2391         }
2392         aSigExtra = 0;
2393         aSig <<= - shiftCount;
2394     }
2395     else {
2396         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2397     }
2398     return roundAndPackInt64( aSign, aSig, aSigExtra );
2399
2400 }
2401
2402 /*
2403 -------------------------------------------------------------------------------
2404 Returns the result of converting the double-precision floating-point value
2405 `a' to the 64-bit two's complement integer format.  The conversion is
2406 performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 Arithmetic, except that the conversion is always rounded toward zero.
2408 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2409 the conversion overflows, the largest integer with the same sign as `a' is
2410 returned.
2411 -------------------------------------------------------------------------------
2412 */
2413 int64 float64_to_int64_round_to_zero( float64 a )
2414 {
2415     flag aSign;
2416     int16 aExp, shiftCount;
2417     bits64 aSig;
2418     int64 z;
2419
2420     aSig = extractFloat64Frac( a );
2421     aExp = extractFloat64Exp( a );
2422     aSign = extractFloat64Sign( a );
2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424     shiftCount = aExp - 0x433;
2425     if ( 0 <= shiftCount ) {
2426         if ( 0x43E <= aExp ) {
2427             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428                 float_raise( float_flag_invalid );
2429                 if (    ! aSign
2430                      || (    ( aExp == 0x7FF )
2431                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432                    ) {
2433                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2434                 }
2435             }
2436             return (sbits64) LIT64( 0x8000000000000000 );
2437         }
2438         z = aSig<<shiftCount;
2439     }
2440     else {
2441         if ( aExp < 0x3FE ) {
2442             if ( aExp | aSig ) float_set_inexact();
2443             return 0;
2444         }
2445         z = aSig>>( - shiftCount );
2446         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447             float_set_inexact();
2448         }
2449     }
2450     if ( aSign ) z = - z;
2451     return z;
2452
2453 }
2454 #endif /* !SOFTFLOAT_FOR_GCC */
2455
2456 /*
2457 -------------------------------------------------------------------------------
2458 Returns the result of converting the double-precision floating-point value
2459 `a' to the single-precision floating-point format.  The conversion is
2460 performed according to the IEC/IEEE Standard for Binary Floating-Point
2461 Arithmetic.
2462 -------------------------------------------------------------------------------
2463 */
2464 float32 float64_to_float32( float64 a )
2465 {
2466     flag aSign;
2467     int16 aExp;
2468     bits64 aSig;
2469     bits32 zSig;
2470
2471     aSig = extractFloat64Frac( a );
2472     aExp = extractFloat64Exp( a );
2473     aSign = extractFloat64Sign( a );
2474     if ( aExp == 0x7FF ) {
2475         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2476         return packFloat32( aSign, 0xFF, 0 );
2477     }
2478     shift64RightJamming( aSig, 22, &aSig );
2479     zSig = aSig;
2480     if ( aExp || zSig ) {
2481         zSig |= 0x40000000;
2482         aExp -= 0x381;
2483     }
2484     return roundAndPackFloat32( aSign, aExp, zSig );
2485
2486 }
2487
2488 #ifdef FLOATX80
2489
2490 /*
2491 -------------------------------------------------------------------------------
2492 Returns the result of converting the double-precision floating-point value
2493 `a' to the extended double-precision floating-point format.  The conversion
2494 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2495 Arithmetic.
2496 -------------------------------------------------------------------------------
2497 */
2498 floatx80 float64_to_floatx80( float64 a )
2499 {
2500     flag aSign;
2501     int16 aExp;
2502     bits64 aSig;
2503
2504     aSig = extractFloat64Frac( a );
2505     aExp = extractFloat64Exp( a );
2506     aSign = extractFloat64Sign( a );
2507     if ( aExp == 0x7FF ) {
2508         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2509         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2510     }
2511     if ( aExp == 0 ) {
2512         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2513         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2514     }
2515     return
2516         packFloatx80(
2517             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2518
2519 }
2520
2521 #endif
2522
2523 #ifdef FLOAT128
2524
2525 /*
2526 -------------------------------------------------------------------------------
2527 Returns the result of converting the double-precision floating-point value
2528 `a' to the quadruple-precision floating-point format.  The conversion is
2529 performed according to the IEC/IEEE Standard for Binary Floating-Point
2530 Arithmetic.
2531 -------------------------------------------------------------------------------
2532 */
2533 float128 float64_to_float128( float64 a )
2534 {
2535     flag aSign;
2536     int16 aExp;
2537     bits64 aSig, zSig0, zSig1;
2538
2539     aSig = extractFloat64Frac( a );
2540     aExp = extractFloat64Exp( a );
2541     aSign = extractFloat64Sign( a );
2542     if ( aExp == 0x7FF ) {
2543         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2544         return packFloat128( aSign, 0x7FFF, 0, 0 );
2545     }
2546     if ( aExp == 0 ) {
2547         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2548         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2549         --aExp;
2550     }
2551     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2552     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2553
2554 }
2555
2556 #endif
2557
2558 #ifndef SOFTFLOAT_FOR_GCC
2559 /*
2560 -------------------------------------------------------------------------------
2561 Rounds the double-precision floating-point value `a' to an integer, and
2562 returns the result as a double-precision floating-point value.  The
2563 operation is performed according to the IEC/IEEE Standard for Binary
2564 Floating-Point Arithmetic.
2565 -------------------------------------------------------------------------------
2566 */
2567 float64 float64_round_to_int( float64 a )
2568 {
2569     flag aSign;
2570     int16 aExp;
2571     bits64 lastBitMask, roundBitsMask;
2572     int8 roundingMode;
2573     float64 z;
2574
2575     aExp = extractFloat64Exp( a );
2576     if ( 0x433 <= aExp ) {
2577         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2578             return propagateFloat64NaN( a, a );
2579         }
2580         return a;
2581     }
2582     if ( aExp < 0x3FF ) {
2583         if ( (bits64) ( a<<1 ) == 0 ) return a;
2584         float_set_inexact();
2585         aSign = extractFloat64Sign( a );
2586         switch ( float_rounding_mode() ) {
2587          case float_round_nearest_even:
2588             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2589                 return packFloat64( aSign, 0x3FF, 0 );
2590             }
2591             break;
2592          case float_round_down:
2593             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2594          case float_round_up:
2595             return
2596             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2597         }
2598         return packFloat64( aSign, 0, 0 );
2599     }
2600     lastBitMask = 1;
2601     lastBitMask <<= 0x433 - aExp;
2602     roundBitsMask = lastBitMask - 1;
2603     z = a;
2604     roundingMode = float_rounding_mode();
2605     if ( roundingMode == float_round_nearest_even ) {
2606         z += lastBitMask>>1;
2607         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2608     }
2609     else if ( roundingMode != float_round_to_zero ) {
2610         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2611             z += roundBitsMask;
2612         }
2613     }
2614     z &= ~ roundBitsMask;
2615     if ( z != a ) float_set_inexact();
2616     return z;
2617
2618 }
2619 #endif
2620
2621 /*
2622 -------------------------------------------------------------------------------
2623 Returns the result of adding the absolute values of the double-precision
2624 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2625 before being returned.  `zSign' is ignored if the result is a NaN.
2626 The addition is performed according to the IEC/IEEE Standard for Binary
2627 Floating-Point Arithmetic.
2628 -------------------------------------------------------------------------------
2629 */
2630 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2631 {
2632     int16 aExp, bExp, zExp;
2633     bits64 aSig, bSig, zSig;
2634     int16 expDiff;
2635
2636     aSig = extractFloat64Frac( a );
2637     aExp = extractFloat64Exp( a );
2638     bSig = extractFloat64Frac( b );
2639     bExp = extractFloat64Exp( b );
2640     expDiff = aExp - bExp;
2641     aSig <<= 9;
2642     bSig <<= 9;
2643     if ( 0 < expDiff ) {
2644         if ( aExp == 0x7FF ) {
2645             if ( aSig ) return propagateFloat64NaN( a, b );
2646             return a;
2647         }
2648         if ( bExp == 0 ) {
2649             --expDiff;
2650         }
2651         else {
2652             bSig |= LIT64( 0x2000000000000000 );
2653         }
2654         shift64RightJamming( bSig, expDiff, &bSig );
2655         zExp = aExp;
2656     }
2657     else if ( expDiff < 0 ) {
2658         if ( bExp == 0x7FF ) {
2659             if ( bSig ) return propagateFloat64NaN( a, b );
2660             return packFloat64( zSign, 0x7FF, 0 );
2661         }
2662         if ( aExp == 0 ) {
2663             ++expDiff;
2664         }
2665         else {
2666             aSig |= LIT64( 0x2000000000000000 );
2667         }
2668         shift64RightJamming( aSig, - expDiff, &aSig );
2669         zExp = bExp;
2670     }
2671     else {
2672         if ( aExp == 0x7FF ) {
2673             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2674             return a;
2675         }
2676         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2677         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2678         zExp = aExp;
2679         goto roundAndPack;
2680     }
2681     aSig |= LIT64( 0x2000000000000000 );
2682     zSig = ( aSig + bSig )<<1;
2683     --zExp;
2684     if ( (sbits64) zSig < 0 ) {
2685         zSig = aSig + bSig;
2686         ++zExp;
2687     }
2688  roundAndPack:
2689     return roundAndPackFloat64( zSign, zExp, zSig );
2690
2691 }
2692
2693 /*
2694 -------------------------------------------------------------------------------
2695 Returns the result of subtracting the absolute values of the double-
2696 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2697 difference is negated before being returned.  `zSign' is ignored if the
2698 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2699 Standard for Binary Floating-Point Arithmetic.
2700 -------------------------------------------------------------------------------
2701 */
2702 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2703 {
2704     int16 aExp, bExp, zExp;
2705     bits64 aSig, bSig, zSig;
2706     int16 expDiff;
2707
2708     aSig = extractFloat64Frac( a );
2709     aExp = extractFloat64Exp( a );
2710     bSig = extractFloat64Frac( b );
2711     bExp = extractFloat64Exp( b );
2712     expDiff = aExp - bExp;
2713     aSig <<= 10;
2714     bSig <<= 10;
2715     if ( 0 < expDiff ) goto aExpBigger;
2716     if ( expDiff < 0 ) goto bExpBigger;
2717     if ( aExp == 0x7FF ) {
2718         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2719         float_raise( float_flag_invalid );
2720         return float64_default_nan;
2721     }
2722     if ( aExp == 0 ) {
2723         aExp = 1;
2724         bExp = 1;
2725     }
2726     if ( bSig < aSig ) goto aBigger;
2727     if ( aSig < bSig ) goto bBigger;
2728     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2729  bExpBigger:
2730     if ( bExp == 0x7FF ) {
2731         if ( bSig ) return propagateFloat64NaN( a, b );
2732         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2733     }
2734     if ( aExp == 0 ) {
2735         ++expDiff;
2736     }
2737     else {
2738         aSig |= LIT64( 0x4000000000000000 );
2739     }
2740     shift64RightJamming( aSig, - expDiff, &aSig );
2741     bSig |= LIT64( 0x4000000000000000 );
2742  bBigger:
2743     zSig = bSig - aSig;
2744     zExp = bExp;
2745     zSign ^= 1;
2746     goto normalizeRoundAndPack;
2747  aExpBigger:
2748     if ( aExp == 0x7FF ) {
2749         if ( aSig ) return propagateFloat64NaN( a, b );
2750         return a;
2751     }
2752     if ( bExp == 0 ) {
2753         --expDiff;
2754     }
2755     else {
2756         bSig |= LIT64( 0x4000000000000000 );
2757     }
2758     shift64RightJamming( bSig, expDiff, &bSig );
2759     aSig |= LIT64( 0x4000000000000000 );
2760  aBigger:
2761     zSig = aSig - bSig;
2762     zExp = aExp;
2763  normalizeRoundAndPack:
2764     --zExp;
2765     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2766
2767 }
2768
2769 /*
2770 -------------------------------------------------------------------------------
2771 Returns the result of adding the double-precision floating-point values `a'
2772 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2773 Binary Floating-Point Arithmetic.
2774 -------------------------------------------------------------------------------
2775 */
2776 float64 float64_add( float64 a, float64 b )
2777 {
2778     flag aSign, bSign;
2779
2780     aSign = extractFloat64Sign( a );
2781     bSign = extractFloat64Sign( b );
2782     if ( aSign == bSign ) {
2783         return addFloat64Sigs( a, b, aSign );
2784     }
2785     else {
2786         return subFloat64Sigs( a, b, aSign );
2787     }
2788
2789 }
2790
2791 /*
2792 -------------------------------------------------------------------------------
2793 Returns the result of subtracting the double-precision floating-point values
2794 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2795 for Binary Floating-Point Arithmetic.
2796 -------------------------------------------------------------------------------
2797 */
2798 float64 float64_sub( float64 a, float64 b )
2799 {
2800     flag aSign, bSign;
2801
2802     aSign = extractFloat64Sign( a );
2803     bSign = extractFloat64Sign( b );
2804     if ( aSign == bSign ) {
2805         return subFloat64Sigs( a, b, aSign );
2806     }
2807     else {
2808         return addFloat64Sigs( a, b, aSign );
2809     }
2810
2811 }
2812
2813 /*
2814 -------------------------------------------------------------------------------
2815 Returns the result of multiplying the double-precision floating-point values
2816 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2817 for Binary Floating-Point Arithmetic.
2818 -------------------------------------------------------------------------------
2819 */
2820 float64 float64_mul( float64 a, float64 b )
2821 {
2822     flag aSign, bSign, zSign;
2823     int16 aExp, bExp, zExp;
2824     bits64 aSig, bSig, zSig0, zSig1;
2825
2826     aSig = extractFloat64Frac( a );
2827     aExp = extractFloat64Exp( a );
2828     aSign = extractFloat64Sign( a );
2829     bSig = extractFloat64Frac( b );
2830     bExp = extractFloat64Exp( b );
2831     bSign = extractFloat64Sign( b );
2832     zSign = aSign ^ bSign;
2833     if ( aExp == 0x7FF ) {
2834         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835             return propagateFloat64NaN( a, b );
2836         }
2837         if ( ( bExp | bSig ) == 0 ) {
2838             float_raise( float_flag_invalid );
2839             return float64_default_nan;
2840         }
2841         return packFloat64( zSign, 0x7FF, 0 );
2842     }
2843     if ( bExp == 0x7FF ) {
2844         if ( bSig ) return propagateFloat64NaN( a, b );
2845         if ( ( aExp | aSig ) == 0 ) {
2846             float_raise( float_flag_invalid );
2847             return float64_default_nan;
2848         }
2849         return packFloat64( zSign, 0x7FF, 0 );
2850     }
2851     if ( aExp == 0 ) {
2852         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2853         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854     }
2855     if ( bExp == 0 ) {
2856         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2857         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2858     }
2859     zExp = aExp + bExp - 0x3FF;
2860     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2861     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2862     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2863     zSig0 |= ( zSig1 != 0 );
2864     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2865         zSig0 <<= 1;
2866         --zExp;
2867     }
2868     return roundAndPackFloat64( zSign, zExp, zSig0 );
2869
2870 }
2871
2872 /*
2873 -------------------------------------------------------------------------------
2874 Returns the result of dividing the double-precision floating-point value `a'
2875 by the corresponding value `b'.  The operation is performed according to
2876 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2877 -------------------------------------------------------------------------------
2878 */
2879 float64 float64_div( float64 a, float64 b )
2880 {
2881     flag aSign, bSign, zSign;
2882     int16 aExp, bExp, zExp;
2883     bits64 aSig, bSig, zSig;
2884     bits64 rem0, rem1;
2885     bits64 term0, term1;
2886
2887     aSig = extractFloat64Frac( a );
2888     aExp = extractFloat64Exp( a );
2889     aSign = extractFloat64Sign( a );
2890     bSig = extractFloat64Frac( b );
2891     bExp = extractFloat64Exp( b );
2892     bSign = extractFloat64Sign( b );
2893     zSign = aSign ^ bSign;
2894     if ( aExp == 0x7FF ) {
2895         if ( aSig ) return propagateFloat64NaN( a, b );
2896         if ( bExp == 0x7FF ) {
2897             if ( bSig ) return propagateFloat64NaN( a, b );
2898             float_raise( float_flag_invalid );
2899             return float64_default_nan;
2900         }
2901         return packFloat64( zSign, 0x7FF, 0 );
2902     }
2903     if ( bExp == 0x7FF ) {
2904         if ( bSig ) return propagateFloat64NaN( a, b );
2905         return packFloat64( zSign, 0, 0 );
2906     }
2907     if ( bExp == 0 ) {
2908         if ( bSig == 0 ) {
2909             if ( ( aExp | aSig ) == 0 ) {
2910                 float_raise( float_flag_invalid );
2911                 return float64_default_nan;
2912             }
2913             float_raise( float_flag_divbyzero );
2914             return packFloat64( zSign, 0x7FF, 0 );
2915         }
2916         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2917     }
2918     if ( aExp == 0 ) {
2919         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2920         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2921     }
2922     zExp = aExp - bExp + 0x3FD;
2923     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2924     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2925     if ( bSig <= ( aSig + aSig ) ) {
2926         aSig >>= 1;
2927         ++zExp;
2928     }
2929     zSig = estimateDiv128To64( aSig, 0, bSig );
2930     if ( ( zSig & 0x1FF ) <= 2 ) {
2931         mul64To128( bSig, zSig, &term0, &term1 );
2932         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2933         while ( (sbits64) rem0 < 0 ) {
2934             --zSig;
2935             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2936         }
2937         zSig |= ( rem1 != 0 );
2938     }
2939     return roundAndPackFloat64( zSign, zExp, zSig );
2940
2941 }
2942
2943 #ifndef SOFTFLOAT_FOR_GCC
2944 /*
2945 -------------------------------------------------------------------------------
2946 Returns the remainder of the double-precision floating-point value `a'
2947 with respect to the corresponding value `b'.  The operation is performed
2948 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2949 -------------------------------------------------------------------------------
2950 */
2951 float64 float64_rem( float64 a, float64 b )
2952 {
2953     flag aSign, bSign, zSign;
2954     int16 aExp, bExp, expDiff;
2955     bits64 aSig, bSig;
2956     bits64 q, alternateASig;
2957     sbits64 sigMean;
2958
2959     aSig = extractFloat64Frac( a );
2960     aExp = extractFloat64Exp( a );
2961     aSign = extractFloat64Sign( a );
2962     bSig = extractFloat64Frac( b );
2963     bExp = extractFloat64Exp( b );
2964     bSign = extractFloat64Sign( b );
2965     if ( aExp == 0x7FF ) {
2966         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2967             return propagateFloat64NaN( a, b );
2968         }
2969         float_raise( float_flag_invalid );
2970         return float64_default_nan;
2971     }
2972     if ( bExp == 0x7FF ) {
2973         if ( bSig ) return propagateFloat64NaN( a, b );
2974         return a;
2975     }
2976     if ( bExp == 0 ) {
2977         if ( bSig == 0 ) {
2978             float_raise( float_flag_invalid );
2979             return float64_default_nan;
2980         }
2981         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2982     }
2983     if ( aExp == 0 ) {
2984         if ( aSig == 0 ) return a;
2985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2986     }
2987     expDiff = aExp - bExp;
2988     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2989     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2990     if ( expDiff < 0 ) {
2991         if ( expDiff < -1 ) return a;
2992         aSig >>= 1;
2993     }
2994     q = ( bSig <= aSig );
2995     if ( q ) aSig -= bSig;
2996     expDiff -= 64;
2997     while ( 0 < expDiff ) {
2998         q = estimateDiv128To64( aSig, 0, bSig );
2999         q = ( 2 < q ) ? q - 2 : 0;
3000         aSig = - ( ( bSig>>2 ) * q );
3001         expDiff -= 62;
3002     }
3003     expDiff += 64;
3004     if ( 0 < expDiff ) {
3005         q = estimateDiv128To64( aSig, 0, bSig );
3006         q = ( 2 < q ) ? q - 2 : 0;
3007         q >>= 64 - expDiff;
3008         bSig >>= 2;
3009         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3010     }
3011     else {
3012         aSig >>= 2;
3013         bSig >>= 2;
3014     }
3015     do {
3016         alternateASig = aSig;
3017         ++q;
3018         aSig -= bSig;
3019     } while ( 0 <= (sbits64) aSig );
3020     sigMean = aSig + alternateASig;
3021     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3022         aSig = alternateASig;
3023     }
3024     zSign = ( (sbits64) aSig < 0 );
3025     if ( zSign ) aSig = - aSig;
3026     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3027
3028 }
3029
3030 /*
3031 -------------------------------------------------------------------------------
3032 Returns the square root of the double-precision floating-point value `a'.
3033 The operation is performed according to the IEC/IEEE Standard for Binary
3034 Floating-Point Arithmetic.
3035 -------------------------------------------------------------------------------
3036 */
3037 float64 float64_sqrt( float64 a )
3038 {
3039     flag aSign;
3040     int16 aExp, zExp;
3041     bits64 aSig, zSig, doubleZSig;
3042     bits64 rem0, rem1, term0, term1;
3043
3044     aSig = extractFloat64Frac( a );
3045     aExp = extractFloat64Exp( a );
3046     aSign = extractFloat64Sign( a );
3047     if ( aExp == 0x7FF ) {
3048         if ( aSig ) return propagateFloat64NaN( a, a );
3049         if ( ! aSign ) return a;
3050         float_raise( float_flag_invalid );
3051         return float64_default_nan;
3052     }
3053     if ( aSign ) {
3054         if ( ( aExp | aSig ) == 0 ) return a;
3055         float_raise( float_flag_invalid );
3056         return float64_default_nan;
3057     }
3058     if ( aExp == 0 ) {
3059         if ( aSig == 0 ) return 0;
3060         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3061     }
3062     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3063     aSig |= LIT64( 0x0010000000000000 );
3064     zSig = estimateSqrt32( aExp, aSig>>21 );
3065     aSig <<= 9 - ( aExp & 1 );
3066     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3067     if ( ( zSig & 0x1FF ) <= 5 ) {
3068         doubleZSig = zSig<<1;
3069         mul64To128( zSig, zSig, &term0, &term1 );
3070         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3071         while ( (sbits64) rem0 < 0 ) {
3072             --zSig;
3073             doubleZSig -= 2;
3074             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3075         }
3076         zSig |= ( ( rem0 | rem1 ) != 0 );
3077     }
3078     return roundAndPackFloat64( 0, zExp, zSig );
3079
3080 }
3081 #endif
3082
3083 /*
3084 -------------------------------------------------------------------------------
3085 Returns 1 if the double-precision floating-point value `a' is equal to the
3086 corresponding value `b', and 0 otherwise.  The comparison is performed
3087 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 -------------------------------------------------------------------------------
3089 */
3090 flag float64_eq( float64 a, float64 b )
3091 {
3092
3093     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3094          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3095        ) {
3096         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3097             float_raise( float_flag_invalid );
3098         }
3099         return 0;
3100     }
3101     return ( a == b ) ||
3102         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3103
3104 }
3105
3106 /*
3107 -------------------------------------------------------------------------------
3108 Returns 1 if the double-precision floating-point value `a' is less than or
3109 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3110 performed according to the IEC/IEEE Standard for Binary Floating-Point
3111 Arithmetic.
3112 -------------------------------------------------------------------------------
3113 */
3114 flag float64_le( float64 a, float64 b )
3115 {
3116     flag aSign, bSign;
3117
3118     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3119          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3120        ) {
3121         float_raise( float_flag_invalid );
3122         return 0;
3123     }
3124     aSign = extractFloat64Sign( a );
3125     bSign = extractFloat64Sign( b );
3126     if ( aSign != bSign )
3127         return aSign ||
3128             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3129               0 );
3130     return ( a == b ) ||
3131         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3132
3133 }
3134
3135 /*
3136 -------------------------------------------------------------------------------
3137 Returns 1 if the double-precision floating-point value `a' is less than
3138 the corresponding value `b', and 0 otherwise.  The comparison is performed
3139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 -------------------------------------------------------------------------------
3141 */
3142 flag float64_lt( float64 a, float64 b )
3143 {
3144     flag aSign, bSign;
3145
3146     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3147          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3148        ) {
3149         float_raise( float_flag_invalid );
3150         return 0;
3151     }
3152     aSign = extractFloat64Sign( a );
3153     bSign = extractFloat64Sign( b );
3154     if ( aSign != bSign )
3155         return aSign &&
3156             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3157               0 );
3158     return ( a != b ) &&
3159         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3160
3161 }
3162
3163 #ifndef SOFTFLOAT_FOR_GCC
3164 /*
3165 -------------------------------------------------------------------------------
3166 Returns 1 if the double-precision floating-point value `a' is equal to the
3167 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3168 if either operand is a NaN.  Otherwise, the comparison is performed
3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3170 -------------------------------------------------------------------------------
3171 */
3172 flag float64_eq_signaling( float64 a, float64 b )
3173 {
3174
3175     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3176          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3177        ) {
3178         float_raise( float_flag_invalid );
3179         return 0;
3180     }
3181     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3182
3183 }
3184
3185 /*
3186 -------------------------------------------------------------------------------
3187 Returns 1 if the double-precision floating-point value `a' is less than or
3188 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3189 cause an exception.  Otherwise, the comparison is performed according to the
3190 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 -------------------------------------------------------------------------------
3192 */
3193 flag float64_le_quiet( float64 a, float64 b )
3194 {
3195     flag aSign, bSign;
3196
3197     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3198          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3199        ) {
3200         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3201             float_raise( float_flag_invalid );
3202         }
3203         return 0;
3204     }
3205     aSign = extractFloat64Sign( a );
3206     bSign = extractFloat64Sign( b );
3207     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3208     return ( a == b ) || ( aSign ^ ( a < b ) );
3209
3210 }
3211
3212 /*
3213 -------------------------------------------------------------------------------
3214 Returns 1 if the double-precision floating-point value `a' is less than
3215 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3216 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3217 Standard for Binary Floating-Point Arithmetic.
3218 -------------------------------------------------------------------------------
3219 */
3220 flag float64_lt_quiet( float64 a, float64 b )
3221 {
3222     flag aSign, bSign;
3223
3224     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3225          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3226        ) {
3227         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3228             float_raise( float_flag_invalid );
3229         }
3230         return 0;
3231     }
3232     aSign = extractFloat64Sign( a );
3233     bSign = extractFloat64Sign( b );
3234     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3235     return ( a != b ) && ( aSign ^ ( a < b ) );
3236
3237 }
3238 #endif
3239
3240 #ifdef FLOATX80
3241
3242 /*
3243 -------------------------------------------------------------------------------
3244 Returns the result of converting the extended double-precision floating-
3245 point value `a' to the 32-bit two's complement integer format.  The
3246 conversion is performed according to the IEC/IEEE Standard for Binary
3247 Floating-Point Arithmetic---which means in particular that the conversion
3248 is rounded according to the current rounding mode.  If `a' is a NaN, the
3249 largest positive integer is returned.  Otherwise, if the conversion
3250 overflows, the largest integer with the same sign as `a' is returned.
3251 -------------------------------------------------------------------------------
3252 */
3253 int32 floatx80_to_int32( floatx80 a )
3254 {
3255     flag aSign;
3256     int32 aExp, shiftCount;
3257     bits64 aSig;
3258
3259     aSig = extractFloatx80Frac( a );
3260     aExp = extractFloatx80Exp( a );
3261     aSign = extractFloatx80Sign( a );
3262     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3263     shiftCount = 0x4037 - aExp;
3264     if ( shiftCount <= 0 ) shiftCount = 1;
3265     shift64RightJamming( aSig, shiftCount, &aSig );
3266     return roundAndPackInt32( aSign, aSig );
3267
3268 }
3269
3270 /*
3271 -------------------------------------------------------------------------------
3272 Returns the result of converting the extended double-precision floating-
3273 point value `a' to the 32-bit two's complement integer format.  The
3274 conversion is performed according to the IEC/IEEE Standard for Binary
3275 Floating-Point Arithmetic, except that the conversion is always rounded
3276 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3277 Otherwise, if the conversion overflows, the largest integer with the same
3278 sign as `a' is returned.
3279 -------------------------------------------------------------------------------
3280 */
3281 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3282 {
3283     flag aSign;
3284     int32 aExp, shiftCount;
3285     bits64 aSig, savedASig;
3286     int32 z;
3287
3288     aSig = extractFloatx80Frac( a );
3289     aExp = extractFloatx80Exp( a );
3290     aSign = extractFloatx80Sign( a );
3291     if ( 0x401E < aExp ) {
3292         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3293         goto invalid;
3294     }
3295     else if ( aExp < 0x3FFF ) {
3296         if ( aExp || aSig ) float_set_inexact();
3297         return 0;
3298     }
3299     shiftCount = 0x403E - aExp;
3300     savedASig = aSig;
3301     aSig >>= shiftCount;
3302     z = aSig;
3303     if ( aSign ) z = - z;
3304     if ( ( z < 0 ) ^ aSign ) {
3305  invalid:
3306         float_raise( float_flag_invalid );
3307         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3308     }
3309     if ( ( aSig<<shiftCount ) != savedASig ) {
3310         float_set_inexact();
3311     }
3312     return z;
3313
3314 }
3315
3316 /*
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 64-bit two's complement integer format.  The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic---which means in particular that the conversion
3322 is rounded according to the current rounding mode.  If `a' is a NaN,
3323 the largest positive integer is returned.  Otherwise, if the conversion
3324 overflows, the largest integer with the same sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3326 */
3327 int64 floatx80_to_int64( floatx80 a )
3328 {
3329     flag aSign;
3330     int32 aExp, shiftCount;
3331     bits64 aSig, aSigExtra;
3332
3333     aSig = extractFloatx80Frac( a );
3334     aExp = extractFloatx80Exp( a );
3335     aSign = extractFloatx80Sign( a );
3336     shiftCount = 0x403E - aExp;
3337     if ( shiftCount <= 0 ) {
3338         if ( shiftCount ) {
3339             float_raise( float_flag_invalid );
3340             if (    ! aSign
3341                  || (    ( aExp == 0x7FFF )
3342                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3343                ) {
3344                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3345             }
3346             return (sbits64) LIT64( 0x8000000000000000 );
3347         }
3348         aSigExtra = 0;
3349     }
3350     else {
3351         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3352     }
3353     return roundAndPackInt64( aSign, aSig, aSigExtra );
3354
3355 }
3356
3357 /*
3358 -------------------------------------------------------------------------------
3359 Returns the result of converting the extended double-precision floating-
3360 point value `a' to the 64-bit two's complement integer format.  The
3361 conversion is performed according to the IEC/IEEE Standard for Binary
3362 Floating-Point Arithmetic, except that the conversion is always rounded
3363 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3364 Otherwise, if the conversion overflows, the largest integer with the same
3365 sign as `a' is returned.
3366 -------------------------------------------------------------------------------
3367 */
3368 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3369 {
3370     flag aSign;
3371     int32 aExp, shiftCount;
3372     bits64 aSig;
3373     int64 z;
3374
3375     aSig = extractFloatx80Frac( a );
3376     aExp = extractFloatx80Exp( a );
3377     aSign = extractFloatx80Sign( a );
3378     shiftCount = aExp - 0x403E;
3379     if ( 0 <= shiftCount ) {
3380         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3381         if ( ( a.high != 0xC03E ) || aSig ) {
3382             float_raise( float_flag_invalid );
3383             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385             }
3386         }
3387         return (sbits64) LIT64( 0x8000000000000000 );
3388     }
3389     else if ( aExp < 0x3FFF ) {
3390         if ( aExp | aSig ) float_set_inexact();
3391         return 0;
3392     }
3393     z = aSig>>( - shiftCount );
3394     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3395         float_set_inexact();
3396     }
3397     if ( aSign ) z = - z;
3398     return z;
3399
3400 }
3401
3402 /*
3403 -------------------------------------------------------------------------------
3404 Returns the result of converting the extended double-precision floating-
3405 point value `a' to the single-precision floating-point format.  The
3406 conversion is performed according to the IEC/IEEE Standard for Binary
3407 Floating-Point Arithmetic.
3408 -------------------------------------------------------------------------------
3409 */
3410 float32 floatx80_to_float32( floatx80 a )
3411 {
3412     flag aSign;
3413     int32 aExp;
3414     bits64 aSig;
3415
3416     aSig = extractFloatx80Frac( a );
3417     aExp = extractFloatx80Exp( a );
3418     aSign = extractFloatx80Sign( a );
3419     if ( aExp == 0x7FFF ) {
3420         if ( (bits64) ( aSig<<1 ) ) {
3421             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3422         }
3423         return packFloat32( aSign, 0xFF, 0 );
3424     }
3425     shift64RightJamming( aSig, 33, &aSig );
3426     if ( aExp || aSig ) aExp -= 0x3F81;
3427     return roundAndPackFloat32( aSign, aExp, aSig );
3428
3429 }
3430
3431 /*
3432 -------------------------------------------------------------------------------
3433 Returns the result of converting the extended double-precision floating-
3434 point value `a' to the double-precision floating-point format.  The
3435 conversion is performed according to the IEC/IEEE Standard for Binary
3436 Floating-Point Arithmetic.
3437 -------------------------------------------------------------------------------
3438 */
3439 float64 floatx80_to_float64( floatx80 a )
3440 {
3441     flag aSign;
3442     int32 aExp;
3443     bits64 aSig, zSig;
3444
3445     aSig = extractFloatx80Frac( a );
3446     aExp = extractFloatx80Exp( a );
3447     aSign = extractFloatx80Sign( a );
3448     if ( aExp == 0x7FFF ) {
3449         if ( (bits64) ( aSig<<1 ) ) {
3450             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3451         }
3452         return packFloat64( aSign, 0x7FF, 0 );
3453     }
3454     shift64RightJamming( aSig, 1, &zSig );
3455     if ( aExp || aSig ) aExp -= 0x3C01;
3456     return roundAndPackFloat64( aSign, aExp, zSig );
3457
3458 }
3459
3460 #ifdef FLOAT128
3461
3462 /*
3463 -------------------------------------------------------------------------------
3464 Returns the result of converting the extended double-precision floating-
3465 point value `a' to the quadruple-precision floating-point format.  The
3466 conversion is performed according to the IEC/IEEE Standard for Binary
3467 Floating-Point Arithmetic.
3468 -------------------------------------------------------------------------------
3469 */
3470 float128 floatx80_to_float128( floatx80 a )
3471 {
3472     flag aSign;
3473     int16 aExp;
3474     bits64 aSig, zSig0, zSig1;
3475
3476     aSig = extractFloatx80Frac( a );
3477     aExp = extractFloatx80Exp( a );
3478     aSign = extractFloatx80Sign( a );
3479     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3480         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3481     }
3482     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3483     return packFloat128( aSign, aExp, zSig0, zSig1 );
3484
3485 }
3486
3487 #endif
3488
3489 /*
3490 -------------------------------------------------------------------------------
3491 Rounds the extended double-precision floating-point value `a' to an integer,
3492 and returns the result as an extended quadruple-precision floating-point
3493 value.  The operation is performed according to the IEC/IEEE Standard for
3494 Binary Floating-Point Arithmetic.
3495 -------------------------------------------------------------------------------
3496 */
3497 floatx80 floatx80_round_to_int( floatx80 a )
3498 {
3499     flag aSign;
3500     int32 aExp;
3501     bits64 lastBitMask, roundBitsMask;
3502     int8 roundingMode;
3503     floatx80 z;
3504
3505     aExp = extractFloatx80Exp( a );
3506     if ( 0x403E <= aExp ) {
3507         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3508             return propagateFloatx80NaN( a, a );
3509         }
3510         return a;
3511     }
3512     if ( aExp < 0x3FFF ) {
3513         if (    ( aExp == 0 )
3514              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3515             return a;
3516         }
3517         float_set_inexact();
3518         aSign = extractFloatx80Sign( a );
3519         switch ( float_rounding_mode() ) {
3520          case float_round_nearest_even:
3521             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3522                ) {
3523                 return
3524                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3525             }
3526             break;
3527          case float_round_down:
3528             return
3529                   aSign ?
3530                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531                 : packFloatx80( 0, 0, 0 );
3532          case float_round_up:
3533             return
3534                   aSign ? packFloatx80( 1, 0, 0 )
3535                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536         }
3537         return packFloatx80( aSign, 0, 0 );
3538     }
3539     lastBitMask = 1;
3540     lastBitMask <<= 0x403E - aExp;
3541     roundBitsMask = lastBitMask - 1;
3542     z = a;
3543     roundingMode = float_rounding_mode();
3544     if ( roundingMode == float_round_nearest_even ) {
3545         z.low += lastBitMask>>1;
3546         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3547     }
3548     else if ( roundingMode != float_round_to_zero ) {
3549         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3550             z.low += roundBitsMask;
3551         }
3552     }
3553     z.low &= ~ roundBitsMask;
3554     if ( z.low == 0 ) {
3555         ++z.high;
3556         z.low = LIT64( 0x8000000000000000 );
3557     }
3558     if ( z.low != a.low ) float_set_inexact();
3559     return z;
3560
3561 }
3562
3563 /*
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3567 negated before being returned.  `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3571 */
3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3573 {
3574     int32 aExp, bExp, zExp;
3575     bits64 aSig, bSig, zSig0, zSig1;
3576     int32 expDiff;
3577
3578     aSig = extractFloatx80Frac( a );
3579     aExp = extractFloatx80Exp( a );
3580     bSig = extractFloatx80Frac( b );
3581     bExp = extractFloatx80Exp( b );
3582     expDiff = aExp - bExp;
3583     if ( 0 < expDiff ) {
3584         if ( aExp == 0x7FFF ) {
3585             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586             return a;
3587         }
3588         if ( bExp == 0 ) --expDiff;
3589         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590         zExp = aExp;
3591     }
3592     else if ( expDiff < 0 ) {
3593         if ( bExp == 0x7FFF ) {
3594             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3595             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596         }
3597         if ( aExp == 0 ) ++expDiff;
3598         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3599         zExp = bExp;
3600     }
3601     else {
3602         if ( aExp == 0x7FFF ) {
3603             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3604                 return propagateFloatx80NaN( a, b );
3605             }
3606             return a;
3607         }
3608         zSig1 = 0;
3609         zSig0 = aSig + bSig;
3610         if ( aExp == 0 ) {
3611             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3612             goto roundAndPack;
3613         }
3614         zExp = aExp;
3615         goto shiftRight1;
3616     }
3617     zSig0 = aSig + bSig;
3618     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3619  shiftRight1:
3620     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3621     zSig0 |= LIT64( 0x8000000000000000 );
3622     ++zExp;
3623  roundAndPack:
3624     return
3625         roundAndPackFloatx80(
3626             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3627
3628 }
3629
3630 /*
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3634 difference is negated before being returned.  `zSign' is ignored if the
3635 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3638 */
3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3640 {
3641     int32 aExp, bExp, zExp;
3642     bits64 aSig, bSig, zSig0, zSig1;
3643     int32 expDiff;
3644     floatx80 z;
3645
3646     aSig = extractFloatx80Frac( a );
3647     aExp = extractFloatx80Exp( a );
3648     bSig = extractFloatx80Frac( b );
3649     bExp = extractFloatx80Exp( b );
3650     expDiff = aExp - bExp;
3651     if ( 0 < expDiff ) goto aExpBigger;
3652     if ( expDiff < 0 ) goto bExpBigger;
3653     if ( aExp == 0x7FFF ) {
3654         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3655             return propagateFloatx80NaN( a, b );
3656         }
3657         float_raise( float_flag_invalid );
3658         z.low = floatx80_default_nan_low;
3659         z.high = floatx80_default_nan_high;
3660         return z;
3661     }
3662     if ( aExp == 0 ) {
3663         aExp = 1;
3664         bExp = 1;
3665     }
3666     zSig1 = 0;
3667     if ( bSig < aSig ) goto aBigger;
3668     if ( aSig < bSig ) goto bBigger;
3669     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3670  bExpBigger:
3671     if ( bExp == 0x7FFF ) {
3672         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3674     }
3675     if ( aExp == 0 ) ++expDiff;
3676     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3677  bBigger:
3678     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679     zExp = bExp;
3680     zSign ^= 1;
3681     goto normalizeRoundAndPack;
3682  aExpBigger:
3683     if ( aExp == 0x7FFF ) {
3684         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685         return a;
3686     }
3687     if ( bExp == 0 ) --expDiff;
3688     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3689  aBigger:
3690     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3691     zExp = aExp;
3692  normalizeRoundAndPack:
3693     return
3694         normalizeRoundAndPackFloatx80(
3695             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3696
3697 }
3698
3699 /*
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3705 */
3706 floatx80 floatx80_add( floatx80 a, floatx80 b )
3707 {
3708     flag aSign, bSign;
3709
3710     aSign = extractFloatx80Sign( a );
3711     bSign = extractFloatx80Sign( b );
3712     if ( aSign == bSign ) {
3713         return addFloatx80Sigs( a, b, aSign );
3714     }
3715     else {
3716         return subFloatx80Sigs( a, b, aSign );
3717     }
3718
3719 }
3720
3721 /*
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'.  The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3727 */
3728 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3729 {
3730     flag aSign, bSign;
3731
3732     aSign = extractFloatx80Sign( a );
3733     bSign = extractFloatx80Sign( b );
3734     if ( aSign == bSign ) {
3735         return subFloatx80Sigs( a, b, aSign );
3736     }
3737     else {
3738         return addFloatx80Sigs( a, b, aSign );
3739     }
3740
3741 }
3742
3743 /*
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'.  The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3749 */
3750 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3751 {
3752     flag aSign, bSign, zSign;
3753     int32 aExp, bExp, zExp;
3754     bits64 aSig, bSig, zSig0, zSig1;
3755     floatx80 z;
3756
3757     aSig = extractFloatx80Frac( a );
3758     aExp = extractFloatx80Exp( a );
3759     aSign = extractFloatx80Sign( a );
3760     bSig = extractFloatx80Frac( b );
3761     bExp = extractFloatx80Exp( b );
3762     bSign = extractFloatx80Sign( b );
3763     zSign = aSign ^ bSign;
3764     if ( aExp == 0x7FFF ) {
3765         if (    (bits64) ( aSig<<1 )
3766              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3767             return propagateFloatx80NaN( a, b );
3768         }
3769         if ( ( bExp | bSig ) == 0 ) goto invalid;
3770         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3771     }
3772     if ( bExp == 0x7FFF ) {
3773         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3774         if ( ( aExp | aSig ) == 0 ) {
3775  invalid:
3776             float_raise( float_flag_invalid );
3777             z.low = floatx80_default_nan_low;
3778             z.high = floatx80_default_nan_high;
3779             return z;
3780         }
3781         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3782     }
3783     if ( aExp == 0 ) {
3784         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3785         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3786     }
3787     if ( bExp == 0 ) {
3788         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3790     }
3791     zExp = aExp + bExp - 0x3FFE;
3792     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3793     if ( 0 < (sbits64) zSig0 ) {
3794         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3795         --zExp;
3796     }
3797     return
3798         roundAndPackFloatx80(
3799             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3800
3801 }
3802
3803 /*
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'.  The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3809 */
3810 floatx80 floatx80_div( floatx80 a, floatx80 b )
3811 {
3812     flag aSign, bSign, zSign;
3813     int32 aExp, bExp, zExp;
3814     bits64 aSig, bSig, zSig0, zSig1;
3815     bits64 rem0, rem1, rem2, term0, term1, term2;
3816     floatx80 z;
3817
3818     aSig = extractFloatx80Frac( a );
3819     aExp = extractFloatx80Exp( a );
3820     aSign = extractFloatx80Sign( a );
3821     bSig = extractFloatx80Frac( b );
3822     bExp = extractFloatx80Exp( b );
3823     bSign = extractFloatx80Sign( b );
3824     zSign = aSign ^ bSign;
3825     if ( aExp == 0x7FFF ) {
3826         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827         if ( bExp == 0x7FFF ) {
3828             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829             goto invalid;
3830         }
3831         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3832     }
3833     if ( bExp == 0x7FFF ) {
3834         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3835         return packFloatx80( zSign, 0, 0 );
3836     }
3837     if ( bExp == 0 ) {
3838         if ( bSig == 0 ) {
3839             if ( ( aExp | aSig ) == 0 ) {
3840  invalid:
3841                 float_raise( float_flag_invalid );
3842                 z.low = floatx80_default_nan_low;
3843                 z.high = floatx80_default_nan_high;
3844                 return z;
3845             }
3846             float_raise( float_flag_divbyzero );
3847             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3848         }
3849         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3850     }
3851     if ( aExp == 0 ) {
3852         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3853         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3854     }
3855     zExp = aExp - bExp + 0x3FFE;
3856     rem1 = 0;
3857     if ( bSig <= aSig ) {
3858         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859         ++zExp;
3860     }
3861     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3862     mul64To128( bSig, zSig0, &term0, &term1 );
3863     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3864     while ( (sbits64) rem0 < 0 ) {
3865         --zSig0;
3866         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3867     }
3868     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3869     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3870         mul64To128( bSig, zSig1, &term1, &term2 );
3871         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3872         while ( (sbits64) rem1 < 0 ) {
3873             --zSig1;
3874             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3875         }
3876         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3877     }
3878     return
3879         roundAndPackFloatx80(
3880             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3881
3882 }
3883
3884 /*
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'.  The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3890 */
3891 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3892 {
3893     flag aSign, bSign, zSign;
3894     int32 aExp, bExp, expDiff;
3895     bits64 aSig0, aSig1, bSig;
3896     bits64 q, term0, term1, alternateASig0, alternateASig1;
3897     floatx80 z;
3898
3899     aSig0 = extractFloatx80Frac( a );
3900     aExp = extractFloatx80Exp( a );
3901     aSign = extractFloatx80Sign( a );
3902     bSig = extractFloatx80Frac( b );
3903     bExp = extractFloatx80Exp( b );
3904     bSign = extractFloatx80Sign( b );
3905     if ( aExp == 0x7FFF ) {
3906         if (    (bits64) ( aSig0<<1 )
3907              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3908             return propagateFloatx80NaN( a, b );
3909         }
3910         goto invalid;
3911     }
3912     if ( bExp == 0x7FFF ) {
3913         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914         return a;
3915     }
3916     if ( bExp == 0 ) {
3917         if ( bSig == 0 ) {
3918  invalid:
3919             float_raise( float_flag_invalid );
3920             z.low = floatx80_default_nan_low;
3921             z.high = floatx80_default_nan_high;
3922             return z;
3923         }
3924         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3925     }
3926     if ( aExp == 0 ) {
3927         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3928         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3929     }
3930     bSig |= LIT64( 0x8000000000000000 );
3931     zSign = aSign;
3932     expDiff = aExp - bExp;
3933     aSig1 = 0;
3934     if ( expDiff < 0 ) {
3935         if ( expDiff < -1 ) return a;
3936         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937         expDiff = 0;
3938     }
3939     q = ( bSig <= aSig0 );
3940     if ( q ) aSig0 -= bSig;
3941     expDiff -= 64;
3942     while ( 0 < expDiff ) {
3943         q = estimateDiv128To64( aSig0, aSig1, bSig );
3944         q = ( 2 < q ) ? q - 2 : 0;
3945         mul64To128( bSig, q, &term0, &term1 );
3946         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3947         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3948         expDiff -= 62;
3949     }
3950     expDiff += 64;
3951     if ( 0 < expDiff ) {
3952         q = estimateDiv128To64( aSig0, aSig1, bSig );
3953         q = ( 2 < q ) ? q - 2 : 0;
3954         q >>= 64 - expDiff;
3955         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3956         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3957         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3958         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3959             ++q;
3960             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961         }
3962     }
3963     else {
3964         term1 = 0;
3965         term0 = bSig;
3966     }
3967     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3968     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3969          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970               && ( q & 1 ) )
3971        ) {
3972         aSig0 = alternateASig0;
3973         aSig1 = alternateASig1;
3974         zSign = ! zSign;
3975     }
3976     return
3977         normalizeRoundAndPackFloatx80(
3978             80, zSign, bExp + expDiff, aSig0, aSig1 );
3979
3980 }
3981
3982 /*
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'.  The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3988 */
3989 floatx80 floatx80_sqrt( floatx80 a )
3990 {
3991     flag aSign;
3992     int32 aExp, zExp;
3993     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3994     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995     floatx80 z;
3996
3997     aSig0 = extractFloatx80Frac( a );
3998     aExp = extractFloatx80Exp( a );
3999     aSign = extractFloatx80Sign( a );
4000     if ( aExp == 0x7FFF ) {
4001         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4002         if ( ! aSign ) return a;
4003         goto invalid;
4004     }
4005     if ( aSign ) {
4006         if ( ( aExp | aSig0 ) == 0 ) return a;
4007  invalid:
4008         float_raise( float_flag_invalid );
4009         z.low = floatx80_default_nan_low;
4010         z.high = floatx80_default_nan_high;
4011         return z;
4012     }
4013     if ( aExp == 0 ) {
4014         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4015         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4016     }
4017     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4018     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4019     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4020     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4021     doubleZSig0 = zSig0<<1;
4022     mul64To128( zSig0, zSig0, &term0, &term1 );
4023     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4024     while ( (sbits64) rem0 < 0 ) {
4025         --zSig0;
4026         doubleZSig0 -= 2;
4027         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4028     }
4029     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4030     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031         if ( zSig1 == 0 ) zSig1 = 1;
4032         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4033         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4034         mul64To128( zSig1, zSig1, &term2, &term3 );
4035         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4036         while ( (sbits64) rem1 < 0 ) {
4037             --zSig1;
4038             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4039             term3 |= 1;
4040             term2 |= doubleZSig0;
4041             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4042         }
4043         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4044     }
4045     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4046     zSig0 |= doubleZSig0;
4047     return
4048         roundAndPackFloatx80(
4049             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4050
4051 }
4052
4053 /*
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4058 Arithmetic.
4059 -------------------------------------------------------------------------------
4060 */
4061 flag floatx80_eq( floatx80 a, floatx80 b )
4062 {
4063
4064     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4065               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4066          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4067               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4068        ) {
4069         if (    floatx80_is_signaling_nan( a )
4070              || floatx80_is_signaling_nan( b ) ) {
4071             float_raise( float_flag_invalid );
4072         }
4073         return 0;
4074     }
4075     return
4076            ( a.low == b.low )
4077         && (    ( a.high == b.high )
4078              || (    ( a.low == 0 )
4079                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4080            );
4081
4082 }
4083
4084 /*
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise.  The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4091 */
4092 flag floatx80_le( floatx80 a, floatx80 b )
4093 {
4094     flag aSign, bSign;
4095
4096     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4097               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4098          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4099               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4100        ) {
4101         float_raise( float_flag_invalid );
4102         return 0;
4103     }
4104     aSign = extractFloatx80Sign( a );
4105     bSign = extractFloatx80Sign( b );
4106     if ( aSign != bSign ) {
4107         return
4108                aSign
4109             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4110                  == 0 );
4111     }
4112     return
4113           aSign ? le128( b.high, b.low, a.high, a.low )
4114         : le128( a.high, a.low, b.high, b.low );
4115
4116 }
4117
4118 /*
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise.  The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123 Arithmetic.
4124 -------------------------------------------------------------------------------
4125 */
4126 flag floatx80_lt( floatx80 a, floatx80 b )
4127 {
4128     flag aSign, bSign;
4129
4130     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4131               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4133               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134        ) {
4135         float_raise( float_flag_invalid );
4136         return 0;
4137     }
4138     aSign = extractFloatx80Sign( a );
4139     bSign = extractFloatx80Sign( b );
4140     if ( aSign != bSign ) {
4141         return
4142                aSign
4143             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144                  != 0 );
4145     }
4146     return
4147           aSign ? lt128( b.high, b.low, a.high, a.low )
4148         : lt128( a.high, a.low, b.high, b.low );
4149
4150 }
4151
4152 /*
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4156 raised if either operand is a NaN.  Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4159 */
4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4161 {
4162
4163     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4164               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4165          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4166               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4167        ) {
4168         float_raise( float_flag_invalid );
4169         return 0;
4170     }
4171     return
4172            ( a.low == b.low )
4173         && (    ( a.high == b.high )
4174              || (    ( a.low == 0 )
4175                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4176            );
4177
4178 }
4179
4180 /*
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4184 do not cause an exception.  Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4187 */
4188 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4189 {
4190     flag aSign, bSign;
4191
4192     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4193               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4194          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4195               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4196        ) {
4197         if (    floatx80_is_signaling_nan( a )
4198              || floatx80_is_signaling_nan( b ) ) {
4199             float_raise( float_flag_invalid );
4200         }
4201         return 0;
4202     }
4203     aSign = extractFloatx80Sign( a );
4204     bSign = extractFloatx80Sign( b );
4205     if ( aSign != bSign ) {
4206         return
4207                aSign
4208             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4209                  == 0 );
4210     }
4211     return
4212           aSign ? le128( b.high, b.low, a.high, a.low )
4213         : le128( a.high, a.low, b.high, b.low );
4214
4215 }
4216
4217 /*
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4221 an exception.  Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4224 */
4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4226 {
4227     flag aSign, bSign;
4228
4229     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4230               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4231          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4232               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4233        ) {
4234         if (    floatx80_is_signaling_nan( a )
4235              || floatx80_is_signaling_nan( b ) ) {
4236             float_raise( float_flag_invalid );
4237         }
4238         return 0;
4239     }
4240     aSign = extractFloatx80Sign( a );
4241     bSign = extractFloatx80Sign( b );
4242     if ( aSign != bSign ) {
4243         return
4244                aSign
4245             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246                  != 0 );
4247     }
4248     return
4249           aSign ? lt128( b.high, b.low, a.high, a.low )
4250         : lt128( a.high, a.low, b.high, b.low );
4251
4252 }
4253
4254 #endif
4255
4256 #ifdef FLOAT128
4257
4258 /*
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format.  The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode.  If `a' is a NaN, the largest
4265 positive integer is returned.  Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4268 */
4269 int32 float128_to_int32( float128 a )
4270 {
4271     flag aSign;
4272     int32 aExp, shiftCount;
4273     bits64 aSig0, aSig1;
4274
4275     aSig1 = extractFloat128Frac1( a );
4276     aSig0 = extractFloat128Frac0( a );
4277     aExp = extractFloat128Exp( a );
4278     aSign = extractFloat128Sign( a );
4279     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4280     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4281     aSig0 |= ( aSig1 != 0 );
4282     shiftCount = 0x4028 - aExp;
4283     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4284     return roundAndPackInt32( aSign, aSig0 );
4285
4286 }
4287
4288 /*
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format.  The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero.  If
4294 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4296 returned.
4297 -------------------------------------------------------------------------------
4298 */
4299 int32 float128_to_int32_round_to_zero( float128 a )
4300 {
4301     flag aSign;
4302     int32 aExp, shiftCount;
4303     bits64 aSig0, aSig1, savedASig;
4304     int32 z;
4305
4306     aSig1 = extractFloat128Frac1( a );
4307     aSig0 = extractFloat128Frac0( a );
4308     aExp = extractFloat128Exp( a );
4309     aSign = extractFloat128Sign( a );
4310     aSig0 |= ( aSig1 != 0 );
4311     if ( 0x401E < aExp ) {
4312         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313         goto invalid;
4314     }
4315     else if ( aExp < 0x3FFF ) {
4316         if ( aExp || aSig0 ) float_set_inexact();
4317         return 0;
4318     }
4319     aSig0 |= LIT64( 0x0001000000000000 );
4320     shiftCount = 0x402F - aExp;
4321     savedASig = aSig0;
4322     aSig0 >>= shiftCount;
4323     z = aSig0;
4324     if ( aSign ) z = - z;
4325     if ( ( z < 0 ) ^ aSign ) {
4326  invalid:
4327         float_raise( float_flag_invalid );
4328         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4329     }
4330     if ( ( aSig0<<shiftCount ) != savedASig ) {
4331         float_set_inexact();
4332     }
4333     return z;
4334
4335 }
4336
4337 /*
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format.  The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode.  If `a' is a NaN, the largest
4344 positive integer is returned.  Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4347 */
4348 int64 float128_to_int64( float128 a )
4349 {
4350     flag aSign;
4351     int32 aExp, shiftCount;
4352     bits64 aSig0, aSig1;
4353
4354     aSig1 = extractFloat128Frac1( a );
4355     aSig0 = extractFloat128Frac0( a );
4356     aExp = extractFloat128Exp( a );
4357     aSign = extractFloat128Sign( a );
4358     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4359     shiftCount = 0x402F - aExp;
4360     if ( shiftCount <= 0 ) {
4361         if ( 0x403E < aExp ) {
4362             float_raise( float_flag_invalid );
4363             if (    ! aSign
4364                  || (    ( aExp == 0x7FFF )
4365                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4366                     )
4367                ) {
4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4369             }
4370             return (sbits64) LIT64( 0x8000000000000000 );
4371         }
4372         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4373     }
4374     else {
4375         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4376     }
4377     return roundAndPackInt64( aSign, aSig0, aSig1 );
4378
4379 }
4380
4381 /*
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format.  The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4389 returned.
4390 -------------------------------------------------------------------------------
4391 */
4392 int64 float128_to_int64_round_to_zero( float128 a )
4393 {
4394     flag aSign;
4395     int32 aExp, shiftCount;
4396     bits64 aSig0, aSig1;
4397     int64 z;
4398
4399     aSig1 = extractFloat128Frac1( a );
4400     aSig0 = extractFloat128Frac0( a );
4401     aExp = extractFloat128Exp( a );
4402     aSign = extractFloat128Sign( a );
4403     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4404     shiftCount = aExp - 0x402F;
4405     if ( 0 < shiftCount ) {
4406         if ( 0x403E <= aExp ) {
4407             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4408             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4409                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4410                 if ( aSig1 ) float_set_inexact();
4411             }
4412             else {
4413                 float_raise( float_flag_invalid );
4414                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4415                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4416                 }
4417             }
4418             return (sbits64) LIT64( 0x8000000000000000 );
4419         }
4420         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4421         if ( (bits64) ( aSig1<<shiftCount ) ) {
4422             float_set_inexact();
4423         }
4424     }
4425     else {
4426         if ( aExp < 0x3FFF ) {
4427             if ( aExp | aSig0 | aSig1 ) {
4428                 float_set_inexact();
4429             }
4430             return 0;
4431         }
4432         z = aSig0>>( - shiftCount );
4433         if (    aSig1
4434              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4435             float_set_inexact();
4436         }
4437     }
4438     if ( aSign ) z = - z;
4439     return z;
4440
4441 }
4442
4443 /*
4444 -------------------------------------------------------------------------------
4445 Returns the result of converting the quadruple-precision floating-point
4446 value `a' to the single-precision floating-point format.  The conversion
4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4448 Arithmetic.
4449 -------------------------------------------------------------------------------
4450 */
4451 float32 float128_to_float32( float128 a )
4452 {
4453     flag aSign;
4454     int32 aExp;
4455     bits64 aSig0, aSig1;
4456     bits32 zSig;
4457
4458     aSig1 = extractFloat128Frac1( a );
4459     aSig0 = extractFloat128Frac0( a );
4460     aExp = extractFloat128Exp( a );
4461     aSign = extractFloat128Sign( a );
4462     if ( aExp == 0x7FFF ) {
4463         if ( aSig0 | aSig1 ) {
4464             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4465         }
4466         return packFloat32( aSign, 0xFF, 0 );
4467     }
4468     aSig0 |= ( aSig1 != 0 );
4469     shift64RightJamming( aSig0, 18, &aSig0 );
4470     zSig = aSig0;
4471     if ( aExp || zSig ) {
4472         zSig |= 0x40000000;
4473         aExp -= 0x3F81;
4474     }
4475     return roundAndPackFloat32( aSign, aExp, zSig );
4476
4477 }
4478
4479 /*
4480 -------------------------------------------------------------------------------
4481 Returns the result of converting the quadruple-precision floating-point
4482 value `a' to the double-precision floating-point format.  The conversion
4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4484 Arithmetic.
4485 -------------------------------------------------------------------------------
4486 */
4487 float64 float128_to_float64( float128 a )
4488 {
4489     flag aSign;
4490     int32 aExp;
4491     bits64 aSig0, aSig1;
4492
4493     aSig1 = extractFloat128Frac1( a );
4494     aSig0 = extractFloat128Frac0( a );
4495     aExp = extractFloat128Exp( a );
4496     aSign = extractFloat128Sign( a );
4497     if ( aExp == 0x7FFF ) {
4498         if ( aSig0 | aSig1 ) {
4499             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4500         }
4501         return packFloat64( aSign, 0x7FF, 0 );
4502     }
4503     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4504     aSig0 |= ( aSig1 != 0 );
4505     if ( aExp || aSig0 ) {
4506         aSig0 |= LIT64( 0x4000000000000000 );
4507         aExp -= 0x3C01;
4508     }
4509     return roundAndPackFloat64( aSign, aExp, aSig0 );
4510
4511 }
4512
4513 #ifdef FLOATX80
4514
4515 /*
4516 -------------------------------------------------------------------------------
4517 Returns the result of converting the quadruple-precision floating-point
4518 value `a' to the extended double-precision floating-point format.  The
4519 conversion is performed according to the IEC/IEEE Standard for Binary
4520 Floating-Point Arithmetic.
4521 -------------------------------------------------------------------------------
4522 */
4523 floatx80 float128_to_floatx80( float128 a )
4524 {
4525     flag aSign;
4526     int32 aExp;
4527     bits64 aSig0, aSig1;
4528
4529     aSig1 = extractFloat128Frac1( a );
4530     aSig0 = extractFloat128Frac0( a );
4531     aExp = extractFloat128Exp( a );
4532     aSign = extractFloat128Sign( a );
4533     if ( aExp == 0x7FFF ) {
4534         if ( aSig0 | aSig1 ) {
4535             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4536         }
4537         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4538     }
4539     if ( aExp == 0 ) {
4540         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4541         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4542     }
4543     else {
4544         aSig0 |= LIT64( 0x0001000000000000 );
4545     }
4546     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4547     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4548
4549 }
4550
4551 #endif
4552
4553 /*
4554 -------------------------------------------------------------------------------
4555 Rounds the quadruple-precision floating-point value `a' to an integer, and
4556 returns the result as a quadruple-precision floating-point value.  The
4557 operation is performed according to the IEC/IEEE Standard for Binary
4558 Floating-Point Arithmetic.
4559 -------------------------------------------------------------------------------
4560 */
4561 float128 float128_round_to_int( float128 a )
4562 {
4563     flag aSign;
4564     int32 aExp;
4565     bits64 lastBitMask, roundBitsMask;
4566     int8 roundingMode;
4567     float128 z;
4568
4569     aExp = extractFloat128Exp( a );
4570     if ( 0x402F <= aExp ) {
4571         if ( 0x406F <= aExp ) {
4572             if (    ( aExp == 0x7FFF )
4573                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4574                ) {
4575                 return propagateFloat128NaN( a, a );
4576             }
4577             return a;
4578         }
4579         lastBitMask = 1;
4580         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4581         roundBitsMask = lastBitMask - 1;
4582         z = a;
4583         roundingMode = float_rounding_mode();
4584         if ( roundingMode == float_round_nearest_even ) {
4585             if ( lastBitMask ) {
4586                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4587                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4588             }
4589             else {
4590                 if ( (sbits64) z.low < 0 ) {
4591                     ++z.high;
4592                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4593                 }
4594             }
4595         }
4596         else if ( roundingMode != float_round_to_zero ) {
4597             if (   extractFloat128Sign( z )
4598                  ^ ( roundingMode == float_round_up ) ) {
4599                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4600             }
4601         }
4602         z.low &= ~ roundBitsMask;
4603     }
4604     else {
4605         if ( aExp < 0x3FFF ) {
4606             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4607             float_set_inexact();
4608             aSign = extractFloat128Sign( a );
4609             switch ( float_rounding_mode() ) {
4610              case float_round_nearest_even:
4611                 if (    ( aExp == 0x3FFE )
4612                      && (   extractFloat128Frac0( a )
4613                           | extractFloat128Frac1( a ) )
4614                    ) {
4615                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4616                 }
4617                 break;
4618              case float_round_down:
4619                 return
4620                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621                     : packFloat128( 0, 0, 0, 0 );
4622              case float_round_up:
4623                 return
4624                       aSign ? packFloat128( 1, 0, 0, 0 )
4625                     : packFloat128( 0, 0x3FFF, 0, 0 );
4626             }
4627             return packFloat128( aSign, 0, 0, 0 );
4628         }
4629         lastBitMask = 1;
4630         lastBitMask <<= 0x402F - aExp;
4631         roundBitsMask = lastBitMask - 1;
4632         z.low = 0;
4633         z.high = a.high;
4634         roundingMode = float_rounding_mode();
4635         if ( roundingMode == float_round_nearest_even ) {
4636             z.high += lastBitMask>>1;
4637             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638                 z.high &= ~ lastBitMask;
4639             }
4640         }
4641         else if ( roundingMode != float_round_to_zero ) {
4642             if (   extractFloat128Sign( z )
4643                  ^ ( roundingMode == float_round_up ) ) {
4644                 z.high |= ( a.low != 0 );
4645                 z.high += roundBitsMask;
4646             }
4647         }
4648         z.high &= ~ roundBitsMask;
4649     }
4650     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651         float_set_inexact();
4652     }
4653     return z;
4654
4655 }
4656
4657 /*
4658 -------------------------------------------------------------------------------
4659 Returns the result of adding the absolute values of the quadruple-precision
4660 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4661 before being returned.  `zSign' is ignored if the result is a NaN.
4662 The addition is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4665 */
4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4667 {
4668     int32 aExp, bExp, zExp;
4669     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4670     int32 expDiff;
4671
4672     aSig1 = extractFloat128Frac1( a );
4673     aSig0 = extractFloat128Frac0( a );
4674     aExp = extractFloat128Exp( a );
4675     bSig1 = extractFloat128Frac1( b );
4676     bSig0 = extractFloat128Frac0( b );
4677     bExp = extractFloat128Exp( b );
4678     expDiff = aExp - bExp;
4679     if ( 0 < expDiff ) {
4680         if ( aExp == 0x7FFF ) {
4681             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4682             return a;
4683         }
4684         if ( bExp == 0 ) {
4685             --expDiff;
4686         }
4687         else {
4688             bSig0 |= LIT64( 0x0001000000000000 );
4689         }
4690         shift128ExtraRightJamming(
4691             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4692         zExp = aExp;
4693     }
4694     else if ( expDiff < 0 ) {
4695         if ( bExp == 0x7FFF ) {
4696             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697             return packFloat128( zSign, 0x7FFF, 0, 0 );
4698         }
4699         if ( aExp == 0 ) {
4700             ++expDiff;
4701         }
4702         else {
4703             aSig0 |= LIT64( 0x0001000000000000 );
4704         }
4705         shift128ExtraRightJamming(
4706             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4707         zExp = bExp;
4708     }
4709     else {
4710         if ( aExp == 0x7FFF ) {
4711             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712                 return propagateFloat128NaN( a, b );
4713             }
4714             return a;
4715         }
4716         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4718         zSig2 = 0;
4719         zSig0 |= LIT64( 0x0002000000000000 );
4720         zExp = aExp;
4721         goto shiftRight1;
4722     }
4723     aSig0 |= LIT64( 0x0001000000000000 );
4724     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4725     --zExp;
4726     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4727     ++zExp;
4728  shiftRight1:
4729     shift128ExtraRightJamming(
4730         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4731  roundAndPack:
4732     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4733
4734 }
4735
4736 /*
4737 -------------------------------------------------------------------------------
4738 Returns the result of subtracting the absolute values of the quadruple-
4739 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4740 difference is negated before being returned.  `zSign' is ignored if the
4741 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4742 Standard for Binary Floating-Point Arithmetic.
4743 -------------------------------------------------------------------------------
4744 */
4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4746 {
4747     int32 aExp, bExp, zExp;
4748     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4749     int32 expDiff;
4750     float128 z;
4751
4752     aSig1 = extractFloat128Frac1( a );
4753     aSig0 = extractFloat128Frac0( a );
4754     aExp = extractFloat128Exp( a );
4755     bSig1 = extractFloat128Frac1( b );
4756     bSig0 = extractFloat128Frac0( b );
4757     bExp = extractFloat128Exp( b );
4758     expDiff = aExp - bExp;
4759     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761     if ( 0 < expDiff ) goto aExpBigger;
4762     if ( expDiff < 0 ) goto bExpBigger;
4763     if ( aExp == 0x7FFF ) {
4764         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765             return propagateFloat128NaN( a, b );
4766         }
4767         float_raise( float_flag_invalid );
4768         z.low = float128_default_nan_low;
4769         z.high = float128_default_nan_high;
4770         return z;
4771     }
4772     if ( aExp == 0 ) {
4773         aExp = 1;
4774         bExp = 1;
4775     }
4776     if ( bSig0 < aSig0 ) goto aBigger;
4777     if ( aSig0 < bSig0 ) goto bBigger;
4778     if ( bSig1 < aSig1 ) goto aBigger;
4779     if ( aSig1 < bSig1 ) goto bBigger;
4780     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4781  bExpBigger:
4782     if ( bExp == 0x7FFF ) {
4783         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4785     }
4786     if ( aExp == 0 ) {
4787         ++expDiff;
4788     }
4789     else {
4790         aSig0 |= LIT64( 0x4000000000000000 );
4791     }
4792     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793     bSig0 |= LIT64( 0x4000000000000000 );
4794  bBigger:
4795     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4796     zExp = bExp;
4797     zSign ^= 1;
4798     goto normalizeRoundAndPack;
4799  aExpBigger:
4800     if ( aExp == 0x7FFF ) {
4801         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4802         return a;
4803     }
4804     if ( bExp == 0 ) {
4805         --expDiff;
4806     }
4807     else {
4808         bSig0 |= LIT64( 0x4000000000000000 );
4809     }
4810     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811     aSig0 |= LIT64( 0x4000000000000000 );
4812  aBigger:
4813     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814     zExp = aExp;
4815  normalizeRoundAndPack:
4816     --zExp;
4817     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4818
4819 }
4820
4821 /*
4822 -------------------------------------------------------------------------------
4823 Returns the result of adding the quadruple-precision floating-point values
4824 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4825 for Binary Floating-Point Arithmetic.
4826 -------------------------------------------------------------------------------
4827 */
4828 float128 float128_add( float128 a, float128 b )
4829 {
4830     flag aSign, bSign;
4831
4832     aSign = extractFloat128Sign( a );
4833     bSign = extractFloat128Sign( b );
4834     if ( aSign == bSign ) {
4835         return addFloat128Sigs( a, b, aSign );
4836     }
4837     else {
4838         return subFloat128Sigs( a, b, aSign );
4839     }
4840
4841 }
4842
4843 /*
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the quadruple-precision floating-point
4846 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4847 Standard for Binary Floating-Point Arithmetic.
4848 -------------------------------------------------------------------------------
4849 */
4850 float128 float128_sub( float128 a, float128 b )
4851 {
4852     flag aSign, bSign;
4853
4854     aSign = extractFloat128Sign( a );
4855     bSign = extractFloat128Sign( b );
4856     if ( aSign == bSign ) {
4857         return subFloat128Sigs( a, b, aSign );
4858     }
4859     else {
4860         return addFloat128Sigs( a, b, aSign );
4861     }
4862
4863 }
4864
4865 /*
4866 -------------------------------------------------------------------------------
4867 Returns the result of multiplying the quadruple-precision floating-point
4868 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4869 Standard for Binary Floating-Point Arithmetic.
4870 -------------------------------------------------------------------------------
4871 */
4872 float128 float128_mul( float128 a, float128 b )
4873 {
4874     flag aSign, bSign, zSign;
4875     int32 aExp, bExp, zExp;
4876     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4877     float128 z;
4878
4879     aSig1 = extractFloat128Frac1( a );
4880     aSig0 = extractFloat128Frac0( a );
4881     aExp = extractFloat128Exp( a );
4882     aSign = extractFloat128Sign( a );
4883     bSig1 = extractFloat128Frac1( b );
4884     bSig0 = extractFloat128Frac0( b );
4885     bExp = extractFloat128Exp( b );
4886     bSign = extractFloat128Sign( b );
4887     zSign = aSign ^ bSign;
4888     if ( aExp == 0x7FFF ) {
4889         if (    ( aSig0 | aSig1 )
4890              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891             return propagateFloat128NaN( a, b );
4892         }
4893         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894         return packFloat128( zSign, 0x7FFF, 0, 0 );
4895     }
4896     if ( bExp == 0x7FFF ) {
4897         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4899  invalid:
4900             float_raise( float_flag_invalid );
4901             z.low = float128_default_nan_low;
4902             z.high = float128_default_nan_high;
4903             return z;
4904         }
4905         return packFloat128( zSign, 0x7FFF, 0, 0 );
4906     }
4907     if ( aExp == 0 ) {
4908         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4910     }
4911     if ( bExp == 0 ) {
4912         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4914     }
4915     zExp = aExp + bExp - 0x4000;
4916     aSig0 |= LIT64( 0x0001000000000000 );
4917     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920     zSig2 |= ( zSig3 != 0 );
4921     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922         shift128ExtraRightJamming(
4923             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4924         ++zExp;
4925     }
4926     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4927
4928 }
4929
4930 /*
4931 -------------------------------------------------------------------------------
4932 Returns the result of dividing the quadruple-precision floating-point value
4933 `a' by the corresponding value `b'.  The operation is performed according to
4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4936 */
4937 float128 float128_div( float128 a, float128 b )
4938 {
4939     flag aSign, bSign, zSign;
4940     int32 aExp, bExp, zExp;
4941     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4943     float128 z;
4944
4945     aSig1 = extractFloat128Frac1( a );
4946     aSig0 = extractFloat128Frac0( a );
4947     aExp = extractFloat128Exp( a );
4948     aSign = extractFloat128Sign( a );
4949     bSig1 = extractFloat128Frac1( b );
4950     bSig0 = extractFloat128Frac0( b );
4951     bExp = extractFloat128Exp( b );
4952     bSign = extractFloat128Sign( b );
4953     zSign = aSign ^ bSign;
4954     if ( aExp == 0x7FFF ) {
4955         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956         if ( bExp == 0x7FFF ) {
4957             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4958             goto invalid;
4959         }
4960         return packFloat128( zSign, 0x7FFF, 0, 0 );
4961     }
4962     if ( bExp == 0x7FFF ) {
4963         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964         return packFloat128( zSign, 0, 0, 0 );
4965     }
4966     if ( bExp == 0 ) {
4967         if ( ( bSig0 | bSig1 ) == 0 ) {
4968             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969  invalid:
4970                 float_raise( float_flag_invalid );
4971                 z.low = float128_default_nan_low;
4972                 z.high = float128_default_nan_high;
4973                 return z;
4974             }
4975             float_raise( float_flag_divbyzero );
4976             return packFloat128( zSign, 0x7FFF, 0, 0 );
4977         }
4978         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4979     }
4980     if ( aExp == 0 ) {
4981         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4983     }
4984     zExp = aExp - bExp + 0x3FFD;
4985     shortShift128Left(
4986         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4987     shortShift128Left(
4988         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4991         ++zExp;
4992     }
4993     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996     while ( (sbits64) rem0 < 0 ) {
4997         --zSig0;
4998         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4999     }
5000     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004         while ( (sbits64) rem1 < 0 ) {
5005             --zSig1;
5006             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5007         }
5008         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5009     }
5010     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012
5013 }
5014
5015 /*
5016 -------------------------------------------------------------------------------
5017 Returns the remainder of the quadruple-precision floating-point value `a'
5018 with respect to the corresponding value `b'.  The operation is performed
5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5021 */
5022 float128 float128_rem( float128 a, float128 b )
5023 {
5024     flag aSign, bSign, zSign;
5025     int32 aExp, bExp, expDiff;
5026     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5028     sbits64 sigMean0;
5029     float128 z;
5030
5031     aSig1 = extractFloat128Frac1( a );
5032     aSig0 = extractFloat128Frac0( a );
5033     aExp = extractFloat128Exp( a );
5034     aSign = extractFloat128Sign( a );
5035     bSig1 = extractFloat128Frac1( b );
5036     bSig0 = extractFloat128Frac0( b );
5037     bExp = extractFloat128Exp( b );
5038     bSign = extractFloat128Sign( b );
5039     if ( aExp == 0x7FFF ) {
5040         if (    ( aSig0 | aSig1 )
5041              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042             return propagateFloat128NaN( a, b );
5043         }
5044         goto invalid;
5045     }
5046     if ( bExp == 0x7FFF ) {
5047         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048         return a;
5049     }
5050     if ( bExp == 0 ) {
5051         if ( ( bSig0 | bSig1 ) == 0 ) {
5052  invalid:
5053             float_raise( float_flag_invalid );
5054             z.low = float128_default_nan_low;
5055             z.high = float128_default_nan_high;
5056             return z;
5057         }
5058         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5059     }
5060     if ( aExp == 0 ) {
5061         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5063     }
5064     expDiff = aExp - bExp;
5065     if ( expDiff < -1 ) return a;
5066     shortShift128Left(
5067         aSig0 | LIT64( 0x0001000000000000 ),
5068         aSig1,
5069         15 - ( expDiff < 0 ),
5070         &aSig0,
5071         &aSig1
5072     );
5073     shortShift128Left(
5074         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075     q = le128( bSig0, bSig1, aSig0, aSig1 );
5076     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5077     expDiff -= 64;
5078     while ( 0 < expDiff ) {
5079         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080         q = ( 4 < q ) ? q - 4 : 0;
5081         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5085         expDiff -= 61;
5086     }
5087     if ( -64 < expDiff ) {
5088         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089         q = ( 4 < q ) ? q - 4 : 0;
5090         q >>= - expDiff;
5091         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5092         expDiff += 52;
5093         if ( expDiff < 0 ) {
5094             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5095         }
5096         else {
5097             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5098         }
5099         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5101     }
5102     else {
5103         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5105     }
5106     do {
5107         alternateASig0 = aSig0;
5108         alternateASig1 = aSig1;
5109         ++q;
5110         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111     } while ( 0 <= (sbits64) aSig0 );
5112     add128(
5113         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114     if (    ( sigMean0 < 0 )
5115          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116         aSig0 = alternateASig0;
5117         aSig1 = alternateASig1;
5118     }
5119     zSign = ( (sbits64) aSig0 < 0 );
5120     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5121     return
5122         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5123
5124 }
5125
5126 /*
5127 -------------------------------------------------------------------------------
5128 Returns the square root of the quadruple-precision floating-point value `a'.
5129 The operation is performed according to the IEC/IEEE Standard for Binary
5130 Floating-Point Arithmetic.
5131 -------------------------------------------------------------------------------
5132 */
5133 float128 float128_sqrt( float128 a )
5134 {
5135     flag aSign;
5136     int32 aExp, zExp;
5137     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5139     float128 z;
5140
5141     aSig1 = extractFloat128Frac1( a );
5142     aSig0 = extractFloat128Frac0( a );
5143     aExp = extractFloat128Exp( a );
5144     aSign = extractFloat128Sign( a );
5145     if ( aExp == 0x7FFF ) {
5146         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147         if ( ! aSign ) return a;
5148         goto invalid;
5149     }
5150     if ( aSign ) {
5151         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5152  invalid:
5153         float_raise( float_flag_invalid );
5154         z.low = float128_default_nan_low;
5155         z.high = float128_default_nan_high;
5156         return z;
5157     }
5158     if ( aExp == 0 ) {
5159         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5161     }
5162     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163     aSig0 |= LIT64( 0x0001000000000000 );
5164     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167     doubleZSig0 = zSig0<<1;
5168     mul64To128( zSig0, zSig0, &term0, &term1 );
5169     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170     while ( (sbits64) rem0 < 0 ) {
5171         --zSig0;
5172         doubleZSig0 -= 2;
5173         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5174     }
5175     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177         if ( zSig1 == 0 ) zSig1 = 1;
5178         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180         mul64To128( zSig1, zSig1, &term2, &term3 );
5181         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182         while ( (sbits64) rem1 < 0 ) {
5183             --zSig1;
5184             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5185             term3 |= 1;
5186             term2 |= doubleZSig0;
5187             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5188         }
5189         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5190     }
5191     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5193
5194 }
5195
5196 /*
5197 -------------------------------------------------------------------------------
5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 the corresponding value `b', and 0 otherwise.  The comparison is performed
5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5202 */
5203 flag float128_eq( float128 a, float128 b )
5204 {
5205
5206     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5207               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5209               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5210        ) {
5211         if (    float128_is_signaling_nan( a )
5212              || float128_is_signaling_nan( b ) ) {
5213             float_raise( float_flag_invalid );
5214         }
5215         return 0;
5216     }
5217     return
5218            ( a.low == b.low )
5219         && (    ( a.high == b.high )
5220              || (    ( a.low == 0 )
5221                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5222            );
5223
5224 }
5225
5226 /*
5227 -------------------------------------------------------------------------------
5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5231 Arithmetic.
5232 -------------------------------------------------------------------------------
5233 */
5234 flag float128_le( float128 a, float128 b )
5235 {
5236     flag aSign, bSign;
5237
5238     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5239               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5241               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5242        ) {
5243         float_raise( float_flag_invalid );
5244         return 0;
5245     }
5246     aSign = extractFloat128Sign( a );
5247     bSign = extractFloat128Sign( b );
5248     if ( aSign != bSign ) {
5249         return
5250                aSign
5251             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5252                  == 0 );
5253     }
5254     return
5255           aSign ? le128( b.high, b.low, a.high, a.low )
5256         : le128( a.high, a.low, b.high, b.low );
5257
5258 }
5259
5260 /*
5261 -------------------------------------------------------------------------------
5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 the corresponding value `b', and 0 otherwise.  The comparison is performed
5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 -------------------------------------------------------------------------------
5266 */
5267 flag float128_lt( float128 a, float128 b )
5268 {
5269     flag aSign, bSign;
5270
5271     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5272               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5274               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5275        ) {
5276         float_raise( float_flag_invalid );
5277         return 0;
5278     }
5279     aSign = extractFloat128Sign( a );
5280     bSign = extractFloat128Sign( b );
5281     if ( aSign != bSign ) {
5282         return
5283                aSign
5284             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5285                  != 0 );
5286     }
5287     return
5288           aSign ? lt128( b.high, b.low, a.high, a.low )
5289         : lt128( a.high, a.low, b.high, b.low );
5290
5291 }
5292
5293 /*
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise.  The invalid exception is
5297 raised if either operand is a NaN.  Otherwise, the comparison is performed
5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 -------------------------------------------------------------------------------
5300 */
5301 flag float128_eq_signaling( float128 a, float128 b )
5302 {
5303
5304     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5305               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5307               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5308        ) {
5309         float_raise( float_flag_invalid );
5310         return 0;
5311     }
5312     return
5313            ( a.low == b.low )
5314         && (    ( a.high == b.high )
5315              || (    ( a.low == 0 )
5316                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317            );
5318
5319 }
5320
5321 /*
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5325 cause an exception.  Otherwise, the comparison is performed according to the
5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 -------------------------------------------------------------------------------
5328 */
5329 flag float128_le_quiet( float128 a, float128 b )
5330 {
5331     flag aSign, bSign;
5332
5333     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5334               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5336               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337        ) {
5338         if (    float128_is_signaling_nan( a )
5339              || float128_is_signaling_nan( b ) ) {
5340             float_raise( float_flag_invalid );
5341         }
5342         return 0;
5343     }
5344     aSign = extractFloat128Sign( a );
5345     bSign = extractFloat128Sign( b );
5346     if ( aSign != bSign ) {
5347         return
5348                aSign
5349             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350                  == 0 );
5351     }
5352     return
5353           aSign ? le128( b.high, b.low, a.high, a.low )
5354         : le128( a.high, a.low, b.high, b.low );
5355
5356 }
5357
5358 /*
5359 -------------------------------------------------------------------------------
5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5362 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5363 Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5365 */
5366 flag float128_lt_quiet( float128 a, float128 b )
5367 {
5368     flag aSign, bSign;
5369
5370     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5371               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5373               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5374        ) {
5375         if (    float128_is_signaling_nan( a )
5376              || float128_is_signaling_nan( b ) ) {
5377             float_raise( float_flag_invalid );
5378         }
5379         return 0;
5380     }
5381     aSign = extractFloat128Sign( a );
5382     bSign = extractFloat128Sign( b );
5383     if ( aSign != bSign ) {
5384         return
5385                aSign
5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387                  != 0 );
5388     }
5389     return
5390           aSign ? lt128( b.high, b.low, a.high, a.low )
5391         : lt128( a.high, a.low, b.high, b.low );
5392
5393 }
5394
5395 #endif
5396
5397
5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5399
5400 /*
5401  * These two routines are not part of the original softfloat distribution.
5402  *
5403  * They are based on the corresponding conversions to integer but return
5404  * unsigned numbers instead since these functions are required by GCC.
5405  *
5406  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5407  *
5408  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5409  */
5410
5411 /*
5412 -------------------------------------------------------------------------------
5413 Returns the result of converting the double-precision floating-point value
5414 `a' to the 32-bit unsigned integer format.  The conversion is
5415 performed according to the IEC/IEEE Standard for Binary Floating-point
5416 Arithmetic, except that the conversion is always rounded toward zero.  If
5417 `a' is a NaN, the largest positive integer is returned.  If the conversion
5418 overflows, the largest integer positive is returned.
5419 -------------------------------------------------------------------------------
5420 */
5421 uint32 float64_to_uint32_round_to_zero( float64 a )
5422 {
5423     flag aSign;
5424     int16 aExp, shiftCount;
5425     bits64 aSig, savedASig;
5426     uint32 z;
5427
5428     aSig = extractFloat64Frac( a );
5429     aExp = extractFloat64Exp( a );
5430     aSign = extractFloat64Sign( a );
5431
5432     if (aSign) {
5433         float_raise( float_flag_invalid );
5434         return(0);
5435     }
5436
5437     if ( 0x41E < aExp ) {
5438         float_raise( float_flag_invalid );
5439         return 0xffffffff;
5440     }
5441     else if ( aExp < 0x3FF ) {
5442         if ( aExp || aSig ) float_set_inexact();
5443         return 0;
5444     }
5445     aSig |= LIT64( 0x0010000000000000 );
5446     shiftCount = 0x433 - aExp;
5447     savedASig = aSig;
5448     aSig >>= shiftCount;
5449     z = aSig;
5450     if ( ( aSig<<shiftCount ) != savedASig ) {
5451         float_set_inexact();
5452     }
5453     return z;
5454
5455 }
5456
5457 /*
5458 -------------------------------------------------------------------------------
5459 Returns the result of converting the single-precision floating-point value
5460 `a' to the 32-bit unsigned integer format.  The conversion is
5461 performed according to the IEC/IEEE Standard for Binary Floating-point
5462 Arithmetic, except that the conversion is always rounded toward zero.  If
5463 `a' is a NaN, the largest positive integer is returned.  If the conversion
5464 overflows, the largest positive integer is returned.
5465 -------------------------------------------------------------------------------
5466 */
5467 uint32 float32_to_uint32_round_to_zero( float32 a )
5468 {
5469     flag aSign;
5470     int16 aExp, shiftCount;
5471     bits32 aSig;
5472     uint32 z;
5473
5474     aSig = extractFloat32Frac( a );
5475     aExp = extractFloat32Exp( a );
5476     aSign = extractFloat32Sign( a );
5477     shiftCount = aExp - 0x9E;
5478
5479     if (aSign) {
5480         float_raise( float_flag_invalid );
5481         return(0);
5482     }
5483     if ( 0 < shiftCount ) {
5484         float_raise( float_flag_invalid );
5485         return 0xFFFFFFFF;
5486     }
5487     else if ( aExp <= 0x7E ) {
5488         if ( aExp | aSig ) float_set_inexact();
5489         return 0;
5490     }
5491     aSig = ( aSig | 0x800000 )<<8;
5492     z = aSig>>( - shiftCount );
5493     if ( aSig<<( shiftCount & 31 ) ) {
5494         float_set_inexact();
5495     }
5496     return z;
5497
5498 }
5499
5500 #endif
5501
5502 #endif /* _STANDALONE */