lib/libc/softfloat/bits64/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 #include <sys/cdefs.h>
  48 #if defined(LIBC_SCCS) && !defined(lint)
  49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $");
  50 #endif /* LIBC_SCCS and not lint */
  51
  52 #ifdef SOFTFLOAT_FOR_GCC
  53 #include "softfloat-for-gcc.h"
  54 #endif
  55
  56 #include "milieu.h"
  57 #include "softfloat.h"
  58
  59 /*
  60  * Conversions between floats as stored in memory and floats as
  61  * SoftFloat uses them
  62  */
  63 #ifndef FLOAT64_DEMANGLE
  64 #define FLOAT64_DEMANGLE(a)     (a)
  65 #endif
  66 #ifndef FLOAT64_MANGLE
  67 #define FLOAT64_MANGLE(a)       (a)
  68 #endif
  69
  70 /*
  71 -------------------------------------------------------------------------------
  72 Floating-point rounding mode, extended double-precision rounding precision,
  73 and exception flags.
  74 -------------------------------------------------------------------------------
  75 */
  76 #ifndef set_float_rounding_mode
  77 fp_rnd float_rounding_mode = float_round_nearest_even;
  78 fp_except float_exception_flags = 0;
  79 #endif
  80 #ifndef set_float_exception_inexact_flag
  81 #define set_float_exception_inexact_flag() \
  82         ((void)(float_exception_flags |= float_flag_inexact))
  83 #endif
  84 #ifdef FLOATX80
  85 int8 floatx80_rounding_precision = 80;
  86 #endif
  87
  88 /*
  89 -------------------------------------------------------------------------------
  90 Primitive arithmetic functions, including multi-word arithmetic, and
  91 division and square root approximations.  (Can be specialized to target if
  92 desired.)
  93 -------------------------------------------------------------------------------
  94 */
  95 #include "softfloat-macros"
  96
  97 /*
  98 -------------------------------------------------------------------------------
  99 Functions and definitions to determine:  (1) whether tininess for underflow
 100 is detected before or after rounding by default, (2) what (if anything)
 101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 are propagated from function inputs to output.  These details are target-
 104 specific.
 105 -------------------------------------------------------------------------------
 106 */
 107 #include "softfloat-specialize"
 108
 109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
 110 /*
 111 -------------------------------------------------------------------------------
 112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 113 and 7, and returns the properly rounded 32-bit integer corresponding to the
 114 input.  If `zSign' is 1, the input is negated before being converted to an
 115 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 116 is simply rounded to an integer, with the inexact exception raised if the
 117 input cannot be represented exactly as an integer.  However, if the fixed-
 118 point input is too large, the invalid exception is raised and the largest
 119 positive or negative integer is returned.
 120 -------------------------------------------------------------------------------
 121 */
 122 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 123 {
 124     int8 roundingMode;
 125     flag roundNearestEven;
 126     int8 roundIncrement, roundBits;
 127     int32 z;
 128
 129     roundingMode = float_rounding_mode;
 130     roundNearestEven = ( roundingMode == float_round_nearest_even );
 131     roundIncrement = 0x40;
 132     if ( ! roundNearestEven ) {
 133         if ( roundingMode == float_round_to_zero ) {
 134             roundIncrement = 0;
 135         }
 136         else {
 137             roundIncrement = 0x7F;
 138             if ( zSign ) {
 139                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 140             }
 141             else {
 142                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 143             }
 144         }
 145     }
 146     roundBits = (int8)(absZ & 0x7F);
 147     absZ = ( absZ + roundIncrement )>>7;
 148     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 149     z = (int32)absZ;
 150     if ( zSign ) z = - z;
 151     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 152         float_raise( float_flag_invalid );
 153         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 154     }
 155     if ( roundBits ) set_float_exception_inexact_flag();
 156     return z;
 157
 158 }
 159
 160 /*
 161 -------------------------------------------------------------------------------
 162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 163 `absZ1', with binary point between bits 63 and 64 (between the input words),
 164 and returns the properly rounded 64-bit integer corresponding to the input.
 165 If `zSign' is 1, the input is negated before being converted to an integer.
 166 Ordinarily, the fixed-point input is simply rounded to an integer, with
 167 the inexact exception raised if the input cannot be represented exactly as
 168 an integer.  However, if the fixed-point input is too large, the invalid
 169 exception is raised and the largest positive or negative integer is
 170 returned.
 171 -------------------------------------------------------------------------------
 172 */
 173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 174 {
 175     int8 roundingMode;
 176     flag roundNearestEven, increment;
 177     int64 z;
 178
 179     roundingMode = float_rounding_mode;
 180     roundNearestEven = ( roundingMode == float_round_nearest_even );
 181     increment = ( (sbits64) absZ1 < 0 );
 182     if ( ! roundNearestEven ) {
 183         if ( roundingMode == float_round_to_zero ) {
 184             increment = 0;
 185         }
 186         else {
 187             if ( zSign ) {
 188                 increment = ( roundingMode == float_round_down ) && absZ1;
 189             }
 190             else {
 191                 increment = ( roundingMode == float_round_up ) && absZ1;
 192             }
 193         }
 194     }
 195     if ( increment ) {
 196         ++absZ0;
 197         if ( absZ0 == 0 ) goto overflow;
 198         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 199     }
 200     z = absZ0;
 201     if ( zSign ) z = - z;
 202     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 203  overflow:
 204         float_raise( float_flag_invalid );
 205         return
 206               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 207             : LIT64( 0x7FFFFFFFFFFFFFFF );
 208     }
 209     if ( absZ1 ) set_float_exception_inexact_flag();
 210     return z;
 211
 212 }
 213 #endif
 214
 215 /*
 216 -------------------------------------------------------------------------------
 217 Returns the fraction bits of the single-precision floating-point value `a'.
 218 -------------------------------------------------------------------------------
 219 */
 220 INLINE bits32 extractFloat32Frac( float32 a )
 221 {
 222
 223     return a & 0x007FFFFF;
 224
 225 }
 226
 227 /*
 228 -------------------------------------------------------------------------------
 229 Returns the exponent bits of the single-precision floating-point value `a'.
 230 -------------------------------------------------------------------------------
 231 */
 232 INLINE int16 extractFloat32Exp( float32 a )
 233 {
 234
 235     return ( a>>23 ) & 0xFF;
 236
 237 }
 238
 239 /*
 240 -------------------------------------------------------------------------------
 241 Returns the sign bit of the single-precision floating-point value `a'.
 242 -------------------------------------------------------------------------------
 243 */
 244 INLINE flag extractFloat32Sign( float32 a )
 245 {
 246
 247     return a>>31;
 248
 249 }
 250
 251 /*
 252 -------------------------------------------------------------------------------
 253 Normalizes the subnormal single-precision floating-point value represented
 254 by the denormalized significand `aSig'.  The normalized exponent and
 255 significand are stored at the locations pointed to by `zExpPtr' and
 256 `zSigPtr', respectively.
 257 -------------------------------------------------------------------------------
 258 */
 259 static void
 260  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 261 {
 262     int8 shiftCount;
 263
 264     shiftCount = countLeadingZeros32( aSig ) - 8;
 265     *zSigPtr = aSig<<shiftCount;
 266     *zExpPtr = 1 - shiftCount;
 267
 268 }
 269
 270 /*
 271 -------------------------------------------------------------------------------
 272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 273 single-precision floating-point value, returning the result.  After being
 274 shifted into the proper positions, the three fields are simply added
 275 together to form the result.  This means that any integer portion of `zSig'
 276 will be added into the exponent.  Since a properly normalized significand
 277 will have an integer portion equal to 1, the `zExp' input should be 1 less
 278 than the desired result exponent whenever `zSig' is a complete, normalized
 279 significand.
 280 -------------------------------------------------------------------------------
 281 */
 282 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 283 {
 284
 285     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 286
 287 }
 288
 289 /*
 290 -------------------------------------------------------------------------------
 291 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 292 and significand `zSig', and returns the proper single-precision floating-
 293 point value corresponding to the abstract input.  Ordinarily, the abstract
 294 value is simply rounded and packed into the single-precision format, with
 295 the inexact exception raised if the abstract input cannot be represented
 296 exactly.  However, if the abstract value is too large, the overflow and
 297 inexact exceptions are raised and an infinity or maximal finite value is
 298 returned.  If the abstract value is too small, the input value is rounded to
 299 a subnormal number, and the underflow and inexact exceptions are raised if
 300 the abstract input cannot be represented exactly as a subnormal single-
 301 precision floating-point number.
 302     The input significand `zSig' has its binary point between bits 30
 303 and 29, which is 7 bits to the left of the usual location.  This shifted
 304 significand must be normalized or smaller.  If `zSig' is not normalized,
 305 `zExp' must be 0; in that case, the result returned is a subnormal number,
 306 and it must not require rounding.  In the usual case that `zSig' is
 307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 308 The handling of underflow and overflow follows the IEC/IEEE Standard for
 309 Binary Floating-Point Arithmetic.
 310 -------------------------------------------------------------------------------
 311 */
 312 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 313 {
 314     int8 roundingMode;
 315     flag roundNearestEven;
 316     int8 roundIncrement, roundBits;
 317     flag isTiny;
 318
 319     roundingMode = float_rounding_mode;
 320     roundNearestEven = ( roundingMode == float_round_nearest_even );
 321     roundIncrement = 0x40;
 322     if ( ! roundNearestEven ) {
 323         if ( roundingMode == float_round_to_zero ) {
 324             roundIncrement = 0;
 325         }
 326         else {
 327             roundIncrement = 0x7F;
 328             if ( zSign ) {
 329                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 330             }
 331             else {
 332                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 333             }
 334         }
 335     }
 336     roundBits = zSig & 0x7F;
 337     if ( 0xFD <= (bits16) zExp ) {
 338         if (    ( 0xFD < zExp )
 339              || (    ( zExp == 0xFD )
 340                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 341            ) {
 342             float_raise( float_flag_overflow | float_flag_inexact );
 343             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 344         }
 345         if ( zExp < 0 ) {
 346             isTiny =
 347                    ( float_detect_tininess == float_tininess_before_rounding )
 348                 || ( zExp < -1 )
 349                 || ( zSig + roundIncrement < 0x80000000U );
 350             shift32RightJamming( zSig, - zExp, &zSig );
 351             zExp = 0;
 352             roundBits = zSig & 0x7F;
 353             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 354         }
 355     }
 356     if ( roundBits ) set_float_exception_inexact_flag();
 357     zSig = ( zSig + roundIncrement )>>7;
 358     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 359     if ( zSig == 0 ) zExp = 0;
 360     return packFloat32( zSign, zExp, zSig );
 361
 362 }
 363
 364 /*
 365 -------------------------------------------------------------------------------
 366 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 367 and significand `zSig', and returns the proper single-precision floating-
 368 point value corresponding to the abstract input.  This routine is just like
 369 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 371 floating-point exponent.
 372 -------------------------------------------------------------------------------
 373 */
 374 static float32
 375  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 376 {
 377     int8 shiftCount;
 378
 379     shiftCount = countLeadingZeros32( zSig ) - 1;
 380     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 381
 382 }
 383
 384 /*
 385 -------------------------------------------------------------------------------
 386 Returns the fraction bits of the double-precision floating-point value `a'.
 387 -------------------------------------------------------------------------------
 388 */
 389 INLINE bits64 extractFloat64Frac( float64 a )
 390 {
 391
 392     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 393
 394 }
 395
 396 /*
 397 -------------------------------------------------------------------------------
 398 Returns the exponent bits of the double-precision floating-point value `a'.
 399 -------------------------------------------------------------------------------
 400 */
 401 INLINE int16 extractFloat64Exp( float64 a )
 402 {
 403
 404     return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
 405
 406 }
 407
 408 /*
 409 -------------------------------------------------------------------------------
 410 Returns the sign bit of the double-precision floating-point value `a'.
 411 -------------------------------------------------------------------------------
 412 */
 413 INLINE flag extractFloat64Sign( float64 a )
 414 {
 415
 416     return (flag)(FLOAT64_DEMANGLE(a) >> 63);
 417
 418 }
 419
 420 /*
 421 -------------------------------------------------------------------------------
 422 Normalizes the subnormal double-precision floating-point value represented
 423 by the denormalized significand `aSig'.  The normalized exponent and
 424 significand are stored at the locations pointed to by `zExpPtr' and
 425 `zSigPtr', respectively.
 426 -------------------------------------------------------------------------------
 427 */
 428 static void
 429  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 430 {
 431     int8 shiftCount;
 432
 433     shiftCount = countLeadingZeros64( aSig ) - 11;
 434     *zSigPtr = aSig<<shiftCount;
 435     *zExpPtr = 1 - shiftCount;
 436
 437 }
 438
 439 /*
 440 -------------------------------------------------------------------------------
 441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 442 double-precision floating-point value, returning the result.  After being
 443 shifted into the proper positions, the three fields are simply added
 444 together to form the result.  This means that any integer portion of `zSig'
 445 will be added into the exponent.  Since a properly normalized significand
 446 will have an integer portion equal to 1, the `zExp' input should be 1 less
 447 than the desired result exponent whenever `zSig' is a complete, normalized
 448 significand.
 449 -------------------------------------------------------------------------------
 450 */
 451 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 452 {
 453
 454     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 455                            ( ( (bits64) zExp )<<52 ) + zSig );
 456
 457 }
 458
 459 /*
 460 -------------------------------------------------------------------------------
 461 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 462 and significand `zSig', and returns the proper double-precision floating-
 463 point value corresponding to the abstract input.  Ordinarily, the abstract
 464 value is simply rounded and packed into the double-precision format, with
 465 the inexact exception raised if the abstract input cannot be represented
 466 exactly.  However, if the abstract value is too large, the overflow and
 467 inexact exceptions are raised and an infinity or maximal finite value is
 468 returned.  If the abstract value is too small, the input value is rounded to
 469 a subnormal number, and the underflow and inexact exceptions are raised if
 470 the abstract input cannot be represented exactly as a subnormal double-
 471 precision floating-point number.
 472     The input significand `zSig' has its binary point between bits 62
 473 and 61, which is 10 bits to the left of the usual location.  This shifted
 474 significand must be normalized or smaller.  If `zSig' is not normalized,
 475 `zExp' must be 0; in that case, the result returned is a subnormal number,
 476 and it must not require rounding.  In the usual case that `zSig' is
 477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 478 The handling of underflow and overflow follows the IEC/IEEE Standard for
 479 Binary Floating-Point Arithmetic.
 480 -------------------------------------------------------------------------------
 481 */
 482 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 483 {
 484     int8 roundingMode;
 485     flag roundNearestEven;
 486     int16 roundIncrement, roundBits;
 487     flag isTiny;
 488
 489     roundingMode = float_rounding_mode;
 490     roundNearestEven = ( roundingMode == float_round_nearest_even );
 491     roundIncrement = 0x200;
 492     if ( ! roundNearestEven ) {
 493         if ( roundingMode == float_round_to_zero ) {
 494             roundIncrement = 0;
 495         }
 496         else {
 497             roundIncrement = 0x3FF;
 498             if ( zSign ) {
 499                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 500             }
 501             else {
 502                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 503             }
 504         }
 505     }
 506     roundBits = (int16)(zSig & 0x3FF);
 507     if ( 0x7FD <= (bits16) zExp ) {
 508         if (    ( 0x7FD < zExp )
 509              || (    ( zExp == 0x7FD )
 510                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 511            ) {
 512             float_raise( float_flag_overflow | float_flag_inexact );
 513             return FLOAT64_MANGLE(
 514                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 515                 ( roundIncrement == 0 ));
 516         }
 517         if ( zExp < 0 ) {
 518             isTiny =
 519                    ( float_detect_tininess == float_tininess_before_rounding )
 520                 || ( zExp < -1 )
 521                 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
 522             shift64RightJamming( zSig, - zExp, &zSig );
 523             zExp = 0;
 524             roundBits = (int16)(zSig & 0x3FF);
 525             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 526         }
 527     }
 528     if ( roundBits ) set_float_exception_inexact_flag();
 529     zSig = ( zSig + roundIncrement )>>10;
 530     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 531     if ( zSig == 0 ) zExp = 0;
 532     return packFloat64( zSign, zExp, zSig );
 533
 534 }
 535
 536 /*
 537 -------------------------------------------------------------------------------
 538 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 539 and significand `zSig', and returns the proper double-precision floating-
 540 point value corresponding to the abstract input.  This routine is just like
 541 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 543 floating-point exponent.
 544 -------------------------------------------------------------------------------
 545 */
 546 static float64
 547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( zSig ) - 1;
 552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 553
 554 }
 555
 556 #ifdef FLOATX80
 557
 558 /*
 559 -------------------------------------------------------------------------------
 560 Returns the fraction bits of the extended double-precision floating-point
 561 value `a'.
 562 -------------------------------------------------------------------------------
 563 */
 564 INLINE bits64 extractFloatx80Frac( floatx80 a )
 565 {
 566
 567     return a.low;
 568
 569 }
 570
 571 /*
 572 -------------------------------------------------------------------------------
 573 Returns the exponent bits of the extended double-precision floating-point
 574 value `a'.
 575 -------------------------------------------------------------------------------
 576 */
 577 INLINE int32 extractFloatx80Exp( floatx80 a )
 578 {
 579
 580     return a.high & 0x7FFF;
 581
 582 }
 583
 584 /*
 585 -------------------------------------------------------------------------------
 586 Returns the sign bit of the extended double-precision floating-point value
 587 `a'.
 588 -------------------------------------------------------------------------------
 589 */
 590 INLINE flag extractFloatx80Sign( floatx80 a )
 591 {
 592
 593     return a.high>>15;
 594
 595 }
 596
 597 /*
 598 -------------------------------------------------------------------------------
 599 Normalizes the subnormal extended double-precision floating-point value
 600 represented by the denormalized significand `aSig'.  The normalized exponent
 601 and significand are stored at the locations pointed to by `zExpPtr' and
 602 `zSigPtr', respectively.
 603 -------------------------------------------------------------------------------
 604 */
 605 static void
 606  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 607 {
 608     int8 shiftCount;
 609
 610     shiftCount = countLeadingZeros64( aSig );
 611     *zSigPtr = aSig<<shiftCount;
 612     *zExpPtr = 1 - shiftCount;
 613
 614 }
 615
 616 /*
 617 -------------------------------------------------------------------------------
 618 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 619 extended double-precision floating-point value, returning the result.
 620 -------------------------------------------------------------------------------
 621 */
 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 623 {
 624     floatx80 z;
 625
 626     z.low = zSig;
 627     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 628     return z;
 629
 630 }
 631
 632 /*
 633 -------------------------------------------------------------------------------
 634 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 635 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 636 and returns the proper extended double-precision floating-point value
 637 corresponding to the abstract input.  Ordinarily, the abstract value is
 638 rounded and packed into the extended double-precision format, with the
 639 inexact exception raised if the abstract input cannot be represented
 640 exactly.  However, if the abstract value is too large, the overflow and
 641 inexact exceptions are raised and an infinity or maximal finite value is
 642 returned.  If the abstract value is too small, the input value is rounded to
 643 a subnormal number, and the underflow and inexact exceptions are raised if
 644 the abstract input cannot be represented exactly as a subnormal extended
 645 double-precision floating-point number.
 646     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 647 number of bits as single or double precision, respectively.  Otherwise, the
 648 result is rounded to the full precision of the extended double-precision
 649 format.
 650     The input significand must be normalized or smaller.  If the input
 651 significand is not normalized, `zExp' must be 0; in that case, the result
 652 returned is a subnormal number, and it must not require rounding.  The
 653 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 654 Floating-Point Arithmetic.
 655 -------------------------------------------------------------------------------
 656 */
 657 static floatx80
 658  roundAndPackFloatx80(
 659      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 660  )
 661 {
 662     int8 roundingMode;
 663     flag roundNearestEven, increment, isTiny;
 664     int64 roundIncrement, roundMask, roundBits;
 665
 666     roundingMode = float_rounding_mode;
 667     roundNearestEven = ( roundingMode == float_round_nearest_even );
 668     if ( roundingPrecision == 80 ) goto precision80;
 669     if ( roundingPrecision == 64 ) {
 670         roundIncrement = LIT64( 0x0000000000000400 );
 671         roundMask = LIT64( 0x00000000000007FF );
 672     }
 673     else if ( roundingPrecision == 32 ) {
 674         roundIncrement = LIT64( 0x0000008000000000 );
 675         roundMask = LIT64( 0x000000FFFFFFFFFF );
 676     }
 677     else {
 678         goto precision80;
 679     }
 680     zSig0 |= ( zSig1 != 0 );
 681     if ( ! roundNearestEven ) {
 682         if ( roundingMode == float_round_to_zero ) {
 683             roundIncrement = 0;
 684         }
 685         else {
 686             roundIncrement = roundMask;
 687             if ( zSign ) {
 688                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 689             }
 690             else {
 691                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 692             }
 693         }
 694     }
 695     roundBits = zSig0 & roundMask;
 696     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 697         if (    ( 0x7FFE < zExp )
 698              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 699            ) {
 700             goto overflow;
 701         }
 702         if ( zExp <= 0 ) {
 703             isTiny =
 704                    ( float_detect_tininess == float_tininess_before_rounding )
 705                 || ( zExp < 0 )
 706                 || ( zSig0 <= zSig0 + roundIncrement );
 707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 708             zExp = 0;
 709             roundBits = zSig0 & roundMask;
 710             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 711             if ( roundBits ) set_float_exception_inexact_flag();
 712             zSig0 += roundIncrement;
 713             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 714             roundIncrement = roundMask + 1;
 715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 716                 roundMask |= roundIncrement;
 717             }
 718             zSig0 &= ~ roundMask;
 719             return packFloatx80( zSign, zExp, zSig0 );
 720         }
 721     }
 722     if ( roundBits ) set_float_exception_inexact_flag();
 723     zSig0 += roundIncrement;
 724     if ( zSig0 < roundIncrement ) {
 725         ++zExp;
 726         zSig0 = LIT64( 0x8000000000000000 );
 727     }
 728     roundIncrement = roundMask + 1;
 729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 730         roundMask |= roundIncrement;
 731     }
 732     zSig0 &= ~ roundMask;
 733     if ( zSig0 == 0 ) zExp = 0;
 734     return packFloatx80( zSign, zExp, zSig0 );
 735  precision80:
 736     increment = ( (sbits64) zSig1 < 0 );
 737     if ( ! roundNearestEven ) {
 738         if ( roundingMode == float_round_to_zero ) {
 739             increment = 0;
 740         }
 741         else {
 742             if ( zSign ) {
 743                 increment = ( roundingMode == float_round_down ) && zSig1;
 744             }
 745             else {
 746                 increment = ( roundingMode == float_round_up ) && zSig1;
 747             }
 748         }
 749     }
 750     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 751         if (    ( 0x7FFE < zExp )
 752              || (    ( zExp == 0x7FFE )
 753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 754                   && increment
 755                 )
 756            ) {
 757             roundMask = 0;
 758  overflow:
 759             float_raise( float_flag_overflow | float_flag_inexact );
 760             if (    ( roundingMode == float_round_to_zero )
 761                  || ( zSign && ( roundingMode == float_round_up ) )
 762                  || ( ! zSign && ( roundingMode == float_round_down ) )
 763                ) {
 764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 765             }
 766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 767         }
 768         if ( zExp <= 0 ) {
 769             isTiny =
 770                    ( float_detect_tininess == float_tininess_before_rounding )
 771                 || ( zExp < 0 )
 772                 || ! increment
 773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 775             zExp = 0;
 776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 777             if ( zSig1 ) set_float_exception_inexact_flag();
 778             if ( roundNearestEven ) {
 779                 increment = ( (sbits64) zSig1 < 0 );
 780             }
 781             else {
 782                 if ( zSign ) {
 783                     increment = ( roundingMode == float_round_down ) && zSig1;
 784                 }
 785                 else {
 786                     increment = ( roundingMode == float_round_up ) && zSig1;
 787                 }
 788             }
 789             if ( increment ) {
 790                 ++zSig0;
 791                 zSig0 &=
 792                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 793                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 794             }
 795             return packFloatx80( zSign, zExp, zSig0 );
 796         }
 797     }
 798     if ( zSig1 ) set_float_exception_inexact_flag();
 799     if ( increment ) {
 800         ++zSig0;
 801         if ( zSig0 == 0 ) {
 802             ++zExp;
 803             zSig0 = LIT64( 0x8000000000000000 );
 804         }
 805         else {
 806             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 807         }
 808     }
 809     else {
 810         if ( zSig0 == 0 ) zExp = 0;
 811     }
 812     return packFloatx80( zSign, zExp, zSig0 );
 813
 814 }
 815
 816 /*
 817 -------------------------------------------------------------------------------
 818 Takes an abstract floating-point value having sign `zSign', exponent
 819 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 820 and returns the proper extended double-precision floating-point value
 821 corresponding to the abstract input.  This routine is just like
 822 `roundAndPackFloatx80' except that the input significand does not have to be
 823 normalized.
 824 -------------------------------------------------------------------------------
 825 */
 826 static floatx80
 827  normalizeRoundAndPackFloatx80(
 828      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 829  )
 830 {
 831     int8 shiftCount;
 832
 833     if ( zSig0 == 0 ) {
 834         zSig0 = zSig1;
 835         zSig1 = 0;
 836         zExp -= 64;
 837     }
 838     shiftCount = countLeadingZeros64( zSig0 );
 839     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 840     zExp -= shiftCount;
 841     return
 842         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 843
 844 }
 845
 846 #endif
 847
 848 #ifdef FLOAT128
 849
 850 /*
 851 -------------------------------------------------------------------------------
 852 Returns the least-significant 64 fraction bits of the quadruple-precision
 853 floating-point value `a'.
 854 -------------------------------------------------------------------------------
 855 */
 856 INLINE bits64 extractFloat128Frac1( float128 a )
 857 {
 858
 859     return a.low;
 860
 861 }
 862
 863 /*
 864 -------------------------------------------------------------------------------
 865 Returns the most-significant 48 fraction bits of the quadruple-precision
 866 floating-point value `a'.
 867 -------------------------------------------------------------------------------
 868 */
 869 INLINE bits64 extractFloat128Frac0( float128 a )
 870 {
 871
 872     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 873
 874 }
 875
 876 /*
 877 -------------------------------------------------------------------------------
 878 Returns the exponent bits of the quadruple-precision floating-point value
 879 `a'.
 880 -------------------------------------------------------------------------------
 881 */
 882 INLINE int32 extractFloat128Exp( float128 a )
 883 {
 884
 885     return (int32)((a.high >> 48) & 0x7FFF);
 886
 887 }
 888
 889 /*
 890 -------------------------------------------------------------------------------
 891 Returns the sign bit of the quadruple-precision floating-point value `a'.
 892 -------------------------------------------------------------------------------
 893 */
 894 INLINE flag extractFloat128Sign( float128 a )
 895 {
 896
 897     return (flag)(a.high >> 63);
 898
 899 }
 900
 901 /*
 902 -------------------------------------------------------------------------------
 903 Normalizes the subnormal quadruple-precision floating-point value
 904 represented by the denormalized significand formed by the concatenation of
 905 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907 significand are stored at the location pointed to by `zSig0Ptr', and the
 908 least significant 64 bits of the normalized significand are stored at the
 909 location pointed to by `zSig1Ptr'.
 910 -------------------------------------------------------------------------------
 911 */
 912 static void
 913  normalizeFloat128Subnormal(
 914      bits64 aSig0,
 915      bits64 aSig1,
 916      int32 *zExpPtr,
 917      bits64 *zSig0Ptr,
 918      bits64 *zSig1Ptr
 919  )
 920 {
 921     int8 shiftCount;
 922
 923     if ( aSig0 == 0 ) {
 924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925         if ( shiftCount < 0 ) {
 926             *zSig0Ptr = aSig1>>( - shiftCount );
 927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928         }
 929         else {
 930             *zSig0Ptr = aSig1<<shiftCount;
 931             *zSig1Ptr = 0;
 932         }
 933         *zExpPtr = - shiftCount - 63;
 934     }
 935     else {
 936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938         *zExpPtr = 1 - shiftCount;
 939     }
 940
 941 }
 942
 943 /*
 944 -------------------------------------------------------------------------------
 945 Packs the sign `zSign', the exponent `zExp', and the significand formed
 946 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 947 floating-point value, returning the result.  After being shifted into the
 948 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 949 added together to form the most significant 32 bits of the result.  This
 950 means that any integer portion of `zSig0' will be added into the exponent.
 951 Since a properly normalized significand will have an integer portion equal
 952 to 1, the `zExp' input should be 1 less than the desired result exponent
 953 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 954 significand.
 955 -------------------------------------------------------------------------------
 956 */
 957 INLINE float128
 958  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 959 {
 960     float128 z;
 961
 962     z.low = zSig1;
 963     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 964     return z;
 965
 966 }
 967
 968 /*
 969 -------------------------------------------------------------------------------
 970 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 971 and extended significand formed by the concatenation of `zSig0', `zSig1',
 972 and `zSig2', and returns the proper quadruple-precision floating-point value
 973 corresponding to the abstract input.  Ordinarily, the abstract value is
 974 simply rounded and packed into the quadruple-precision format, with the
 975 inexact exception raised if the abstract input cannot be represented
 976 exactly.  However, if the abstract value is too large, the overflow and
 977 inexact exceptions are raised and an infinity or maximal finite value is
 978 returned.  If the abstract value is too small, the input value is rounded to
 979 a subnormal number, and the underflow and inexact exceptions are raised if
 980 the abstract input cannot be represented exactly as a subnormal quadruple-
 981 precision floating-point number.
 982     The input significand must be normalized or smaller.  If the input
 983 significand is not normalized, `zExp' must be 0; in that case, the result
 984 returned is a subnormal number, and it must not require rounding.  In the
 985 usual case that the input significand is normalized, `zExp' must be 1 less
 986 than the ``true'' floating-point exponent.  The handling of underflow and
 987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 988 -------------------------------------------------------------------------------
 989 */
 990 static float128
 991  roundAndPackFloat128(
 992      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 993 {
 994     int8 roundingMode;
 995     flag roundNearestEven, increment, isTiny;
 996
 997     roundingMode = float_rounding_mode;
 998     roundNearestEven = ( roundingMode == float_round_nearest_even );
 999     increment = ( (sbits64) zSig2 < 0 );
1000     if ( ! roundNearestEven ) {
1001         if ( roundingMode == float_round_to_zero ) {
1002             increment = 0;
1003         }
1004         else {
1005             if ( zSign ) {
1006                 increment = ( roundingMode == float_round_down ) && zSig2;
1007             }
1008             else {
1009                 increment = ( roundingMode == float_round_up ) && zSig2;
1010             }
1011         }
1012     }
1013     if ( 0x7FFD <= (bits32) zExp ) {
1014         if (    ( 0x7FFD < zExp )
1015              || (    ( zExp == 0x7FFD )
1016                   && eq128(
1017                          LIT64( 0x0001FFFFFFFFFFFF ),
1018                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1019                          zSig0,
1020                          zSig1
1021                      )
1022                   && increment
1023                 )
1024            ) {
1025             float_raise( float_flag_overflow | float_flag_inexact );
1026             if (    ( roundingMode == float_round_to_zero )
1027                  || ( zSign && ( roundingMode == float_round_up ) )
1028                  || ( ! zSign && ( roundingMode == float_round_down ) )
1029                ) {
1030                 return
1031                     packFloat128(
1032                         zSign,
1033                         0x7FFE,
1034                         LIT64( 0x0000FFFFFFFFFFFF ),
1035                         LIT64( 0xFFFFFFFFFFFFFFFF )
1036                     );
1037             }
1038             return packFloat128( zSign, 0x7FFF, 0, 0 );
1039         }
1040         if ( zExp < 0 ) {
1041             isTiny =
1042                    ( float_detect_tininess == float_tininess_before_rounding )
1043                 || ( zExp < -1 )
1044                 || ! increment
1045                 || lt128(
1046                        zSig0,
1047                        zSig1,
1048                        LIT64( 0x0001FFFFFFFFFFFF ),
1049                        LIT64( 0xFFFFFFFFFFFFFFFF )
1050                    );
1051             shift128ExtraRightJamming(
1052                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1053             zExp = 0;
1054             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1055             if ( roundNearestEven ) {
1056                 increment = ( (sbits64) zSig2 < 0 );
1057             }
1058             else {
1059                 if ( zSign ) {
1060                     increment = ( roundingMode == float_round_down ) && zSig2;
1061                 }
1062                 else {
1063                     increment = ( roundingMode == float_round_up ) && zSig2;
1064                 }
1065             }
1066         }
1067     }
1068     if ( zSig2 ) set_float_exception_inexact_flag();
1069     if ( increment ) {
1070         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1071         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1072     }
1073     else {
1074         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1075     }
1076     return packFloat128( zSign, zExp, zSig0, zSig1 );
1077
1078 }
1079
1080 /*
1081 -------------------------------------------------------------------------------
1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1083 and significand formed by the concatenation of `zSig0' and `zSig1', and
1084 returns the proper quadruple-precision floating-point value corresponding
1085 to the abstract input.  This routine is just like `roundAndPackFloat128'
1086 except that the input significand has fewer bits and does not have to be
1087 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1088 point exponent.
1089 -------------------------------------------------------------------------------
1090 */
1091 static float128
1092  normalizeRoundAndPackFloat128(
1093      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1094 {
1095     int8 shiftCount;
1096     bits64 zSig2;
1097
1098     if ( zSig0 == 0 ) {
1099         zSig0 = zSig1;
1100         zSig1 = 0;
1101         zExp -= 64;
1102     }
1103     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1104     if ( 0 <= shiftCount ) {
1105         zSig2 = 0;
1106         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1107     }
1108     else {
1109         shift128ExtraRightJamming(
1110             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1111     }
1112     zExp -= shiftCount;
1113     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1114
1115 }
1116
1117 #endif
1118
1119 /*
1120 -------------------------------------------------------------------------------
1121 Returns the result of converting the 32-bit two's complement integer `a'
1122 to the single-precision floating-point format.  The conversion is performed
1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1124 -------------------------------------------------------------------------------
1125 */
1126 float32 int32_to_float32( int32 a )
1127 {
1128     flag zSign;
1129
1130     if ( a == 0 ) return 0;
1131     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1132     zSign = ( a < 0 );
1133     return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1134
1135 }
1136
1137 float32 uint32_to_float32( uint32 a )
1138 {
1139     if ( a == 0 ) return 0;
1140     if ( a & (bits32) 0x80000000 )
1141         return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1142     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1143 }
1144
1145
1146 /*
1147 -------------------------------------------------------------------------------
1148 Returns the result of converting the 32-bit two's complement integer `a'
1149 to the double-precision floating-point format.  The conversion is performed
1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 -------------------------------------------------------------------------------
1152 */
1153 float64 int32_to_float64( int32 a )
1154 {
1155     flag zSign;
1156     uint32 absA;
1157     int8 shiftCount;
1158     bits64 zSig;
1159
1160     if ( a == 0 ) return 0;
1161     zSign = ( a < 0 );
1162     absA = zSign ? - a : a;
1163     shiftCount = countLeadingZeros32( absA ) + 21;
1164     zSig = absA;
1165     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1166
1167 }
1168
1169 float64 uint32_to_float64( uint32 a )
1170 {
1171     int8 shiftCount;
1172     bits64 zSig = a;
1173
1174     if ( a == 0 ) return 0;
1175     shiftCount = countLeadingZeros32( a ) + 21;
1176     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 #ifdef FLOATX80
1181
1182 /*
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a'
1185 to the extended double-precision floating-point format.  The conversion
1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 Arithmetic.
1188 -------------------------------------------------------------------------------
1189 */
1190 floatx80 int32_to_floatx80( int32 a )
1191 {
1192     flag zSign;
1193     uint32 absA;
1194     int8 shiftCount;
1195     bits64 zSig;
1196
1197     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1198     zSign = ( a < 0 );
1199     absA = zSign ? - a : a;
1200     shiftCount = countLeadingZeros32( absA ) + 32;
1201     zSig = absA;
1202     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1203
1204 }
1205
1206 floatx80 uint32_to_floatx80( uint32 a )
1207 {
1208     int8 shiftCount;
1209     bits64 zSig = a;
1210
1211     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1212     shiftCount = countLeadingZeros32( a ) + 32;
1213     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1214
1215 }
1216
1217 #endif
1218
1219 #ifdef FLOAT128
1220
1221 /*
1222 -------------------------------------------------------------------------------
1223 Returns the result of converting the 32-bit two's complement integer `a' to
1224 the quadruple-precision floating-point format.  The conversion is performed
1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226 -------------------------------------------------------------------------------
1227 */
1228 float128 int32_to_float128( int32 a )
1229 {
1230     flag zSign;
1231     uint32 absA;
1232     int8 shiftCount;
1233     bits64 zSig0;
1234
1235     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1236     zSign = ( a < 0 );
1237     absA = zSign ? - a : a;
1238     shiftCount = countLeadingZeros32( absA ) + 17;
1239     zSig0 = absA;
1240     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1241
1242 }
1243
1244 float128 uint32_to_float128( uint32 a )
1245 {
1246     int8 shiftCount;
1247     bits64 zSig0 = a;
1248
1249     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1250     shiftCount = countLeadingZeros32( a ) + 17;
1251     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252
1253 }
1254
1255 #endif
1256
1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1258 /*
1259 -------------------------------------------------------------------------------
1260 Returns the result of converting the 64-bit two's complement integer `a'
1261 to the single-precision floating-point format.  The conversion is performed
1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1263 -------------------------------------------------------------------------------
1264 */
1265 float32 int64_to_float32( int64 a )
1266 {
1267     flag zSign;
1268     uint64 absA;
1269     int8 shiftCount;
1270
1271     if ( a == 0 ) return 0;
1272     zSign = ( a < 0 );
1273     absA = zSign ? - a : a;
1274     shiftCount = countLeadingZeros64( absA ) - 40;
1275     if ( 0 <= shiftCount ) {
1276         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1277     }
1278     else {
1279         shiftCount += 7;
1280         if ( shiftCount < 0 ) {
1281             shift64RightJamming( absA, - shiftCount, &absA );
1282         }
1283         else {
1284             absA <<= shiftCount;
1285         }
1286         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1287     }
1288
1289 }
1290
1291 /*
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a'
1294 to the double-precision floating-point format.  The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1297 */
1298 float64 int64_to_float64( int64 a )
1299 {
1300     flag zSign;
1301
1302     if ( a == 0 ) return 0;
1303     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1304         return packFloat64( 1, 0x43E, 0 );
1305     }
1306     zSign = ( a < 0 );
1307     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1308
1309 }
1310
1311 #ifdef FLOATX80
1312
1313 /*
1314 -------------------------------------------------------------------------------
1315 Returns the result of converting the 64-bit two's complement integer `a'
1316 to the extended double-precision floating-point format.  The conversion
1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 Arithmetic.
1319 -------------------------------------------------------------------------------
1320 */
1321 floatx80 int64_to_floatx80( int64 a )
1322 {
1323     flag zSign;
1324     uint64 absA;
1325     int8 shiftCount;
1326
1327     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1328     zSign = ( a < 0 );
1329     absA = zSign ? - a : a;
1330     shiftCount = countLeadingZeros64( absA );
1331     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1332
1333 }
1334
1335 #endif
1336
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1338
1339 #ifdef FLOAT128
1340
1341 /*
1342 -------------------------------------------------------------------------------
1343 Returns the result of converting the 64-bit two's complement integer `a' to
1344 the quadruple-precision floating-point format.  The conversion is performed
1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1346 -------------------------------------------------------------------------------
1347 */
1348 float128 int64_to_float128( int64 a )
1349 {
1350     flag zSign;
1351     uint64 absA;
1352     int8 shiftCount;
1353     int32 zExp;
1354     bits64 zSig0, zSig1;
1355
1356     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1357     zSign = ( a < 0 );
1358     absA = zSign ? - a : a;
1359     shiftCount = countLeadingZeros64( absA ) + 49;
1360     zExp = 0x406E - shiftCount;
1361     if ( 64 <= shiftCount ) {
1362         zSig1 = 0;
1363         zSig0 = absA;
1364         shiftCount -= 64;
1365     }
1366     else {
1367         zSig1 = absA;
1368         zSig0 = 0;
1369     }
1370     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1371     return packFloat128( zSign, zExp, zSig0, zSig1 );
1372
1373 }
1374
1375 #endif
1376
1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1378 /*
1379 -------------------------------------------------------------------------------
1380 Returns the result of converting the single-precision floating-point value
1381 `a' to the 32-bit two's complement integer format.  The conversion is
1382 performed according to the IEC/IEEE Standard for Binary Floating-Point
1383 Arithmetic---which means in particular that the conversion is rounded
1384 according to the current rounding mode.  If `a' is a NaN, the largest
1385 positive integer is returned.  Otherwise, if the conversion overflows, the
1386 largest integer with the same sign as `a' is returned.
1387 -------------------------------------------------------------------------------
1388 */
1389 int32 float32_to_int32( float32 a )
1390 {
1391     flag aSign;
1392     int16 aExp, shiftCount;
1393     bits32 aSig;
1394     bits64 aSig64;
1395
1396     aSig = extractFloat32Frac( a );
1397     aExp = extractFloat32Exp( a );
1398     aSign = extractFloat32Sign( a );
1399     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1400     if ( aExp ) aSig |= 0x00800000;
1401     shiftCount = 0xAF - aExp;
1402     aSig64 = aSig;
1403     aSig64 <<= 32;
1404     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1405     return roundAndPackInt32( aSign, aSig64 );
1406
1407 }
1408 #endif /* !SOFTFLOAT_FOR_GCC */
1409
1410 /*
1411 -------------------------------------------------------------------------------
1412 Returns the result of converting the single-precision floating-point value
1413 `a' to the 32-bit two's complement integer format.  The conversion is
1414 performed according to the IEC/IEEE Standard for Binary Floating-Point
1415 Arithmetic, except that the conversion is always rounded toward zero.
1416 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1417 the conversion overflows, the largest integer with the same sign as `a' is
1418 returned.
1419 -------------------------------------------------------------------------------
1420 */
1421 int32 float32_to_int32_round_to_zero( float32 a )
1422 {
1423     flag aSign;
1424     int16 aExp, shiftCount;
1425     bits32 aSig;
1426     int32 z;
1427
1428     aSig = extractFloat32Frac( a );
1429     aExp = extractFloat32Exp( a );
1430     aSign = extractFloat32Sign( a );
1431     shiftCount = aExp - 0x9E;
1432     if ( 0 <= shiftCount ) {
1433         if ( a != 0xCF000000 ) {
1434             float_raise( float_flag_invalid );
1435             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1436         }
1437         return (sbits32) 0x80000000;
1438     }
1439     else if ( aExp <= 0x7E ) {
1440         if ( aExp | aSig ) set_float_exception_inexact_flag();
1441         return 0;
1442     }
1443     aSig = ( aSig | 0x00800000 )<<8;
1444     z = aSig>>( - shiftCount );
1445     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1446         set_float_exception_inexact_flag();
1447     }
1448     if ( aSign ) z = - z;
1449     return z;
1450
1451 }
1452
1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1454 /*
1455 -------------------------------------------------------------------------------
1456 Returns the result of converting the single-precision floating-point value
1457 `a' to the 64-bit two's complement integer format.  The conversion is
1458 performed according to the IEC/IEEE Standard for Binary Floating-Point
1459 Arithmetic---which means in particular that the conversion is rounded
1460 according to the current rounding mode.  If `a' is a NaN, the largest
1461 positive integer is returned.  Otherwise, if the conversion overflows, the
1462 largest integer with the same sign as `a' is returned.
1463 -------------------------------------------------------------------------------
1464 */
1465 int64 float32_to_int64( float32 a )
1466 {
1467     flag aSign;
1468     int16 aExp, shiftCount;
1469     bits32 aSig;
1470     bits64 aSig64, aSigExtra;
1471
1472     aSig = extractFloat32Frac( a );
1473     aExp = extractFloat32Exp( a );
1474     aSign = extractFloat32Sign( a );
1475     shiftCount = 0xBE - aExp;
1476     if ( shiftCount < 0 ) {
1477         float_raise( float_flag_invalid );
1478         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1479             return LIT64( 0x7FFFFFFFFFFFFFFF );
1480         }
1481         return (sbits64) LIT64( 0x8000000000000000 );
1482     }
1483     if ( aExp ) aSig |= 0x00800000;
1484     aSig64 = aSig;
1485     aSig64 <<= 40;
1486     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1487     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1488
1489 }
1490
1491 /*
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the 64-bit two's complement integer format.  The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic, except that the conversion is always rounded toward zero.  If
1497 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1498 conversion overflows, the largest integer with the same sign as `a' is
1499 returned.
1500 -------------------------------------------------------------------------------
1501 */
1502 int64 float32_to_int64_round_to_zero( float32 a )
1503 {
1504     flag aSign;
1505     int16 aExp, shiftCount;
1506     bits32 aSig;
1507     bits64 aSig64;
1508     int64 z;
1509
1510     aSig = extractFloat32Frac( a );
1511     aExp = extractFloat32Exp( a );
1512     aSign = extractFloat32Sign( a );
1513     shiftCount = aExp - 0xBE;
1514     if ( 0 <= shiftCount ) {
1515         if ( a != 0xDF000000 ) {
1516             float_raise( float_flag_invalid );
1517             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1518                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1519             }
1520         }
1521         return (sbits64) LIT64( 0x8000000000000000 );
1522     }
1523     else if ( aExp <= 0x7E ) {
1524         if ( aExp | aSig ) set_float_exception_inexact_flag();
1525         return 0;
1526     }
1527     aSig64 = aSig | 0x00800000;
1528     aSig64 <<= 40;
1529     z = aSig64>>( - shiftCount );
1530     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1531         set_float_exception_inexact_flag();
1532     }
1533     if ( aSign ) z = - z;
1534     return z;
1535
1536 }
1537 #endif /* !SOFTFLOAT_FOR_GCC */
1538
1539 /*
1540 -------------------------------------------------------------------------------
1541 Returns the result of converting the single-precision floating-point value
1542 `a' to the double-precision floating-point format.  The conversion is
1543 performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 Arithmetic.
1545 -------------------------------------------------------------------------------
1546 */
1547 float64 float32_to_float64( float32 a )
1548 {
1549     flag aSign;
1550     int16 aExp;
1551     bits32 aSig;
1552
1553     aSig = extractFloat32Frac( a );
1554     aExp = extractFloat32Exp( a );
1555     aSign = extractFloat32Sign( a );
1556     if ( aExp == 0xFF ) {
1557         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1558         return packFloat64( aSign, 0x7FF, 0 );
1559     }
1560     if ( aExp == 0 ) {
1561         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1562         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1563         --aExp;
1564     }
1565     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1566
1567 }
1568
1569 #ifdef FLOATX80
1570
1571 /*
1572 -------------------------------------------------------------------------------
1573 Returns the result of converting the single-precision floating-point value
1574 `a' to the extended double-precision floating-point format.  The conversion
1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 Arithmetic.
1577 -------------------------------------------------------------------------------
1578 */
1579 floatx80 float32_to_floatx80( float32 a )
1580 {
1581     flag aSign;
1582     int16 aExp;
1583     bits32 aSig;
1584
1585     aSig = extractFloat32Frac( a );
1586     aExp = extractFloat32Exp( a );
1587     aSign = extractFloat32Sign( a );
1588     if ( aExp == 0xFF ) {
1589         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1590         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1591     }
1592     if ( aExp == 0 ) {
1593         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1594         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1595     }
1596     aSig |= 0x00800000;
1597     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1598
1599 }
1600
1601 #endif
1602
1603 #ifdef FLOAT128
1604
1605 /*
1606 -------------------------------------------------------------------------------
1607 Returns the result of converting the single-precision floating-point value
1608 `a' to the double-precision floating-point format.  The conversion is
1609 performed according to the IEC/IEEE Standard for Binary Floating-Point
1610 Arithmetic.
1611 -------------------------------------------------------------------------------
1612 */
1613 float128 float32_to_float128( float32 a )
1614 {
1615     flag aSign;
1616     int16 aExp;
1617     bits32 aSig;
1618
1619     aSig = extractFloat32Frac( a );
1620     aExp = extractFloat32Exp( a );
1621     aSign = extractFloat32Sign( a );
1622     if ( aExp == 0xFF ) {
1623         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1624         return packFloat128( aSign, 0x7FFF, 0, 0 );
1625     }
1626     if ( aExp == 0 ) {
1627         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1628         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1629         --aExp;
1630     }
1631     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1632
1633 }
1634
1635 #endif
1636
1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1638 /*
1639 -------------------------------------------------------------------------------
1640 Rounds the single-precision floating-point value `a' to an integer, and
1641 returns the result as a single-precision floating-point value.  The
1642 operation is performed according to the IEC/IEEE Standard for Binary
1643 Floating-Point Arithmetic.
1644 -------------------------------------------------------------------------------
1645 */
1646 float32 float32_round_to_int( float32 a )
1647 {
1648     flag aSign;
1649     int16 aExp;
1650     bits32 lastBitMask, roundBitsMask;
1651     int8 roundingMode;
1652     float32 z;
1653
1654     aExp = extractFloat32Exp( a );
1655     if ( 0x96 <= aExp ) {
1656         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1657             return propagateFloat32NaN( a, a );
1658         }
1659         return a;
1660     }
1661     if ( aExp <= 0x7E ) {
1662         if ( (bits32) ( a<<1 ) == 0 ) return a;
1663         set_float_exception_inexact_flag();
1664         aSign = extractFloat32Sign( a );
1665         switch ( float_rounding_mode ) {
1666          case float_round_nearest_even:
1667             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1668                 return packFloat32( aSign, 0x7F, 0 );
1669             }
1670             break;
1671          case float_round_to_zero:
1672             break;
1673          case float_round_down:
1674             return aSign ? 0xBF800000 : 0;
1675          case float_round_up:
1676             return aSign ? 0x80000000 : 0x3F800000;
1677         }
1678         return packFloat32( aSign, 0, 0 );
1679     }
1680     lastBitMask = 1;
1681     lastBitMask <<= 0x96 - aExp;
1682     roundBitsMask = lastBitMask - 1;
1683     z = a;
1684     roundingMode = float_rounding_mode;
1685     if ( roundingMode == float_round_nearest_even ) {
1686         z += lastBitMask>>1;
1687         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1688     }
1689     else if ( roundingMode != float_round_to_zero ) {
1690         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1691             z += roundBitsMask;
1692         }
1693     }
1694     z &= ~ roundBitsMask;
1695     if ( z != a ) set_float_exception_inexact_flag();
1696     return z;
1697
1698 }
1699 #endif /* !SOFTFLOAT_FOR_GCC */
1700
1701 /*
1702 -------------------------------------------------------------------------------
1703 Returns the result of adding the absolute values of the single-precision
1704 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1705 before being returned.  `zSign' is ignored if the result is a NaN.
1706 The addition is performed according to the IEC/IEEE Standard for Binary
1707 Floating-Point Arithmetic.
1708 -------------------------------------------------------------------------------
1709 */
1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1711 {
1712     int16 aExp, bExp, zExp;
1713     bits32 aSig, bSig, zSig;
1714     int16 expDiff;
1715
1716     aSig = extractFloat32Frac( a );
1717     aExp = extractFloat32Exp( a );
1718     bSig = extractFloat32Frac( b );
1719     bExp = extractFloat32Exp( b );
1720     expDiff = aExp - bExp;
1721     aSig <<= 6;
1722     bSig <<= 6;
1723     if ( 0 < expDiff ) {
1724         if ( aExp == 0xFF ) {
1725             if ( aSig ) return propagateFloat32NaN( a, b );
1726             return a;
1727         }
1728         if ( bExp == 0 ) {
1729             --expDiff;
1730         }
1731         else {
1732             bSig |= 0x20000000;
1733         }
1734         shift32RightJamming( bSig, expDiff, &bSig );
1735         zExp = aExp;
1736     }
1737     else if ( expDiff < 0 ) {
1738         if ( bExp == 0xFF ) {
1739             if ( bSig ) return propagateFloat32NaN( a, b );
1740             return packFloat32( zSign, 0xFF, 0 );
1741         }
1742         if ( aExp == 0 ) {
1743             ++expDiff;
1744         }
1745         else {
1746             aSig |= 0x20000000;
1747         }
1748         shift32RightJamming( aSig, - expDiff, &aSig );
1749         zExp = bExp;
1750     }
1751     else {
1752         if ( aExp == 0xFF ) {
1753             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1754             return a;
1755         }
1756         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1757         zSig = 0x40000000 + aSig + bSig;
1758         zExp = aExp;
1759         goto roundAndPack;
1760     }
1761     aSig |= 0x20000000;
1762     zSig = ( aSig + bSig )<<1;
1763     --zExp;
1764     if ( (sbits32) zSig < 0 ) {
1765         zSig = aSig + bSig;
1766         ++zExp;
1767     }
1768  roundAndPack:
1769     return roundAndPackFloat32( zSign, zExp, zSig );
1770
1771 }
1772
1773 /*
1774 -------------------------------------------------------------------------------
1775 Returns the result of subtracting the absolute values of the single-
1776 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1777 difference is negated before being returned.  `zSign' is ignored if the
1778 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1779 Standard for Binary Floating-Point Arithmetic.
1780 -------------------------------------------------------------------------------
1781 */
1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1783 {
1784     int16 aExp, bExp, zExp;
1785     bits32 aSig, bSig, zSig;
1786     int16 expDiff;
1787
1788     aSig = extractFloat32Frac( a );
1789     aExp = extractFloat32Exp( a );
1790     bSig = extractFloat32Frac( b );
1791     bExp = extractFloat32Exp( b );
1792     expDiff = aExp - bExp;
1793     aSig <<= 7;
1794     bSig <<= 7;
1795     if ( 0 < expDiff ) goto aExpBigger;
1796     if ( expDiff < 0 ) goto bExpBigger;
1797     if ( aExp == 0xFF ) {
1798         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1799         float_raise( float_flag_invalid );
1800         return float32_default_nan;
1801     }
1802     if ( aExp == 0 ) {
1803         aExp = 1;
1804         bExp = 1;
1805     }
1806     if ( bSig < aSig ) goto aBigger;
1807     if ( aSig < bSig ) goto bBigger;
1808     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1809  bExpBigger:
1810     if ( bExp == 0xFF ) {
1811         if ( bSig ) return propagateFloat32NaN( a, b );
1812         return packFloat32( zSign ^ 1, 0xFF, 0 );
1813     }
1814     if ( aExp == 0 ) {
1815         ++expDiff;
1816     }
1817     else {
1818         aSig |= 0x40000000;
1819     }
1820     shift32RightJamming( aSig, - expDiff, &aSig );
1821     bSig |= 0x40000000;
1822  bBigger:
1823     zSig = bSig - aSig;
1824     zExp = bExp;
1825     zSign ^= 1;
1826     goto normalizeRoundAndPack;
1827  aExpBigger:
1828     if ( aExp == 0xFF ) {
1829         if ( aSig ) return propagateFloat32NaN( a, b );
1830         return a;
1831     }
1832     if ( bExp == 0 ) {
1833         --expDiff;
1834     }
1835     else {
1836         bSig |= 0x40000000;
1837     }
1838     shift32RightJamming( bSig, expDiff, &bSig );
1839     aSig |= 0x40000000;
1840  aBigger:
1841     zSig = aSig - bSig;
1842     zExp = aExp;
1843  normalizeRoundAndPack:
1844     --zExp;
1845     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1846
1847 }
1848
1849 /*
1850 -------------------------------------------------------------------------------
1851 Returns the result of adding the single-precision floating-point values `a'
1852 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1853 Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1855 */
1856 float32 float32_add( float32 a, float32 b )
1857 {
1858     flag aSign, bSign;
1859
1860     aSign = extractFloat32Sign( a );
1861     bSign = extractFloat32Sign( b );
1862     if ( aSign == bSign ) {
1863         return addFloat32Sigs( a, b, aSign );
1864     }
1865     else {
1866         return subFloat32Sigs( a, b, aSign );
1867     }
1868
1869 }
1870
1871 /*
1872 -------------------------------------------------------------------------------
1873 Returns the result of subtracting the single-precision floating-point values
1874 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1875 for Binary Floating-Point Arithmetic.
1876 -------------------------------------------------------------------------------
1877 */
1878 float32 float32_sub( float32 a, float32 b )
1879 {
1880     flag aSign, bSign;
1881
1882     aSign = extractFloat32Sign( a );
1883     bSign = extractFloat32Sign( b );
1884     if ( aSign == bSign ) {
1885         return subFloat32Sigs( a, b, aSign );
1886     }
1887     else {
1888         return addFloat32Sigs( a, b, aSign );
1889     }
1890
1891 }
1892
1893 /*
1894 -------------------------------------------------------------------------------
1895 Returns the result of multiplying the single-precision floating-point values
1896 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1897 for Binary Floating-Point Arithmetic.
1898 -------------------------------------------------------------------------------
1899 */
1900 float32 float32_mul( float32 a, float32 b )
1901 {
1902     flag aSign, bSign, zSign;
1903     int16 aExp, bExp, zExp;
1904     bits32 aSig, bSig;
1905     bits64 zSig64;
1906     bits32 zSig;
1907
1908     aSig = extractFloat32Frac( a );
1909     aExp = extractFloat32Exp( a );
1910     aSign = extractFloat32Sign( a );
1911     bSig = extractFloat32Frac( b );
1912     bExp = extractFloat32Exp( b );
1913     bSign = extractFloat32Sign( b );
1914     zSign = aSign ^ bSign;
1915     if ( aExp == 0xFF ) {
1916         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1917             return propagateFloat32NaN( a, b );
1918         }
1919         if ( ( bExp | bSig ) == 0 ) {
1920             float_raise( float_flag_invalid );
1921             return float32_default_nan;
1922         }
1923         return packFloat32( zSign, 0xFF, 0 );
1924     }
1925     if ( bExp == 0xFF ) {
1926         if ( bSig ) return propagateFloat32NaN( a, b );
1927         if ( ( aExp | aSig ) == 0 ) {
1928             float_raise( float_flag_invalid );
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( aExp == 0 ) {
1934         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1935         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1939         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1940     }
1941     zExp = aExp + bExp - 0x7F;
1942     aSig = ( aSig | 0x00800000 )<<7;
1943     bSig = ( bSig | 0x00800000 )<<8;
1944     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1945     zSig = (bits32)zSig64;
1946     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1947         zSig <<= 1;
1948         --zExp;
1949     }
1950     return roundAndPackFloat32( zSign, zExp, zSig );
1951
1952 }
1953
1954 /*
1955 -------------------------------------------------------------------------------
1956 Returns the result of dividing the single-precision floating-point value `a'
1957 by the corresponding value `b'.  The operation is performed according to the
1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 -------------------------------------------------------------------------------
1960 */
1961 float32 float32_div( float32 a, float32 b )
1962 {
1963     flag aSign, bSign, zSign;
1964     int16 aExp, bExp, zExp;
1965     bits32 aSig, bSig, zSig;
1966
1967     aSig = extractFloat32Frac( a );
1968     aExp = extractFloat32Exp( a );
1969     aSign = extractFloat32Sign( a );
1970     bSig = extractFloat32Frac( b );
1971     bExp = extractFloat32Exp( b );
1972     bSign = extractFloat32Sign( b );
1973     zSign = aSign ^ bSign;
1974     if ( aExp == 0xFF ) {
1975         if ( aSig ) return propagateFloat32NaN( a, b );
1976         if ( bExp == 0xFF ) {
1977             if ( bSig ) return propagateFloat32NaN( a, b );
1978             float_raise( float_flag_invalid );
1979             return float32_default_nan;
1980         }
1981         return packFloat32( zSign, 0xFF, 0 );
1982     }
1983     if ( bExp == 0xFF ) {
1984         if ( bSig ) return propagateFloat32NaN( a, b );
1985         return packFloat32( zSign, 0, 0 );
1986     }
1987     if ( bExp == 0 ) {
1988         if ( bSig == 0 ) {
1989             if ( ( aExp | aSig ) == 0 ) {
1990                 float_raise( float_flag_invalid );
1991                 return float32_default_nan;
1992             }
1993             float_raise( float_flag_divbyzero );
1994             return packFloat32( zSign, 0xFF, 0 );
1995         }
1996         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1997     }
1998     if ( aExp == 0 ) {
1999         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2000         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2001     }
2002     zExp = aExp - bExp + 0x7D;
2003     aSig = ( aSig | 0x00800000 )<<7;
2004     bSig = ( bSig | 0x00800000 )<<8;
2005     if ( bSig <= ( aSig + aSig ) ) {
2006         aSig >>= 1;
2007         ++zExp;
2008     }
2009     zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2010     if ( ( zSig & 0x3F ) == 0 ) {
2011         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2012     }
2013     return roundAndPackFloat32( zSign, zExp, zSig );
2014
2015 }
2016
2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2018 /*
2019 -------------------------------------------------------------------------------
2020 Returns the remainder of the single-precision floating-point value `a'
2021 with respect to the corresponding value `b'.  The operation is performed
2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 -------------------------------------------------------------------------------
2024 */
2025 float32 float32_rem( float32 a, float32 b )
2026 {
2027     flag aSign, bSign, zSign;
2028     int16 aExp, bExp, expDiff;
2029     bits32 aSig, bSig;
2030     bits32 q;
2031     bits64 aSig64, bSig64, q64;
2032     bits32 alternateASig;
2033     sbits32 sigMean;
2034
2035     aSig = extractFloat32Frac( a );
2036     aExp = extractFloat32Exp( a );
2037     aSign = extractFloat32Sign( a );
2038     bSig = extractFloat32Frac( b );
2039     bExp = extractFloat32Exp( b );
2040     bSign = extractFloat32Sign( b );
2041     if ( aExp == 0xFF ) {
2042         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2043             return propagateFloat32NaN( a, b );
2044         }
2045         float_raise( float_flag_invalid );
2046         return float32_default_nan;
2047     }
2048     if ( bExp == 0xFF ) {
2049         if ( bSig ) return propagateFloat32NaN( a, b );
2050         return a;
2051     }
2052     if ( bExp == 0 ) {
2053         if ( bSig == 0 ) {
2054             float_raise( float_flag_invalid );
2055             return float32_default_nan;
2056         }
2057         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2058     }
2059     if ( aExp == 0 ) {
2060         if ( aSig == 0 ) return a;
2061         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2062     }
2063     expDiff = aExp - bExp;
2064     aSig |= 0x00800000;
2065     bSig |= 0x00800000;
2066     if ( expDiff < 32 ) {
2067         aSig <<= 8;
2068         bSig <<= 8;
2069         if ( expDiff < 0 ) {
2070             if ( expDiff < -1 ) return a;
2071             aSig >>= 1;
2072         }
2073         q = ( bSig <= aSig );
2074         if ( q ) aSig -= bSig;
2075         if ( 0 < expDiff ) {
2076             q = ( ( (bits64) aSig )<<32 ) / bSig;
2077             q >>= 32 - expDiff;
2078             bSig >>= 2;
2079             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2080         }
2081         else {
2082             aSig >>= 2;
2083             bSig >>= 2;
2084         }
2085     }
2086     else {
2087         if ( bSig <= aSig ) aSig -= bSig;
2088         aSig64 = ( (bits64) aSig )<<40;
2089         bSig64 = ( (bits64) bSig )<<40;
2090         expDiff -= 64;
2091         while ( 0 < expDiff ) {
2092             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094             aSig64 = - ( ( bSig * q64 )<<38 );
2095             expDiff -= 62;
2096         }
2097         expDiff += 64;
2098         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2099         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2100         q = q64>>( 64 - expDiff );
2101         bSig <<= 6;
2102         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2103     }
2104     do {
2105         alternateASig = aSig;
2106         ++q;
2107         aSig -= bSig;
2108     } while ( 0 <= (sbits32) aSig );
2109     sigMean = aSig + alternateASig;
2110     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2111         aSig = alternateASig;
2112     }
2113     zSign = ( (sbits32) aSig < 0 );
2114     if ( zSign ) aSig = - aSig;
2115     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2116
2117 }
2118 #endif /* !SOFTFLOAT_FOR_GCC */
2119
2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2121 /*
2122 -------------------------------------------------------------------------------
2123 Returns the square root of the single-precision floating-point value `a'.
2124 The operation is performed according to the IEC/IEEE Standard for Binary
2125 Floating-Point Arithmetic.
2126 -------------------------------------------------------------------------------
2127 */
2128 float32 float32_sqrt( float32 a )
2129 {
2130     flag aSign;
2131     int16 aExp, zExp;
2132     bits32 aSig, zSig;
2133     bits64 rem, term;
2134
2135     aSig = extractFloat32Frac( a );
2136     aExp = extractFloat32Exp( a );
2137     aSign = extractFloat32Sign( a );
2138     if ( aExp == 0xFF ) {
2139         if ( aSig ) return propagateFloat32NaN( a, 0 );
2140         if ( ! aSign ) return a;
2141         float_raise( float_flag_invalid );
2142         return float32_default_nan;
2143     }
2144     if ( aSign ) {
2145         if ( ( aExp | aSig ) == 0 ) return a;
2146         float_raise( float_flag_invalid );
2147         return float32_default_nan;
2148     }
2149     if ( aExp == 0 ) {
2150         if ( aSig == 0 ) return 0;
2151         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2152     }
2153     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2154     aSig = ( aSig | 0x00800000 )<<8;
2155     zSig = estimateSqrt32( aExp, aSig ) + 2;
2156     if ( ( zSig & 0x7F ) <= 5 ) {
2157         if ( zSig < 2 ) {
2158             zSig = 0x7FFFFFFF;
2159             goto roundAndPack;
2160         }
2161         aSig >>= aExp & 1;
2162         term = ( (bits64) zSig ) * zSig;
2163         rem = ( ( (bits64) aSig )<<32 ) - term;
2164         while ( (sbits64) rem < 0 ) {
2165             --zSig;
2166             rem += ( ( (bits64) zSig )<<1 ) | 1;
2167         }
2168         zSig |= ( rem != 0 );
2169     }
2170     shift32RightJamming( zSig, 1, &zSig );
2171  roundAndPack:
2172     return roundAndPackFloat32( 0, zExp, zSig );
2173
2174 }
2175 #endif /* !SOFTFLOAT_FOR_GCC */
2176
2177 /*
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise.  The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2183 */
2184 flag float32_eq( float32 a, float32 b )
2185 {
2186
2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189        ) {
2190         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2191             float_raise( float_flag_invalid );
2192         }
2193         return 0;
2194     }
2195     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2196
2197 }
2198
2199 /*
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is less than
2202 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2204 Arithmetic.
2205 -------------------------------------------------------------------------------
2206 */
2207 flag float32_le( float32 a, float32 b )
2208 {
2209     flag aSign, bSign;
2210
2211     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2212          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2213        ) {
2214         float_raise( float_flag_invalid );
2215         return 0;
2216     }
2217     aSign = extractFloat32Sign( a );
2218     bSign = extractFloat32Sign( b );
2219     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220     return ( a == b ) || ( aSign ^ ( a < b ) );
2221
2222 }
2223
2224 /*
2225 -------------------------------------------------------------------------------
2226 Returns 1 if the single-precision floating-point value `a' is less than
2227 the corresponding value `b', and 0 otherwise.  The comparison is performed
2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2230 */
2231 flag float32_lt( float32 a, float32 b )
2232 {
2233     flag aSign, bSign;
2234
2235     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237        ) {
2238         float_raise( float_flag_invalid );
2239         return 0;
2240     }
2241     aSign = extractFloat32Sign( a );
2242     bSign = extractFloat32Sign( b );
2243     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2244     return ( a != b ) && ( aSign ^ ( a < b ) );
2245
2246 }
2247
2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2249 /*
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is equal to
2252 the corresponding value `b', and 0 otherwise.  The invalid exception is
2253 raised if either operand is a NaN.  Otherwise, the comparison is performed
2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
2257 flag float32_eq_signaling( float32 a, float32 b )
2258 {
2259
2260     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2261          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2262        ) {
2263         float_raise( float_flag_invalid );
2264         return 0;
2265     }
2266     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2267
2268 }
2269
2270 /*
2271 -------------------------------------------------------------------------------
2272 Returns 1 if the single-precision floating-point value `a' is less than or
2273 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2274 cause an exception.  Otherwise, the comparison is performed according to the
2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 -------------------------------------------------------------------------------
2277 */
2278 flag float32_le_quiet( float32 a, float32 b )
2279 {
2280     flag aSign, bSign;
2281
2282     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2283          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2284        ) {
2285         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2286             float_raise( float_flag_invalid );
2287         }
2288         return 0;
2289     }
2290     aSign = extractFloat32Sign( a );
2291     bSign = extractFloat32Sign( b );
2292     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2293     return ( a == b ) || ( aSign ^ ( a < b ) );
2294
2295 }
2296
2297 /*
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the single-precision floating-point value `a' is less than
2300 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2301 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2302 Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2304 */
2305 flag float32_lt_quiet( float32 a, float32 b )
2306 {
2307     flag aSign, bSign;
2308
2309     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2310          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2311        ) {
2312         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2313             float_raise( float_flag_invalid );
2314         }
2315         return 0;
2316     }
2317     aSign = extractFloat32Sign( a );
2318     bSign = extractFloat32Sign( b );
2319     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2320     return ( a != b ) && ( aSign ^ ( a < b ) );
2321
2322 }
2323 #endif /* !SOFTFLOAT_FOR_GCC */
2324
2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2326 /*
2327 -------------------------------------------------------------------------------
2328 Returns the result of converting the double-precision floating-point value
2329 `a' to the 32-bit two's complement integer format.  The conversion is
2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 Arithmetic---which means in particular that the conversion is rounded
2332 according to the current rounding mode.  If `a' is a NaN, the largest
2333 positive integer is returned.  Otherwise, if the conversion overflows, the
2334 largest integer with the same sign as `a' is returned.
2335 -------------------------------------------------------------------------------
2336 */
2337 int32 float64_to_int32( float64 a )
2338 {
2339     flag aSign;
2340     int16 aExp, shiftCount;
2341     bits64 aSig;
2342
2343     aSig = extractFloat64Frac( a );
2344     aExp = extractFloat64Exp( a );
2345     aSign = extractFloat64Sign( a );
2346     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2347     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2348     shiftCount = 0x42C - aExp;
2349     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2350     return roundAndPackInt32( aSign, aSig );
2351
2352 }
2353 #endif /* !SOFTFLOAT_FOR_GCC */
2354
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 32-bit two's complement integer format.  The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic, except that the conversion is always rounded toward zero.
2361 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2362 the conversion overflows, the largest integer with the same sign as `a' is
2363 returned.
2364 -------------------------------------------------------------------------------
2365 */
2366 int32 float64_to_int32_round_to_zero( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, savedASig;
2371     int32 z;
2372
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( 0x41E < aExp ) {
2377         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2378         goto invalid;
2379     }
2380     else if ( aExp < 0x3FF ) {
2381         if ( aExp || aSig ) set_float_exception_inexact_flag();
2382         return 0;
2383     }
2384     aSig |= LIT64( 0x0010000000000000 );
2385     shiftCount = 0x433 - aExp;
2386     savedASig = aSig;
2387     aSig >>= shiftCount;
2388     z = (int32)aSig;
2389     if ( aSign ) z = - z;
2390     if ( ( z < 0 ) ^ aSign ) {
2391  invalid:
2392         float_raise( float_flag_invalid );
2393         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2394     }
2395     if ( ( aSig<<shiftCount ) != savedASig ) {
2396         set_float_exception_inexact_flag();
2397     }
2398     return z;
2399
2400 }
2401
2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2403 /*
2404 -------------------------------------------------------------------------------
2405 Returns the result of converting the double-precision floating-point value
2406 `a' to the 64-bit two's complement integer format.  The conversion is
2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 Arithmetic---which means in particular that the conversion is rounded
2409 according to the current rounding mode.  If `a' is a NaN, the largest
2410 positive integer is returned.  Otherwise, if the conversion overflows, the
2411 largest integer with the same sign as `a' is returned.
2412 -------------------------------------------------------------------------------
2413 */
2414 int64 float64_to_int64( float64 a )
2415 {
2416     flag aSign;
2417     int16 aExp, shiftCount;
2418     bits64 aSig, aSigExtra;
2419
2420     aSig = extractFloat64Frac( a );
2421     aExp = extractFloat64Exp( a );
2422     aSign = extractFloat64Sign( a );
2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424     shiftCount = 0x433 - aExp;
2425     if ( shiftCount <= 0 ) {
2426         if ( 0x43E < aExp ) {
2427             float_raise( float_flag_invalid );
2428             if (    ! aSign
2429                  || (    ( aExp == 0x7FF )
2430                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2431                ) {
2432                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2433             }
2434             return (sbits64) LIT64( 0x8000000000000000 );
2435         }
2436         aSigExtra = 0;
2437         aSig <<= - shiftCount;
2438     }
2439     else {
2440         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2441     }
2442     return roundAndPackInt64( aSign, aSig, aSigExtra );
2443
2444 }
2445
2446 /*
2447 -------------------------------------------------------------------------------
2448 Returns the result of converting the double-precision floating-point value
2449 `a' to the 64-bit two's complement integer format.  The conversion is
2450 performed according to the IEC/IEEE Standard for Binary Floating-Point
2451 Arithmetic, except that the conversion is always rounded toward zero.
2452 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2453 the conversion overflows, the largest integer with the same sign as `a' is
2454 returned.
2455 -------------------------------------------------------------------------------
2456 */
2457 int64 float64_to_int64_round_to_zero( float64 a )
2458 {
2459     flag aSign;
2460     int16 aExp, shiftCount;
2461     bits64 aSig;
2462     int64 z;
2463
2464     aSig = extractFloat64Frac( a );
2465     aExp = extractFloat64Exp( a );
2466     aSign = extractFloat64Sign( a );
2467     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2468     shiftCount = aExp - 0x433;
2469     if ( 0 <= shiftCount ) {
2470         if ( 0x43E <= aExp ) {
2471             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2472                 float_raise( float_flag_invalid );
2473                 if (    ! aSign
2474                      || (    ( aExp == 0x7FF )
2475                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2476                    ) {
2477                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2478                 }
2479             }
2480             return (sbits64) LIT64( 0x8000000000000000 );
2481         }
2482         z = aSig<<shiftCount;
2483     }
2484     else {
2485         if ( aExp < 0x3FE ) {
2486             if ( aExp | aSig ) set_float_exception_inexact_flag();
2487             return 0;
2488         }
2489         z = aSig>>( - shiftCount );
2490         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2491             set_float_exception_inexact_flag();
2492         }
2493     }
2494     if ( aSign ) z = - z;
2495     return z;
2496
2497 }
2498 #endif /* !SOFTFLOAT_FOR_GCC */
2499
2500 /*
2501 -------------------------------------------------------------------------------
2502 Returns the result of converting the double-precision floating-point value
2503 `a' to the single-precision floating-point format.  The conversion is
2504 performed according to the IEC/IEEE Standard for Binary Floating-Point
2505 Arithmetic.
2506 -------------------------------------------------------------------------------
2507 */
2508 float32 float64_to_float32( float64 a )
2509 {
2510     flag aSign;
2511     int16 aExp;
2512     bits64 aSig;
2513     bits32 zSig;
2514
2515     aSig = extractFloat64Frac( a );
2516     aExp = extractFloat64Exp( a );
2517     aSign = extractFloat64Sign( a );
2518     if ( aExp == 0x7FF ) {
2519         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2520         return packFloat32( aSign, 0xFF, 0 );
2521     }
2522     shift64RightJamming( aSig, 22, &aSig );
2523     zSig = (bits32)aSig;
2524     if ( aExp || zSig ) {
2525         zSig |= 0x40000000;
2526         aExp -= 0x381;
2527     }
2528     return roundAndPackFloat32( aSign, aExp, zSig );
2529
2530 }
2531
2532 #ifdef FLOATX80
2533
2534 /*
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the double-precision floating-point value
2537 `a' to the extended double-precision floating-point format.  The conversion
2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2539 Arithmetic.
2540 -------------------------------------------------------------------------------
2541 */
2542 floatx80 float64_to_floatx80( float64 a )
2543 {
2544     flag aSign;
2545     int16 aExp;
2546     bits64 aSig;
2547
2548     aSig = extractFloat64Frac( a );
2549     aExp = extractFloat64Exp( a );
2550     aSign = extractFloat64Sign( a );
2551     if ( aExp == 0x7FF ) {
2552         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2553         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2554     }
2555     if ( aExp == 0 ) {
2556         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2557         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2558     }
2559     return
2560         packFloatx80(
2561             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2562
2563 }
2564
2565 #endif
2566
2567 #ifdef FLOAT128
2568
2569 /*
2570 -------------------------------------------------------------------------------
2571 Returns the result of converting the double-precision floating-point value
2572 `a' to the quadruple-precision floating-point format.  The conversion is
2573 performed according to the IEC/IEEE Standard for Binary Floating-Point
2574 Arithmetic.
2575 -------------------------------------------------------------------------------
2576 */
2577 float128 float64_to_float128( float64 a )
2578 {
2579     flag aSign;
2580     int16 aExp;
2581     bits64 aSig, zSig0, zSig1;
2582
2583     aSig = extractFloat64Frac( a );
2584     aExp = extractFloat64Exp( a );
2585     aSign = extractFloat64Sign( a );
2586     if ( aExp == 0x7FF ) {
2587         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2588         return packFloat128( aSign, 0x7FFF, 0, 0 );
2589     }
2590     if ( aExp == 0 ) {
2591         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2592         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2593         --aExp;
2594     }
2595     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2596     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2597
2598 }
2599
2600 #endif
2601
2602 #ifndef SOFTFLOAT_FOR_GCC
2603 /*
2604 -------------------------------------------------------------------------------
2605 Rounds the double-precision floating-point value `a' to an integer, and
2606 returns the result as a double-precision floating-point value.  The
2607 operation is performed according to the IEC/IEEE Standard for Binary
2608 Floating-Point Arithmetic.
2609 -------------------------------------------------------------------------------
2610 */
2611 float64 float64_round_to_int( float64 a )
2612 {
2613     flag aSign;
2614     int16 aExp;
2615     bits64 lastBitMask, roundBitsMask;
2616     int8 roundingMode;
2617     float64 z;
2618
2619     aExp = extractFloat64Exp( a );
2620     if ( 0x433 <= aExp ) {
2621         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2622             return propagateFloat64NaN( a, a );
2623         }
2624         return a;
2625     }
2626     if ( aExp < 0x3FF ) {
2627         if ( (bits64) ( a<<1 ) == 0 ) return a;
2628         set_float_exception_inexact_flag();
2629         aSign = extractFloat64Sign( a );
2630         switch ( float_rounding_mode ) {
2631          case float_round_nearest_even:
2632             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2633                 return packFloat64( aSign, 0x3FF, 0 );
2634             }
2635             break;
2636          case float_round_to_zero:
2637             break;
2638          case float_round_down:
2639             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2640          case float_round_up:
2641             return
2642             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2643         }
2644         return packFloat64( aSign, 0, 0 );
2645     }
2646     lastBitMask = 1;
2647     lastBitMask <<= 0x433 - aExp;
2648     roundBitsMask = lastBitMask - 1;
2649     z = a;
2650     roundingMode = float_rounding_mode;
2651     if ( roundingMode == float_round_nearest_even ) {
2652         z += lastBitMask>>1;
2653         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2654     }
2655     else if ( roundingMode != float_round_to_zero ) {
2656         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2657             z += roundBitsMask;
2658         }
2659     }
2660     z &= ~ roundBitsMask;
2661     if ( z != a ) set_float_exception_inexact_flag();
2662     return z;
2663
2664 }
2665 #endif
2666
2667 /*
2668 -------------------------------------------------------------------------------
2669 Returns the result of adding the absolute values of the double-precision
2670 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2671 before being returned.  `zSign' is ignored if the result is a NaN.
2672 The addition is performed according to the IEC/IEEE Standard for Binary
2673 Floating-Point Arithmetic.
2674 -------------------------------------------------------------------------------
2675 */
2676 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2677 {
2678     int16 aExp, bExp, zExp;
2679     bits64 aSig, bSig, zSig;
2680     int16 expDiff;
2681
2682     aSig = extractFloat64Frac( a );
2683     aExp = extractFloat64Exp( a );
2684     bSig = extractFloat64Frac( b );
2685     bExp = extractFloat64Exp( b );
2686     expDiff = aExp - bExp;
2687     aSig <<= 9;
2688     bSig <<= 9;
2689     if ( 0 < expDiff ) {
2690         if ( aExp == 0x7FF ) {
2691             if ( aSig ) return propagateFloat64NaN( a, b );
2692             return a;
2693         }
2694         if ( bExp == 0 ) {
2695             --expDiff;
2696         }
2697         else {
2698             bSig |= LIT64( 0x2000000000000000 );
2699         }
2700         shift64RightJamming( bSig, expDiff, &bSig );
2701         zExp = aExp;
2702     }
2703     else if ( expDiff < 0 ) {
2704         if ( bExp == 0x7FF ) {
2705             if ( bSig ) return propagateFloat64NaN( a, b );
2706             return packFloat64( zSign, 0x7FF, 0 );
2707         }
2708         if ( aExp == 0 ) {
2709             ++expDiff;
2710         }
2711         else {
2712             aSig |= LIT64( 0x2000000000000000 );
2713         }
2714         shift64RightJamming( aSig, - expDiff, &aSig );
2715         zExp = bExp;
2716     }
2717     else {
2718         if ( aExp == 0x7FF ) {
2719             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2720             return a;
2721         }
2722         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2723         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2724         zExp = aExp;
2725         goto roundAndPack;
2726     }
2727     aSig |= LIT64( 0x2000000000000000 );
2728     zSig = ( aSig + bSig )<<1;
2729     --zExp;
2730     if ( (sbits64) zSig < 0 ) {
2731         zSig = aSig + bSig;
2732         ++zExp;
2733     }
2734  roundAndPack:
2735     return roundAndPackFloat64( zSign, zExp, zSig );
2736
2737 }
2738
2739 /*
2740 -------------------------------------------------------------------------------
2741 Returns the result of subtracting the absolute values of the double-
2742 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2743 difference is negated before being returned.  `zSign' is ignored if the
2744 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2745 Standard for Binary Floating-Point Arithmetic.
2746 -------------------------------------------------------------------------------
2747 */
2748 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2749 {
2750     int16 aExp, bExp, zExp;
2751     bits64 aSig, bSig, zSig;
2752     int16 expDiff;
2753
2754     aSig = extractFloat64Frac( a );
2755     aExp = extractFloat64Exp( a );
2756     bSig = extractFloat64Frac( b );
2757     bExp = extractFloat64Exp( b );
2758     expDiff = aExp - bExp;
2759     aSig <<= 10;
2760     bSig <<= 10;
2761     if ( 0 < expDiff ) goto aExpBigger;
2762     if ( expDiff < 0 ) goto bExpBigger;
2763     if ( aExp == 0x7FF ) {
2764         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2765         float_raise( float_flag_invalid );
2766         return float64_default_nan;
2767     }
2768     if ( aExp == 0 ) {
2769         aExp = 1;
2770         bExp = 1;
2771     }
2772     if ( bSig < aSig ) goto aBigger;
2773     if ( aSig < bSig ) goto bBigger;
2774     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2775  bExpBigger:
2776     if ( bExp == 0x7FF ) {
2777         if ( bSig ) return propagateFloat64NaN( a, b );
2778         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2779     }
2780     if ( aExp == 0 ) {
2781         ++expDiff;
2782     }
2783     else {
2784         aSig |= LIT64( 0x4000000000000000 );
2785     }
2786     shift64RightJamming( aSig, - expDiff, &aSig );
2787     bSig |= LIT64( 0x4000000000000000 );
2788  bBigger:
2789     zSig = bSig - aSig;
2790     zExp = bExp;
2791     zSign ^= 1;
2792     goto normalizeRoundAndPack;
2793  aExpBigger:
2794     if ( aExp == 0x7FF ) {
2795         if ( aSig ) return propagateFloat64NaN( a, b );
2796         return a;
2797     }
2798     if ( bExp == 0 ) {
2799         --expDiff;
2800     }
2801     else {
2802         bSig |= LIT64( 0x4000000000000000 );
2803     }
2804     shift64RightJamming( bSig, expDiff, &bSig );
2805     aSig |= LIT64( 0x4000000000000000 );
2806  aBigger:
2807     zSig = aSig - bSig;
2808     zExp = aExp;
2809  normalizeRoundAndPack:
2810     --zExp;
2811     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2812
2813 }
2814
2815 /*
2816 -------------------------------------------------------------------------------
2817 Returns the result of adding the double-precision floating-point values `a'
2818 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2819 Binary Floating-Point Arithmetic.
2820 -------------------------------------------------------------------------------
2821 */
2822 float64 float64_add( float64 a, float64 b )
2823 {
2824     flag aSign, bSign;
2825
2826     aSign = extractFloat64Sign( a );
2827     bSign = extractFloat64Sign( b );
2828     if ( aSign == bSign ) {
2829         return addFloat64Sigs( a, b, aSign );
2830     }
2831     else {
2832         return subFloat64Sigs( a, b, aSign );
2833     }
2834
2835 }
2836
2837 /*
2838 -------------------------------------------------------------------------------
2839 Returns the result of subtracting the double-precision floating-point values
2840 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2841 for Binary Floating-Point Arithmetic.
2842 -------------------------------------------------------------------------------
2843 */
2844 float64 float64_sub( float64 a, float64 b )
2845 {
2846     flag aSign, bSign;
2847
2848     aSign = extractFloat64Sign( a );
2849     bSign = extractFloat64Sign( b );
2850     if ( aSign == bSign ) {
2851         return subFloat64Sigs( a, b, aSign );
2852     }
2853     else {
2854         return addFloat64Sigs( a, b, aSign );
2855     }
2856
2857 }
2858
2859 /*
2860 -------------------------------------------------------------------------------
2861 Returns the result of multiplying the double-precision floating-point values
2862 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2863 for Binary Floating-Point Arithmetic.
2864 -------------------------------------------------------------------------------
2865 */
2866 float64 float64_mul( float64 a, float64 b )
2867 {
2868     flag aSign, bSign, zSign;
2869     int16 aExp, bExp, zExp;
2870     bits64 aSig, bSig, zSig0, zSig1;
2871
2872     aSig = extractFloat64Frac( a );
2873     aExp = extractFloat64Exp( a );
2874     aSign = extractFloat64Sign( a );
2875     bSig = extractFloat64Frac( b );
2876     bExp = extractFloat64Exp( b );
2877     bSign = extractFloat64Sign( b );
2878     zSign = aSign ^ bSign;
2879     if ( aExp == 0x7FF ) {
2880         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2881             return propagateFloat64NaN( a, b );
2882         }
2883         if ( ( bExp | bSig ) == 0 ) {
2884             float_raise( float_flag_invalid );
2885             return float64_default_nan;
2886         }
2887         return packFloat64( zSign, 0x7FF, 0 );
2888     }
2889     if ( bExp == 0x7FF ) {
2890         if ( bSig ) return propagateFloat64NaN( a, b );
2891         if ( ( aExp | aSig ) == 0 ) {
2892             float_raise( float_flag_invalid );
2893             return float64_default_nan;
2894         }
2895         return packFloat64( zSign, 0x7FF, 0 );
2896     }
2897     if ( aExp == 0 ) {
2898         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2899         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2900     }
2901     if ( bExp == 0 ) {
2902         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2903         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2904     }
2905     zExp = aExp + bExp - 0x3FF;
2906     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2907     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2908     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2909     zSig0 |= ( zSig1 != 0 );
2910     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2911         zSig0 <<= 1;
2912         --zExp;
2913     }
2914     return roundAndPackFloat64( zSign, zExp, zSig0 );
2915
2916 }
2917
2918 /*
2919 -------------------------------------------------------------------------------
2920 Returns the result of dividing the double-precision floating-point value `a'
2921 by the corresponding value `b'.  The operation is performed according to
2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 -------------------------------------------------------------------------------
2924 */
2925 float64 float64_div( float64 a, float64 b )
2926 {
2927     flag aSign, bSign, zSign;
2928     int16 aExp, bExp, zExp;
2929     bits64 aSig, bSig, zSig;
2930     bits64 rem0, rem1;
2931     bits64 term0, term1;
2932
2933     aSig = extractFloat64Frac( a );
2934     aExp = extractFloat64Exp( a );
2935     aSign = extractFloat64Sign( a );
2936     bSig = extractFloat64Frac( b );
2937     bExp = extractFloat64Exp( b );
2938     bSign = extractFloat64Sign( b );
2939     zSign = aSign ^ bSign;
2940     if ( aExp == 0x7FF ) {
2941         if ( aSig ) return propagateFloat64NaN( a, b );
2942         if ( bExp == 0x7FF ) {
2943             if ( bSig ) return propagateFloat64NaN( a, b );
2944             float_raise( float_flag_invalid );
2945             return float64_default_nan;
2946         }
2947         return packFloat64( zSign, 0x7FF, 0 );
2948     }
2949     if ( bExp == 0x7FF ) {
2950         if ( bSig ) return propagateFloat64NaN( a, b );
2951         return packFloat64( zSign, 0, 0 );
2952     }
2953     if ( bExp == 0 ) {
2954         if ( bSig == 0 ) {
2955             if ( ( aExp | aSig ) == 0 ) {
2956                 float_raise( float_flag_invalid );
2957                 return float64_default_nan;
2958             }
2959             float_raise( float_flag_divbyzero );
2960             return packFloat64( zSign, 0x7FF, 0 );
2961         }
2962         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2963     }
2964     if ( aExp == 0 ) {
2965         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2966         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2967     }
2968     zExp = aExp - bExp + 0x3FD;
2969     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2970     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2971     if ( bSig <= ( aSig + aSig ) ) {
2972         aSig >>= 1;
2973         ++zExp;
2974     }
2975     zSig = estimateDiv128To64( aSig, 0, bSig );
2976     if ( ( zSig & 0x1FF ) <= 2 ) {
2977         mul64To128( bSig, zSig, &term0, &term1 );
2978         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2979         while ( (sbits64) rem0 < 0 ) {
2980             --zSig;
2981             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2982         }
2983         zSig |= ( rem1 != 0 );
2984     }
2985     return roundAndPackFloat64( zSign, zExp, zSig );
2986
2987 }
2988
2989 #ifndef SOFTFLOAT_FOR_GCC
2990 /*
2991 -------------------------------------------------------------------------------
2992 Returns the remainder of the double-precision floating-point value `a'
2993 with respect to the corresponding value `b'.  The operation is performed
2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2995 -------------------------------------------------------------------------------
2996 */
2997 float64 float64_rem( float64 a, float64 b )
2998 {
2999     flag aSign, bSign, zSign;
3000     int16 aExp, bExp, expDiff;
3001     bits64 aSig, bSig;
3002     bits64 q, alternateASig;
3003     sbits64 sigMean;
3004
3005     aSig = extractFloat64Frac( a );
3006     aExp = extractFloat64Exp( a );
3007     aSign = extractFloat64Sign( a );
3008     bSig = extractFloat64Frac( b );
3009     bExp = extractFloat64Exp( b );
3010     bSign = extractFloat64Sign( b );
3011     if ( aExp == 0x7FF ) {
3012         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3013             return propagateFloat64NaN( a, b );
3014         }
3015         float_raise( float_flag_invalid );
3016         return float64_default_nan;
3017     }
3018     if ( bExp == 0x7FF ) {
3019         if ( bSig ) return propagateFloat64NaN( a, b );
3020         return a;
3021     }
3022     if ( bExp == 0 ) {
3023         if ( bSig == 0 ) {
3024             float_raise( float_flag_invalid );
3025             return float64_default_nan;
3026         }
3027         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3028     }
3029     if ( aExp == 0 ) {
3030         if ( aSig == 0 ) return a;
3031         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3032     }
3033     expDiff = aExp - bExp;
3034     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3035     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3036     if ( expDiff < 0 ) {
3037         if ( expDiff < -1 ) return a;
3038         aSig >>= 1;
3039     }
3040     q = ( bSig <= aSig );
3041     if ( q ) aSig -= bSig;
3042     expDiff -= 64;
3043     while ( 0 < expDiff ) {
3044         q = estimateDiv128To64( aSig, 0, bSig );
3045         q = ( 2 < q ) ? q - 2 : 0;
3046         aSig = - ( ( bSig>>2 ) * q );
3047         expDiff -= 62;
3048     }
3049     expDiff += 64;
3050     if ( 0 < expDiff ) {
3051         q = estimateDiv128To64( aSig, 0, bSig );
3052         q = ( 2 < q ) ? q - 2 : 0;
3053         q >>= 64 - expDiff;
3054         bSig >>= 2;
3055         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3056     }
3057     else {
3058         aSig >>= 2;
3059         bSig >>= 2;
3060     }
3061     do {
3062         alternateASig = aSig;
3063         ++q;
3064         aSig -= bSig;
3065     } while ( 0 <= (sbits64) aSig );
3066     sigMean = aSig + alternateASig;
3067     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3068         aSig = alternateASig;
3069     }
3070     zSign = ( (sbits64) aSig < 0 );
3071     if ( zSign ) aSig = - aSig;
3072     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3073
3074 }
3075
3076 /*
3077 -------------------------------------------------------------------------------
3078 Returns the square root of the double-precision floating-point value `a'.
3079 The operation is performed according to the IEC/IEEE Standard for Binary
3080 Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3082 */
3083 float64 float64_sqrt( float64 a )
3084 {
3085     flag aSign;
3086     int16 aExp, zExp;
3087     bits64 aSig, zSig, doubleZSig;
3088     bits64 rem0, rem1, term0, term1;
3089
3090     aSig = extractFloat64Frac( a );
3091     aExp = extractFloat64Exp( a );
3092     aSign = extractFloat64Sign( a );
3093     if ( aExp == 0x7FF ) {
3094         if ( aSig ) return propagateFloat64NaN( a, a );
3095         if ( ! aSign ) return a;
3096         float_raise( float_flag_invalid );
3097         return float64_default_nan;
3098     }
3099     if ( aSign ) {
3100         if ( ( aExp | aSig ) == 0 ) return a;
3101         float_raise( float_flag_invalid );
3102         return float64_default_nan;
3103     }
3104     if ( aExp == 0 ) {
3105         if ( aSig == 0 ) return 0;
3106         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3107     }
3108     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3109     aSig |= LIT64( 0x0010000000000000 );
3110     zSig = estimateSqrt32( aExp, aSig>>21 );
3111     aSig <<= 9 - ( aExp & 1 );
3112     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3113     if ( ( zSig & 0x1FF ) <= 5 ) {
3114         doubleZSig = zSig<<1;
3115         mul64To128( zSig, zSig, &term0, &term1 );
3116         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3117         while ( (sbits64) rem0 < 0 ) {
3118             --zSig;
3119             doubleZSig -= 2;
3120             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3121         }
3122         zSig |= ( ( rem0 | rem1 ) != 0 );
3123     }
3124     return roundAndPackFloat64( 0, zExp, zSig );
3125
3126 }
3127 #endif
3128
3129 /*
3130 -------------------------------------------------------------------------------
3131 Returns 1 if the double-precision floating-point value `a' is equal to the
3132 corresponding value `b', and 0 otherwise.  The comparison is performed
3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3134 -------------------------------------------------------------------------------
3135 */
3136 flag float64_eq( float64 a, float64 b )
3137 {
3138
3139     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3141        ) {
3142         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3143             float_raise( float_flag_invalid );
3144         }
3145         return 0;
3146     }
3147     return ( a == b ) ||
3148         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3149
3150 }
3151
3152 /*
3153 -------------------------------------------------------------------------------
3154 Returns 1 if the double-precision floating-point value `a' is less than or
3155 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3156 performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 Arithmetic.
3158 -------------------------------------------------------------------------------
3159 */
3160 flag float64_le( float64 a, float64 b )
3161 {
3162     flag aSign, bSign;
3163
3164     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3165          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3166        ) {
3167         float_raise( float_flag_invalid );
3168         return 0;
3169     }
3170     aSign = extractFloat64Sign( a );
3171     bSign = extractFloat64Sign( b );
3172     if ( aSign != bSign )
3173         return aSign ||
3174             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3175               0 );
3176     return ( a == b ) ||
3177         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3178
3179 }
3180
3181 /*
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than
3184 the corresponding value `b', and 0 otherwise.  The comparison is performed
3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3186 -------------------------------------------------------------------------------
3187 */
3188 flag float64_lt( float64 a, float64 b )
3189 {
3190     flag aSign, bSign;
3191
3192     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3193          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3194        ) {
3195         float_raise( float_flag_invalid );
3196         return 0;
3197     }
3198     aSign = extractFloat64Sign( a );
3199     bSign = extractFloat64Sign( b );
3200     if ( aSign != bSign )
3201         return aSign &&
3202             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3203               0 );
3204     return ( a != b ) &&
3205         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3206
3207 }
3208
3209 #ifndef SOFTFLOAT_FOR_GCC
3210 /*
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is equal to the
3213 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3214 if either operand is a NaN.  Otherwise, the comparison is performed
3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3217 */
3218 flag float64_eq_signaling( float64 a, float64 b )
3219 {
3220
3221     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223        ) {
3224         float_raise( float_flag_invalid );
3225         return 0;
3226     }
3227     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3228
3229 }
3230
3231 /*
3232 -------------------------------------------------------------------------------
3233 Returns 1 if the double-precision floating-point value `a' is less than or
3234 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3235 cause an exception.  Otherwise, the comparison is performed according to the
3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3237 -------------------------------------------------------------------------------
3238 */
3239 flag float64_le_quiet( float64 a, float64 b )
3240 {
3241     flag aSign, bSign;
3242
3243     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3244          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3245        ) {
3246         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3247             float_raise( float_flag_invalid );
3248         }
3249         return 0;
3250     }
3251     aSign = extractFloat64Sign( a );
3252     bSign = extractFloat64Sign( b );
3253     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3254     return ( a == b ) || ( aSign ^ ( a < b ) );
3255
3256 }
3257
3258 /*
3259 -------------------------------------------------------------------------------
3260 Returns 1 if the double-precision floating-point value `a' is less than
3261 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3262 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3263 Standard for Binary Floating-Point Arithmetic.
3264 -------------------------------------------------------------------------------
3265 */
3266 flag float64_lt_quiet( float64 a, float64 b )
3267 {
3268     flag aSign, bSign;
3269
3270     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3271          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3272        ) {
3273         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3274             float_raise( float_flag_invalid );
3275         }
3276         return 0;
3277     }
3278     aSign = extractFloat64Sign( a );
3279     bSign = extractFloat64Sign( b );
3280     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3281     return ( a != b ) && ( aSign ^ ( a < b ) );
3282
3283 }
3284 #endif
3285
3286 #ifdef FLOATX80
3287
3288 /*
3289 -------------------------------------------------------------------------------
3290 Returns the result of converting the extended double-precision floating-
3291 point value `a' to the 32-bit two's complement integer format.  The
3292 conversion is performed according to the IEC/IEEE Standard for Binary
3293 Floating-Point Arithmetic---which means in particular that the conversion
3294 is rounded according to the current rounding mode.  If `a' is a NaN, the
3295 largest positive integer is returned.  Otherwise, if the conversion
3296 overflows, the largest integer with the same sign as `a' is returned.
3297 -------------------------------------------------------------------------------
3298 */
3299 int32 floatx80_to_int32( floatx80 a )
3300 {
3301     flag aSign;
3302     int32 aExp, shiftCount;
3303     bits64 aSig;
3304
3305     aSig = extractFloatx80Frac( a );
3306     aExp = extractFloatx80Exp( a );
3307     aSign = extractFloatx80Sign( a );
3308     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3309     shiftCount = 0x4037 - aExp;
3310     if ( shiftCount <= 0 ) shiftCount = 1;
3311     shift64RightJamming( aSig, shiftCount, &aSig );
3312     return roundAndPackInt32( aSign, aSig );
3313
3314 }
3315
3316 /*
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 32-bit two's complement integer format.  The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic, except that the conversion is always rounded
3322 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3323 Otherwise, if the conversion overflows, the largest integer with the same
3324 sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3326 */
3327 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3328 {
3329     flag aSign;
3330     int32 aExp, shiftCount;
3331     bits64 aSig, savedASig;
3332     int32 z;
3333
3334     aSig = extractFloatx80Frac( a );
3335     aExp = extractFloatx80Exp( a );
3336     aSign = extractFloatx80Sign( a );
3337     if ( 0x401E < aExp ) {
3338         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3339         goto invalid;
3340     }
3341     else if ( aExp < 0x3FFF ) {
3342         if ( aExp || aSig ) set_float_exception_inexact_flag();
3343         return 0;
3344     }
3345     shiftCount = 0x403E - aExp;
3346     savedASig = aSig;
3347     aSig >>= shiftCount;
3348     z = aSig;
3349     if ( aSign ) z = - z;
3350     if ( ( z < 0 ) ^ aSign ) {
3351  invalid:
3352         float_raise( float_flag_invalid );
3353         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3354     }
3355     if ( ( aSig<<shiftCount ) != savedASig ) {
3356         set_float_exception_inexact_flag();
3357     }
3358     return z;
3359
3360 }
3361
3362 /*
3363 -------------------------------------------------------------------------------
3364 Returns the result of converting the extended double-precision floating-
3365 point value `a' to the 64-bit two's complement integer format.  The
3366 conversion is performed according to the IEC/IEEE Standard for Binary
3367 Floating-Point Arithmetic---which means in particular that the conversion
3368 is rounded according to the current rounding mode.  If `a' is a NaN,
3369 the largest positive integer is returned.  Otherwise, if the conversion
3370 overflows, the largest integer with the same sign as `a' is returned.
3371 -------------------------------------------------------------------------------
3372 */
3373 int64 floatx80_to_int64( floatx80 a )
3374 {
3375     flag aSign;
3376     int32 aExp, shiftCount;
3377     bits64 aSig, aSigExtra;
3378
3379     aSig = extractFloatx80Frac( a );
3380     aExp = extractFloatx80Exp( a );
3381     aSign = extractFloatx80Sign( a );
3382     shiftCount = 0x403E - aExp;
3383     if ( shiftCount <= 0 ) {
3384         if ( shiftCount ) {
3385             float_raise( float_flag_invalid );
3386             if (    ! aSign
3387                  || (    ( aExp == 0x7FFF )
3388                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3389                ) {
3390                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3391             }
3392             return (sbits64) LIT64( 0x8000000000000000 );
3393         }
3394         aSigExtra = 0;
3395     }
3396     else {
3397         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3398     }
3399     return roundAndPackInt64( aSign, aSig, aSigExtra );
3400
3401 }
3402
3403 /*
3404 -------------------------------------------------------------------------------
3405 Returns the result of converting the extended double-precision floating-
3406 point value `a' to the 64-bit two's complement integer format.  The
3407 conversion is performed according to the IEC/IEEE Standard for Binary
3408 Floating-Point Arithmetic, except that the conversion is always rounded
3409 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3410 Otherwise, if the conversion overflows, the largest integer with the same
3411 sign as `a' is returned.
3412 -------------------------------------------------------------------------------
3413 */
3414 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3415 {
3416     flag aSign;
3417     int32 aExp, shiftCount;
3418     bits64 aSig;
3419     int64 z;
3420
3421     aSig = extractFloatx80Frac( a );
3422     aExp = extractFloatx80Exp( a );
3423     aSign = extractFloatx80Sign( a );
3424     shiftCount = aExp - 0x403E;
3425     if ( 0 <= shiftCount ) {
3426         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3427         if ( ( a.high != 0xC03E ) || aSig ) {
3428             float_raise( float_flag_invalid );
3429             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3430                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3431             }
3432         }
3433         return (sbits64) LIT64( 0x8000000000000000 );
3434     }
3435     else if ( aExp < 0x3FFF ) {
3436         if ( aExp | aSig ) set_float_exception_inexact_flag();
3437         return 0;
3438     }
3439     z = aSig>>( - shiftCount );
3440     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3441         set_float_exception_inexact_flag();
3442     }
3443     if ( aSign ) z = - z;
3444     return z;
3445
3446 }
3447
3448 /*
3449 -------------------------------------------------------------------------------
3450 Returns the result of converting the extended double-precision floating-
3451 point value `a' to the single-precision floating-point format.  The
3452 conversion is performed according to the IEC/IEEE Standard for Binary
3453 Floating-Point Arithmetic.
3454 -------------------------------------------------------------------------------
3455 */
3456 float32 floatx80_to_float32( floatx80 a )
3457 {
3458     flag aSign;
3459     int32 aExp;
3460     bits64 aSig;
3461
3462     aSig = extractFloatx80Frac( a );
3463     aExp = extractFloatx80Exp( a );
3464     aSign = extractFloatx80Sign( a );
3465     if ( aExp == 0x7FFF ) {
3466         if ( (bits64) ( aSig<<1 ) ) {
3467             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3468         }
3469         return packFloat32( aSign, 0xFF, 0 );
3470     }
3471     shift64RightJamming( aSig, 33, &aSig );
3472     if ( aExp || aSig ) aExp -= 0x3F81;
3473     return roundAndPackFloat32( aSign, aExp, aSig );
3474
3475 }
3476
3477 /*
3478 -------------------------------------------------------------------------------
3479 Returns the result of converting the extended double-precision floating-
3480 point value `a' to the double-precision floating-point format.  The
3481 conversion is performed according to the IEC/IEEE Standard for Binary
3482 Floating-Point Arithmetic.
3483 -------------------------------------------------------------------------------
3484 */
3485 float64 floatx80_to_float64( floatx80 a )
3486 {
3487     flag aSign;
3488     int32 aExp;
3489     bits64 aSig, zSig;
3490
3491     aSig = extractFloatx80Frac( a );
3492     aExp = extractFloatx80Exp( a );
3493     aSign = extractFloatx80Sign( a );
3494     if ( aExp == 0x7FFF ) {
3495         if ( (bits64) ( aSig<<1 ) ) {
3496             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3497         }
3498         return packFloat64( aSign, 0x7FF, 0 );
3499     }
3500     shift64RightJamming( aSig, 1, &zSig );
3501     if ( aExp || aSig ) aExp -= 0x3C01;
3502     return roundAndPackFloat64( aSign, aExp, zSig );
3503
3504 }
3505
3506 #ifdef FLOAT128
3507
3508 /*
3509 -------------------------------------------------------------------------------
3510 Returns the result of converting the extended double-precision floating-
3511 point value `a' to the quadruple-precision floating-point format.  The
3512 conversion is performed according to the IEC/IEEE Standard for Binary
3513 Floating-Point Arithmetic.
3514 -------------------------------------------------------------------------------
3515 */
3516 float128 floatx80_to_float128( floatx80 a )
3517 {
3518     flag aSign;
3519     int16 aExp;
3520     bits64 aSig, zSig0, zSig1;
3521
3522     aSig = extractFloatx80Frac( a );
3523     aExp = extractFloatx80Exp( a );
3524     aSign = extractFloatx80Sign( a );
3525     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3526         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3527     }
3528     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3529     return packFloat128( aSign, aExp, zSig0, zSig1 );
3530
3531 }
3532
3533 #endif
3534
3535 /*
3536 -------------------------------------------------------------------------------
3537 Rounds the extended double-precision floating-point value `a' to an integer,
3538 and returns the result as an extended quadruple-precision floating-point
3539 value.  The operation is performed according to the IEC/IEEE Standard for
3540 Binary Floating-Point Arithmetic.
3541 -------------------------------------------------------------------------------
3542 */
3543 floatx80 floatx80_round_to_int( floatx80 a )
3544 {
3545     flag aSign;
3546     int32 aExp;
3547     bits64 lastBitMask, roundBitsMask;
3548     int8 roundingMode;
3549     floatx80 z;
3550
3551     aExp = extractFloatx80Exp( a );
3552     if ( 0x403E <= aExp ) {
3553         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3554             return propagateFloatx80NaN( a, a );
3555         }
3556         return a;
3557     }
3558     if ( aExp < 0x3FFF ) {
3559         if (    ( aExp == 0 )
3560              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3561             return a;
3562         }
3563         set_float_exception_inexact_flag();
3564         aSign = extractFloatx80Sign( a );
3565         switch ( float_rounding_mode ) {
3566          case float_round_nearest_even:
3567             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3568                ) {
3569                 return
3570                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3571             }
3572             break;
3573          case float_round_to_zero:
3574             break;
3575          case float_round_down:
3576             return
3577                   aSign ?
3578                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3579                 : packFloatx80( 0, 0, 0 );
3580          case float_round_up:
3581             return
3582                   aSign ? packFloatx80( 1, 0, 0 )
3583                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3584         }
3585         return packFloatx80( aSign, 0, 0 );
3586     }
3587     lastBitMask = 1;
3588     lastBitMask <<= 0x403E - aExp;
3589     roundBitsMask = lastBitMask - 1;
3590     z = a;
3591     roundingMode = float_rounding_mode;
3592     if ( roundingMode == float_round_nearest_even ) {
3593         z.low += lastBitMask>>1;
3594         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3595     }
3596     else if ( roundingMode != float_round_to_zero ) {
3597         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3598             z.low += roundBitsMask;
3599         }
3600     }
3601     z.low &= ~ roundBitsMask;
3602     if ( z.low == 0 ) {
3603         ++z.high;
3604         z.low = LIT64( 0x8000000000000000 );
3605     }
3606     if ( z.low != a.low ) set_float_exception_inexact_flag();
3607     return z;
3608
3609 }
3610
3611 /*
3612 -------------------------------------------------------------------------------
3613 Returns the result of adding the absolute values of the extended double-
3614 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3615 negated before being returned.  `zSign' is ignored if the result is a NaN.
3616 The addition is performed according to the IEC/IEEE Standard for Binary
3617 Floating-Point Arithmetic.
3618 -------------------------------------------------------------------------------
3619 */
3620 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3621 {
3622     int32 aExp, bExp, zExp;
3623     bits64 aSig, bSig, zSig0, zSig1;
3624     int32 expDiff;
3625
3626     aSig = extractFloatx80Frac( a );
3627     aExp = extractFloatx80Exp( a );
3628     bSig = extractFloatx80Frac( b );
3629     bExp = extractFloatx80Exp( b );
3630     expDiff = aExp - bExp;
3631     if ( 0 < expDiff ) {
3632         if ( aExp == 0x7FFF ) {
3633             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3634             return a;
3635         }
3636         if ( bExp == 0 ) --expDiff;
3637         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3638         zExp = aExp;
3639     }
3640     else if ( expDiff < 0 ) {
3641         if ( bExp == 0x7FFF ) {
3642             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3643             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3644         }
3645         if ( aExp == 0 ) ++expDiff;
3646         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3647         zExp = bExp;
3648     }
3649     else {
3650         if ( aExp == 0x7FFF ) {
3651             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652                 return propagateFloatx80NaN( a, b );
3653             }
3654             return a;
3655         }
3656         zSig1 = 0;
3657         zSig0 = aSig + bSig;
3658         if ( aExp == 0 ) {
3659             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3660             goto roundAndPack;
3661         }
3662         zExp = aExp;
3663         goto shiftRight1;
3664     }
3665     zSig0 = aSig + bSig;
3666     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3667  shiftRight1:
3668     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3669     zSig0 |= LIT64( 0x8000000000000000 );
3670     ++zExp;
3671  roundAndPack:
3672     return
3673         roundAndPackFloatx80(
3674             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3675
3676 }
3677
3678 /*
3679 -------------------------------------------------------------------------------
3680 Returns the result of subtracting the absolute values of the extended
3681 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3682 difference is negated before being returned.  `zSign' is ignored if the
3683 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3684 Standard for Binary Floating-Point Arithmetic.
3685 -------------------------------------------------------------------------------
3686 */
3687 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3688 {
3689     int32 aExp, bExp, zExp;
3690     bits64 aSig, bSig, zSig0, zSig1;
3691     int32 expDiff;
3692     floatx80 z;
3693
3694     aSig = extractFloatx80Frac( a );
3695     aExp = extractFloatx80Exp( a );
3696     bSig = extractFloatx80Frac( b );
3697     bExp = extractFloatx80Exp( b );
3698     expDiff = aExp - bExp;
3699     if ( 0 < expDiff ) goto aExpBigger;
3700     if ( expDiff < 0 ) goto bExpBigger;
3701     if ( aExp == 0x7FFF ) {
3702         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3703             return propagateFloatx80NaN( a, b );
3704         }
3705         float_raise( float_flag_invalid );
3706         z.low = floatx80_default_nan_low;
3707         z.high = floatx80_default_nan_high;
3708         return z;
3709     }
3710     if ( aExp == 0 ) {
3711         aExp = 1;
3712         bExp = 1;
3713     }
3714     zSig1 = 0;
3715     if ( bSig < aSig ) goto aBigger;
3716     if ( aSig < bSig ) goto bBigger;
3717     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3718  bExpBigger:
3719     if ( bExp == 0x7FFF ) {
3720         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3721         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3722     }
3723     if ( aExp == 0 ) ++expDiff;
3724     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3725  bBigger:
3726     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3727     zExp = bExp;
3728     zSign ^= 1;
3729     goto normalizeRoundAndPack;
3730  aExpBigger:
3731     if ( aExp == 0x7FFF ) {
3732         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3733         return a;
3734     }
3735     if ( bExp == 0 ) --expDiff;
3736     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3737  aBigger:
3738     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3739     zExp = aExp;
3740  normalizeRoundAndPack:
3741     return
3742         normalizeRoundAndPackFloatx80(
3743             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3744
3745 }
3746
3747 /*
3748 -------------------------------------------------------------------------------
3749 Returns the result of adding the extended double-precision floating-point
3750 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3751 Standard for Binary Floating-Point Arithmetic.
3752 -------------------------------------------------------------------------------
3753 */
3754 floatx80 floatx80_add( floatx80 a, floatx80 b )
3755 {
3756     flag aSign, bSign;
3757
3758     aSign = extractFloatx80Sign( a );
3759     bSign = extractFloatx80Sign( b );
3760     if ( aSign == bSign ) {
3761         return addFloatx80Sigs( a, b, aSign );
3762     }
3763     else {
3764         return subFloatx80Sigs( a, b, aSign );
3765     }
3766
3767 }
3768
3769 /*
3770 -------------------------------------------------------------------------------
3771 Returns the result of subtracting the extended double-precision floating-
3772 point values `a' and `b'.  The operation is performed according to the
3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3774 -------------------------------------------------------------------------------
3775 */
3776 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3777 {
3778     flag aSign, bSign;
3779
3780     aSign = extractFloatx80Sign( a );
3781     bSign = extractFloatx80Sign( b );
3782     if ( aSign == bSign ) {
3783         return subFloatx80Sigs( a, b, aSign );
3784     }
3785     else {
3786         return addFloatx80Sigs( a, b, aSign );
3787     }
3788
3789 }
3790
3791 /*
3792 -------------------------------------------------------------------------------
3793 Returns the result of multiplying the extended double-precision floating-
3794 point values `a' and `b'.  The operation is performed according to the
3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 -------------------------------------------------------------------------------
3797 */
3798 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3799 {
3800     flag aSign, bSign, zSign;
3801     int32 aExp, bExp, zExp;
3802     bits64 aSig, bSig, zSig0, zSig1;
3803     floatx80 z;
3804
3805     aSig = extractFloatx80Frac( a );
3806     aExp = extractFloatx80Exp( a );
3807     aSign = extractFloatx80Sign( a );
3808     bSig = extractFloatx80Frac( b );
3809     bExp = extractFloatx80Exp( b );
3810     bSign = extractFloatx80Sign( b );
3811     zSign = aSign ^ bSign;
3812     if ( aExp == 0x7FFF ) {
3813         if (    (bits64) ( aSig<<1 )
3814              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3815             return propagateFloatx80NaN( a, b );
3816         }
3817         if ( ( bExp | bSig ) == 0 ) goto invalid;
3818         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3819     }
3820     if ( bExp == 0x7FFF ) {
3821         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822         if ( ( aExp | aSig ) == 0 ) {
3823  invalid:
3824             float_raise( float_flag_invalid );
3825             z.low = floatx80_default_nan_low;
3826             z.high = floatx80_default_nan_high;
3827             return z;
3828         }
3829         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3830     }
3831     if ( aExp == 0 ) {
3832         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3834     }
3835     if ( bExp == 0 ) {
3836         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3837         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3838     }
3839     zExp = aExp + bExp - 0x3FFE;
3840     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841     if ( 0 < (sbits64) zSig0 ) {
3842         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3843         --zExp;
3844     }
3845     return
3846         roundAndPackFloatx80(
3847             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3848
3849 }
3850
3851 /*
3852 -------------------------------------------------------------------------------
3853 Returns the result of dividing the extended double-precision floating-point
3854 value `a' by the corresponding value `b'.  The operation is performed
3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 -------------------------------------------------------------------------------
3857 */
3858 floatx80 floatx80_div( floatx80 a, floatx80 b )
3859 {
3860     flag aSign, bSign, zSign;
3861     int32 aExp, bExp, zExp;
3862     bits64 aSig, bSig, zSig0, zSig1;
3863     bits64 rem0, rem1, rem2, term0, term1, term2;
3864     floatx80 z;
3865
3866     aSig = extractFloatx80Frac( a );
3867     aExp = extractFloatx80Exp( a );
3868     aSign = extractFloatx80Sign( a );
3869     bSig = extractFloatx80Frac( b );
3870     bExp = extractFloatx80Exp( b );
3871     bSign = extractFloatx80Sign( b );
3872     zSign = aSign ^ bSign;
3873     if ( aExp == 0x7FFF ) {
3874         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875         if ( bExp == 0x7FFF ) {
3876             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877             goto invalid;
3878         }
3879         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3880     }
3881     if ( bExp == 0x7FFF ) {
3882         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3883         return packFloatx80( zSign, 0, 0 );
3884     }
3885     if ( bExp == 0 ) {
3886         if ( bSig == 0 ) {
3887             if ( ( aExp | aSig ) == 0 ) {
3888  invalid:
3889                 float_raise( float_flag_invalid );
3890                 z.low = floatx80_default_nan_low;
3891                 z.high = floatx80_default_nan_high;
3892                 return z;
3893             }
3894             float_raise( float_flag_divbyzero );
3895             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3896         }
3897         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3898     }
3899     if ( aExp == 0 ) {
3900         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3901         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3902     }
3903     zExp = aExp - bExp + 0x3FFE;
3904     rem1 = 0;
3905     if ( bSig <= aSig ) {
3906         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3907         ++zExp;
3908     }
3909     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3910     mul64To128( bSig, zSig0, &term0, &term1 );
3911     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3912     while ( (sbits64) rem0 < 0 ) {
3913         --zSig0;
3914         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3915     }
3916     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3917     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3918         mul64To128( bSig, zSig1, &term1, &term2 );
3919         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3920         while ( (sbits64) rem1 < 0 ) {
3921             --zSig1;
3922             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3923         }
3924         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3925     }
3926     return
3927         roundAndPackFloatx80(
3928             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3929
3930 }
3931
3932 /*
3933 -------------------------------------------------------------------------------
3934 Returns the remainder of the extended double-precision floating-point value
3935 `a' with respect to the corresponding value `b'.  The operation is performed
3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3937 -------------------------------------------------------------------------------
3938 */
3939 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3940 {
3941     flag aSign, bSign, zSign;
3942     int32 aExp, bExp, expDiff;
3943     bits64 aSig0, aSig1, bSig;
3944     bits64 q, term0, term1, alternateASig0, alternateASig1;
3945     floatx80 z;
3946
3947     aSig0 = extractFloatx80Frac( a );
3948     aExp = extractFloatx80Exp( a );
3949     aSign = extractFloatx80Sign( a );
3950     bSig = extractFloatx80Frac( b );
3951     bExp = extractFloatx80Exp( b );
3952     bSign = extractFloatx80Sign( b );
3953     if ( aExp == 0x7FFF ) {
3954         if (    (bits64) ( aSig0<<1 )
3955              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3956             return propagateFloatx80NaN( a, b );
3957         }
3958         goto invalid;
3959     }
3960     if ( bExp == 0x7FFF ) {
3961         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3962         return a;
3963     }
3964     if ( bExp == 0 ) {
3965         if ( bSig == 0 ) {
3966  invalid:
3967             float_raise( float_flag_invalid );
3968             z.low = floatx80_default_nan_low;
3969             z.high = floatx80_default_nan_high;
3970             return z;
3971         }
3972         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3973     }
3974     if ( aExp == 0 ) {
3975         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3976         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3977     }
3978     bSig |= LIT64( 0x8000000000000000 );
3979     zSign = aSign;
3980     expDiff = aExp - bExp;
3981     aSig1 = 0;
3982     if ( expDiff < 0 ) {
3983         if ( expDiff < -1 ) return a;
3984         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3985         expDiff = 0;
3986     }
3987     q = ( bSig <= aSig0 );
3988     if ( q ) aSig0 -= bSig;
3989     expDiff -= 64;
3990     while ( 0 < expDiff ) {
3991         q = estimateDiv128To64( aSig0, aSig1, bSig );
3992         q = ( 2 < q ) ? q - 2 : 0;
3993         mul64To128( bSig, q, &term0, &term1 );
3994         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3995         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3996         expDiff -= 62;
3997     }
3998     expDiff += 64;
3999     if ( 0 < expDiff ) {
4000         q = estimateDiv128To64( aSig0, aSig1, bSig );
4001         q = ( 2 < q ) ? q - 2 : 0;
4002         q >>= 64 - expDiff;
4003         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4004         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4005         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4006         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4007             ++q;
4008             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4009         }
4010     }
4011     else {
4012         term1 = 0;
4013         term0 = bSig;
4014     }
4015     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4016     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4017          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4018               && ( q & 1 ) )
4019        ) {
4020         aSig0 = alternateASig0;
4021         aSig1 = alternateASig1;
4022         zSign = ! zSign;
4023     }
4024     return
4025         normalizeRoundAndPackFloatx80(
4026             80, zSign, bExp + expDiff, aSig0, aSig1 );
4027
4028 }
4029
4030 /*
4031 -------------------------------------------------------------------------------
4032 Returns the square root of the extended double-precision floating-point
4033 value `a'.  The operation is performed according to the IEC/IEEE Standard
4034 for Binary Floating-Point Arithmetic.
4035 -------------------------------------------------------------------------------
4036 */
4037 floatx80 floatx80_sqrt( floatx80 a )
4038 {
4039     flag aSign;
4040     int32 aExp, zExp;
4041     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4042     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4043     floatx80 z;
4044
4045     aSig0 = extractFloatx80Frac( a );
4046     aExp = extractFloatx80Exp( a );
4047     aSign = extractFloatx80Sign( a );
4048     if ( aExp == 0x7FFF ) {
4049         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4050         if ( ! aSign ) return a;
4051         goto invalid;
4052     }
4053     if ( aSign ) {
4054         if ( ( aExp | aSig0 ) == 0 ) return a;
4055  invalid:
4056         float_raise( float_flag_invalid );
4057         z.low = floatx80_default_nan_low;
4058         z.high = floatx80_default_nan_high;
4059         return z;
4060     }
4061     if ( aExp == 0 ) {
4062         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4063         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4064     }
4065     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4066     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4067     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4068     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4069     doubleZSig0 = zSig0<<1;
4070     mul64To128( zSig0, zSig0, &term0, &term1 );
4071     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4072     while ( (sbits64) rem0 < 0 ) {
4073         --zSig0;
4074         doubleZSig0 -= 2;
4075         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4076     }
4077     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4078     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4079         if ( zSig1 == 0 ) zSig1 = 1;
4080         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4081         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4082         mul64To128( zSig1, zSig1, &term2, &term3 );
4083         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4084         while ( (sbits64) rem1 < 0 ) {
4085             --zSig1;
4086             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4087             term3 |= 1;
4088             term2 |= doubleZSig0;
4089             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4090         }
4091         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4092     }
4093     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4094     zSig0 |= doubleZSig0;
4095     return
4096         roundAndPackFloatx80(
4097             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4098
4099 }
4100
4101 /*
4102 -------------------------------------------------------------------------------
4103 Returns 1 if the extended double-precision floating-point value `a' is
4104 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4105 performed according to the IEC/IEEE Standard for Binary Floating-Point
4106 Arithmetic.
4107 -------------------------------------------------------------------------------
4108 */
4109 flag floatx80_eq( floatx80 a, floatx80 b )
4110 {
4111
4112     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4113               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4114          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4115               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4116        ) {
4117         if (    floatx80_is_signaling_nan( a )
4118              || floatx80_is_signaling_nan( b ) ) {
4119             float_raise( float_flag_invalid );
4120         }
4121         return 0;
4122     }
4123     return
4124            ( a.low == b.low )
4125         && (    ( a.high == b.high )
4126              || (    ( a.low == 0 )
4127                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4128            );
4129
4130 }
4131
4132 /*
4133 -------------------------------------------------------------------------------
4134 Returns 1 if the extended double-precision floating-point value `a' is
4135 less than or equal to the corresponding value `b', and 0 otherwise.  The
4136 comparison is performed according to the IEC/IEEE Standard for Binary
4137 Floating-Point Arithmetic.
4138 -------------------------------------------------------------------------------
4139 */
4140 flag floatx80_le( floatx80 a, floatx80 b )
4141 {
4142     flag aSign, bSign;
4143
4144     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4145               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4146          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4147               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4148        ) {
4149         float_raise( float_flag_invalid );
4150         return 0;
4151     }
4152     aSign = extractFloatx80Sign( a );
4153     bSign = extractFloatx80Sign( b );
4154     if ( aSign != bSign ) {
4155         return
4156                aSign
4157             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4158                  == 0 );
4159     }
4160     return
4161           aSign ? le128( b.high, b.low, a.high, a.low )
4162         : le128( a.high, a.low, b.high, b.low );
4163
4164 }
4165
4166 /*
4167 -------------------------------------------------------------------------------
4168 Returns 1 if the extended double-precision floating-point value `a' is
4169 less than the corresponding value `b', and 0 otherwise.  The comparison
4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4171 Arithmetic.
4172 -------------------------------------------------------------------------------
4173 */
4174 flag floatx80_lt( floatx80 a, floatx80 b )
4175 {
4176     flag aSign, bSign;
4177
4178     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4179               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4180          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4181               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4182        ) {
4183         float_raise( float_flag_invalid );
4184         return 0;
4185     }
4186     aSign = extractFloatx80Sign( a );
4187     bSign = extractFloatx80Sign( b );
4188     if ( aSign != bSign ) {
4189         return
4190                aSign
4191             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4192                  != 0 );
4193     }
4194     return
4195           aSign ? lt128( b.high, b.low, a.high, a.low )
4196         : lt128( a.high, a.low, b.high, b.low );
4197
4198 }
4199
4200 /*
4201 -------------------------------------------------------------------------------
4202 Returns 1 if the extended double-precision floating-point value `a' is equal
4203 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4204 raised if either operand is a NaN.  Otherwise, the comparison is performed
4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4206 -------------------------------------------------------------------------------
4207 */
4208 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4209 {
4210
4211     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4212               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4213          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4214               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4215        ) {
4216         float_raise( float_flag_invalid );
4217         return 0;
4218     }
4219     return
4220            ( a.low == b.low )
4221         && (    ( a.high == b.high )
4222              || (    ( a.low == 0 )
4223                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4224            );
4225
4226 }
4227
4228 /*
4229 -------------------------------------------------------------------------------
4230 Returns 1 if the extended double-precision floating-point value `a' is less
4231 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4232 do not cause an exception.  Otherwise, the comparison is performed according
4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 -------------------------------------------------------------------------------
4235 */
4236 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4237 {
4238     flag aSign, bSign;
4239
4240     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4241               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4242          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4243               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4244        ) {
4245         if (    floatx80_is_signaling_nan( a )
4246              || floatx80_is_signaling_nan( b ) ) {
4247             float_raise( float_flag_invalid );
4248         }
4249         return 0;
4250     }
4251     aSign = extractFloatx80Sign( a );
4252     bSign = extractFloatx80Sign( b );
4253     if ( aSign != bSign ) {
4254         return
4255                aSign
4256             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4257                  == 0 );
4258     }
4259     return
4260           aSign ? le128( b.high, b.low, a.high, a.low )
4261         : le128( a.high, a.low, b.high, b.low );
4262
4263 }
4264
4265 /*
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4269 an exception.  Otherwise, the comparison is performed according to the
4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4272 */
4273 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4274 {
4275     flag aSign, bSign;
4276
4277     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4278               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4280               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4281        ) {
4282         if (    floatx80_is_signaling_nan( a )
4283              || floatx80_is_signaling_nan( b ) ) {
4284             float_raise( float_flag_invalid );
4285         }
4286         return 0;
4287     }
4288     aSign = extractFloatx80Sign( a );
4289     bSign = extractFloatx80Sign( b );
4290     if ( aSign != bSign ) {
4291         return
4292                aSign
4293             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294                  != 0 );
4295     }
4296     return
4297           aSign ? lt128( b.high, b.low, a.high, a.low )
4298         : lt128( a.high, a.low, b.high, b.low );
4299
4300 }
4301
4302 #endif
4303
4304 #ifdef FLOAT128
4305
4306 /*
4307 -------------------------------------------------------------------------------
4308 Returns the result of converting the quadruple-precision floating-point
4309 value `a' to the 32-bit two's complement integer format.  The conversion
4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4311 Arithmetic---which means in particular that the conversion is rounded
4312 according to the current rounding mode.  If `a' is a NaN, the largest
4313 positive integer is returned.  Otherwise, if the conversion overflows, the
4314 largest integer with the same sign as `a' is returned.
4315 -------------------------------------------------------------------------------
4316 */
4317 int32 float128_to_int32( float128 a )
4318 {
4319     flag aSign;
4320     int32 aExp, shiftCount;
4321     bits64 aSig0, aSig1;
4322
4323     aSig1 = extractFloat128Frac1( a );
4324     aSig0 = extractFloat128Frac0( a );
4325     aExp = extractFloat128Exp( a );
4326     aSign = extractFloat128Sign( a );
4327     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4328     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4329     aSig0 |= ( aSig1 != 0 );
4330     shiftCount = 0x4028 - aExp;
4331     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4332     return roundAndPackInt32( aSign, aSig0 );
4333
4334 }
4335
4336 /*
4337 -------------------------------------------------------------------------------
4338 Returns the result of converting the quadruple-precision floating-point
4339 value `a' to the 32-bit two's complement integer format.  The conversion
4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4341 Arithmetic, except that the conversion is always rounded toward zero.  If
4342 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4343 conversion overflows, the largest integer with the same sign as `a' is
4344 returned.
4345 -------------------------------------------------------------------------------
4346 */
4347 int32 float128_to_int32_round_to_zero( float128 a )
4348 {
4349     flag aSign;
4350     int32 aExp, shiftCount;
4351     bits64 aSig0, aSig1, savedASig;
4352     int32 z;
4353
4354     aSig1 = extractFloat128Frac1( a );
4355     aSig0 = extractFloat128Frac0( a );
4356     aExp = extractFloat128Exp( a );
4357     aSign = extractFloat128Sign( a );
4358     aSig0 |= ( aSig1 != 0 );
4359     if ( 0x401E < aExp ) {
4360         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4361         goto invalid;
4362     }
4363     else if ( aExp < 0x3FFF ) {
4364         if ( aExp || aSig0 ) set_float_exception_inexact_flag();
4365         return 0;
4366     }
4367     aSig0 |= LIT64( 0x0001000000000000 );
4368     shiftCount = 0x402F - aExp;
4369     savedASig = aSig0;
4370     aSig0 >>= shiftCount;
4371     z = (int32)aSig0;
4372     if ( aSign ) z = - z;
4373     if ( ( z < 0 ) ^ aSign ) {
4374  invalid:
4375         float_raise( float_flag_invalid );
4376         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4377     }
4378     if ( ( aSig0<<shiftCount ) != savedASig ) {
4379         set_float_exception_inexact_flag();
4380     }
4381     return z;
4382
4383 }
4384
4385 /*
4386 -------------------------------------------------------------------------------
4387 Returns the result of converting the quadruple-precision floating-point
4388 value `a' to the 64-bit two's complement integer format.  The conversion
4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390 Arithmetic---which means in particular that the conversion is rounded
4391 according to the current rounding mode.  If `a' is a NaN, the largest
4392 positive integer is returned.  Otherwise, if the conversion overflows, the
4393 largest integer with the same sign as `a' is returned.
4394 -------------------------------------------------------------------------------
4395 */
4396 int64 float128_to_int64( float128 a )
4397 {
4398     flag aSign;
4399     int32 aExp, shiftCount;
4400     bits64 aSig0, aSig1;
4401
4402     aSig1 = extractFloat128Frac1( a );
4403     aSig0 = extractFloat128Frac0( a );
4404     aExp = extractFloat128Exp( a );
4405     aSign = extractFloat128Sign( a );
4406     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4407     shiftCount = 0x402F - aExp;
4408     if ( shiftCount <= 0 ) {
4409         if ( 0x403E < aExp ) {
4410             float_raise( float_flag_invalid );
4411             if (    ! aSign
4412                  || (    ( aExp == 0x7FFF )
4413                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4414                     )
4415                ) {
4416                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4417             }
4418             return (sbits64) LIT64( 0x8000000000000000 );
4419         }
4420         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4421     }
4422     else {
4423         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4424     }
4425     return roundAndPackInt64( aSign, aSig0, aSig1 );
4426
4427 }
4428
4429 /*
4430 -------------------------------------------------------------------------------
4431 Returns the result of converting the quadruple-precision floating-point
4432 value `a' to the 64-bit two's complement integer format.  The conversion
4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4434 Arithmetic, except that the conversion is always rounded toward zero.
4435 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4436 the conversion overflows, the largest integer with the same sign as `a' is
4437 returned.
4438 -------------------------------------------------------------------------------
4439 */
4440 int64 float128_to_int64_round_to_zero( float128 a )
4441 {
4442     flag aSign;
4443     int32 aExp, shiftCount;
4444     bits64 aSig0, aSig1;
4445     int64 z;
4446
4447     aSig1 = extractFloat128Frac1( a );
4448     aSig0 = extractFloat128Frac0( a );
4449     aExp = extractFloat128Exp( a );
4450     aSign = extractFloat128Sign( a );
4451     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4452     shiftCount = aExp - 0x402F;
4453     if ( 0 < shiftCount ) {
4454         if ( 0x403E <= aExp ) {
4455             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4456             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4457                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4458                 if ( aSig1 ) set_float_exception_inexact_flag();
4459             }
4460             else {
4461                 float_raise( float_flag_invalid );
4462                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4463                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4464                 }
4465             }
4466             return (sbits64) LIT64( 0x8000000000000000 );
4467         }
4468         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4469         if ( (bits64) ( aSig1<<shiftCount ) ) {
4470             set_float_exception_inexact_flag();
4471         }
4472     }
4473     else {
4474         if ( aExp < 0x3FFF ) {
4475             if ( aExp | aSig0 | aSig1 ) {
4476                 set_float_exception_inexact_flag();
4477             }
4478             return 0;
4479         }
4480         z = aSig0>>( - shiftCount );
4481         if (    aSig1
4482              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4483             set_float_exception_inexact_flag();
4484         }
4485     }
4486     if ( aSign ) z = - z;
4487     return z;
4488
4489 }
4490
4491 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4492     && defined(SOFTFLOAT_NEED_FIXUNS)
4493 /*
4494  * just like above - but do not care for overflow of signed results
4495  */
4496 uint64 float128_to_uint64_round_to_zero( float128 a )
4497 {
4498     flag aSign;
4499     int32 aExp, shiftCount;
4500     bits64 aSig0, aSig1;
4501     uint64 z;
4502
4503     aSig1 = extractFloat128Frac1( a );
4504     aSig0 = extractFloat128Frac0( a );
4505     aExp = extractFloat128Exp( a );
4506     aSign = extractFloat128Sign( a );
4507     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4508     shiftCount = aExp - 0x402F;
4509     if ( 0 < shiftCount ) {
4510         if ( 0x403F <= aExp ) {
4511             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4512             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4513                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4514                 if ( aSig1 ) set_float_exception_inexact_flag();
4515             }
4516             else {
4517                 float_raise( float_flag_invalid );
4518             }
4519             return LIT64( 0xFFFFFFFFFFFFFFFF );
4520         }
4521         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4522         if ( (bits64) ( aSig1<<shiftCount ) ) {
4523             set_float_exception_inexact_flag();
4524         }
4525     }
4526     else {
4527         if ( aExp < 0x3FFF ) {
4528             if ( aExp | aSig0 | aSig1 ) {
4529                 set_float_exception_inexact_flag();
4530             }
4531             return 0;
4532         }
4533         z = aSig0>>( - shiftCount );
4534         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4535             set_float_exception_inexact_flag();
4536         }
4537     }
4538     if ( aSign ) z = - z;
4539     return z;
4540
4541 }
4542 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4543
4544 /*
4545 -------------------------------------------------------------------------------
4546 Returns the result of converting the quadruple-precision floating-point
4547 value `a' to the single-precision floating-point format.  The conversion
4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4549 Arithmetic.
4550 -------------------------------------------------------------------------------
4551 */
4552 float32 float128_to_float32( float128 a )
4553 {
4554     flag aSign;
4555     int32 aExp;
4556     bits64 aSig0, aSig1;
4557     bits32 zSig;
4558
4559     aSig1 = extractFloat128Frac1( a );
4560     aSig0 = extractFloat128Frac0( a );
4561     aExp = extractFloat128Exp( a );
4562     aSign = extractFloat128Sign( a );
4563     if ( aExp == 0x7FFF ) {
4564         if ( aSig0 | aSig1 ) {
4565             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4566         }
4567         return packFloat32( aSign, 0xFF, 0 );
4568     }
4569     aSig0 |= ( aSig1 != 0 );
4570     shift64RightJamming( aSig0, 18, &aSig0 );
4571     zSig = (bits32)aSig0;
4572     if ( aExp || zSig ) {
4573         zSig |= 0x40000000;
4574         aExp -= 0x3F81;
4575     }
4576     return roundAndPackFloat32( aSign, aExp, zSig );
4577
4578 }
4579
4580 /*
4581 -------------------------------------------------------------------------------
4582 Returns the result of converting the quadruple-precision floating-point
4583 value `a' to the double-precision floating-point format.  The conversion
4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4585 Arithmetic.
4586 -------------------------------------------------------------------------------
4587 */
4588 float64 float128_to_float64( float128 a )
4589 {
4590     flag aSign;
4591     int32 aExp;
4592     bits64 aSig0, aSig1;
4593
4594     aSig1 = extractFloat128Frac1( a );
4595     aSig0 = extractFloat128Frac0( a );
4596     aExp = extractFloat128Exp( a );
4597     aSign = extractFloat128Sign( a );
4598     if ( aExp == 0x7FFF ) {
4599         if ( aSig0 | aSig1 ) {
4600             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4601         }
4602         return packFloat64( aSign, 0x7FF, 0 );
4603     }
4604     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605     aSig0 |= ( aSig1 != 0 );
4606     if ( aExp || aSig0 ) {
4607         aSig0 |= LIT64( 0x4000000000000000 );
4608         aExp -= 0x3C01;
4609     }
4610     return roundAndPackFloat64( aSign, aExp, aSig0 );
4611
4612 }
4613
4614 #ifdef FLOATX80
4615
4616 /*
4617 -------------------------------------------------------------------------------
4618 Returns the result of converting the quadruple-precision floating-point
4619 value `a' to the extended double-precision floating-point format.  The
4620 conversion is performed according to the IEC/IEEE Standard for Binary
4621 Floating-Point Arithmetic.
4622 -------------------------------------------------------------------------------
4623 */
4624 floatx80 float128_to_floatx80( float128 a )
4625 {
4626     flag aSign;
4627     int32 aExp;
4628     bits64 aSig0, aSig1;
4629
4630     aSig1 = extractFloat128Frac1( a );
4631     aSig0 = extractFloat128Frac0( a );
4632     aExp = extractFloat128Exp( a );
4633     aSign = extractFloat128Sign( a );
4634     if ( aExp == 0x7FFF ) {
4635         if ( aSig0 | aSig1 ) {
4636             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4637         }
4638         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4639     }
4640     if ( aExp == 0 ) {
4641         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4642         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4643     }
4644     else {
4645         aSig0 |= LIT64( 0x0001000000000000 );
4646     }
4647     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4648     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4649
4650 }
4651
4652 #endif
4653
4654 /*
4655 -------------------------------------------------------------------------------
4656 Rounds the quadruple-precision floating-point value `a' to an integer, and
4657 returns the result as a quadruple-precision floating-point value.  The
4658 operation is performed according to the IEC/IEEE Standard for Binary
4659 Floating-Point Arithmetic.
4660 -------------------------------------------------------------------------------
4661 */
4662 float128 float128_round_to_int( float128 a )
4663 {
4664     flag aSign;
4665     int32 aExp;
4666     bits64 lastBitMask, roundBitsMask;
4667     int8 roundingMode;
4668     float128 z;
4669
4670     aExp = extractFloat128Exp( a );
4671     if ( 0x402F <= aExp ) {
4672         if ( 0x406F <= aExp ) {
4673             if (    ( aExp == 0x7FFF )
4674                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4675                ) {
4676                 return propagateFloat128NaN( a, a );
4677             }
4678             return a;
4679         }
4680         lastBitMask = 1;
4681         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4682         roundBitsMask = lastBitMask - 1;
4683         z = a;
4684         roundingMode = float_rounding_mode;
4685         if ( roundingMode == float_round_nearest_even ) {
4686             if ( lastBitMask ) {
4687                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4688                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4689             }
4690             else {
4691                 if ( (sbits64) z.low < 0 ) {
4692                     ++z.high;
4693                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4694                 }
4695             }
4696         }
4697         else if ( roundingMode != float_round_to_zero ) {
4698             if (   extractFloat128Sign( z )
4699                  ^ ( roundingMode == float_round_up ) ) {
4700                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4701             }
4702         }
4703         z.low &= ~ roundBitsMask;
4704     }
4705     else {
4706         if ( aExp < 0x3FFF ) {
4707             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4708             set_float_exception_inexact_flag();
4709             aSign = extractFloat128Sign( a );
4710             switch ( float_rounding_mode ) {
4711              case float_round_nearest_even:
4712                 if (    ( aExp == 0x3FFE )
4713                      && (   extractFloat128Frac0( a )
4714                           | extractFloat128Frac1( a ) )
4715                    ) {
4716                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4717                 }
4718                 break;
4719              case float_round_to_zero:
4720                 break;
4721              case float_round_down:
4722                 return
4723                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4724                     : packFloat128( 0, 0, 0, 0 );
4725              case float_round_up:
4726                 return
4727                       aSign ? packFloat128( 1, 0, 0, 0 )
4728                     : packFloat128( 0, 0x3FFF, 0, 0 );
4729             }
4730             return packFloat128( aSign, 0, 0, 0 );
4731         }
4732         lastBitMask = 1;
4733         lastBitMask <<= 0x402F - aExp;
4734         roundBitsMask = lastBitMask - 1;
4735         z.low = 0;
4736         z.high = a.high;
4737         roundingMode = float_rounding_mode;
4738         if ( roundingMode == float_round_nearest_even ) {
4739             z.high += lastBitMask>>1;
4740             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4741                 z.high &= ~ lastBitMask;
4742             }
4743         }
4744         else if ( roundingMode != float_round_to_zero ) {
4745             if (   extractFloat128Sign( z )
4746                  ^ ( roundingMode == float_round_up ) ) {
4747                 z.high |= ( a.low != 0 );
4748                 z.high += roundBitsMask;
4749             }
4750         }
4751         z.high &= ~ roundBitsMask;
4752     }
4753     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4754         set_float_exception_inexact_flag();
4755     }
4756     return z;
4757
4758 }
4759
4760 /*
4761 -------------------------------------------------------------------------------
4762 Returns the result of adding the absolute values of the quadruple-precision
4763 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4764 before being returned.  `zSign' is ignored if the result is a NaN.
4765 The addition is performed according to the IEC/IEEE Standard for Binary
4766 Floating-Point Arithmetic.
4767 -------------------------------------------------------------------------------
4768 */
4769 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4770 {
4771     int32 aExp, bExp, zExp;
4772     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4773     int32 expDiff;
4774
4775     aSig1 = extractFloat128Frac1( a );
4776     aSig0 = extractFloat128Frac0( a );
4777     aExp = extractFloat128Exp( a );
4778     bSig1 = extractFloat128Frac1( b );
4779     bSig0 = extractFloat128Frac0( b );
4780     bExp = extractFloat128Exp( b );
4781     expDiff = aExp - bExp;
4782     if ( 0 < expDiff ) {
4783         if ( aExp == 0x7FFF ) {
4784             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4785             return a;
4786         }
4787         if ( bExp == 0 ) {
4788             --expDiff;
4789         }
4790         else {
4791             bSig0 |= LIT64( 0x0001000000000000 );
4792         }
4793         shift128ExtraRightJamming(
4794             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4795         zExp = aExp;
4796     }
4797     else if ( expDiff < 0 ) {
4798         if ( bExp == 0x7FFF ) {
4799             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4800             return packFloat128( zSign, 0x7FFF, 0, 0 );
4801         }
4802         if ( aExp == 0 ) {
4803             ++expDiff;
4804         }
4805         else {
4806             aSig0 |= LIT64( 0x0001000000000000 );
4807         }
4808         shift128ExtraRightJamming(
4809             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4810         zExp = bExp;
4811     }
4812     else {
4813         if ( aExp == 0x7FFF ) {
4814             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4815                 return propagateFloat128NaN( a, b );
4816             }
4817             return a;
4818         }
4819         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4821         zSig2 = 0;
4822         zSig0 |= LIT64( 0x0002000000000000 );
4823         zExp = aExp;
4824         goto shiftRight1;
4825     }
4826     aSig0 |= LIT64( 0x0001000000000000 );
4827     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4828     --zExp;
4829     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4830     ++zExp;
4831  shiftRight1:
4832     shift128ExtraRightJamming(
4833         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4834  roundAndPack:
4835     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4836
4837 }
4838
4839 /*
4840 -------------------------------------------------------------------------------
4841 Returns the result of subtracting the absolute values of the quadruple-
4842 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4843 difference is negated before being returned.  `zSign' is ignored if the
4844 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4845 Standard for Binary Floating-Point Arithmetic.
4846 -------------------------------------------------------------------------------
4847 */
4848 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4849 {
4850     int32 aExp, bExp, zExp;
4851     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4852     int32 expDiff;
4853     float128 z;
4854
4855     aSig1 = extractFloat128Frac1( a );
4856     aSig0 = extractFloat128Frac0( a );
4857     aExp = extractFloat128Exp( a );
4858     bSig1 = extractFloat128Frac1( b );
4859     bSig0 = extractFloat128Frac0( b );
4860     bExp = extractFloat128Exp( b );
4861     expDiff = aExp - bExp;
4862     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4863     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4864     if ( 0 < expDiff ) goto aExpBigger;
4865     if ( expDiff < 0 ) goto bExpBigger;
4866     if ( aExp == 0x7FFF ) {
4867         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4868             return propagateFloat128NaN( a, b );
4869         }
4870         float_raise( float_flag_invalid );
4871         z.low = float128_default_nan_low;
4872         z.high = float128_default_nan_high;
4873         return z;
4874     }
4875     if ( aExp == 0 ) {
4876         aExp = 1;
4877         bExp = 1;
4878     }
4879     if ( bSig0 < aSig0 ) goto aBigger;
4880     if ( aSig0 < bSig0 ) goto bBigger;
4881     if ( bSig1 < aSig1 ) goto aBigger;
4882     if ( aSig1 < bSig1 ) goto bBigger;
4883     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4884  bExpBigger:
4885     if ( bExp == 0x7FFF ) {
4886         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4887         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4888     }
4889     if ( aExp == 0 ) {
4890         ++expDiff;
4891     }
4892     else {
4893         aSig0 |= LIT64( 0x4000000000000000 );
4894     }
4895     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896     bSig0 |= LIT64( 0x4000000000000000 );
4897  bBigger:
4898     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4899     zExp = bExp;
4900     zSign ^= 1;
4901     goto normalizeRoundAndPack;
4902  aExpBigger:
4903     if ( aExp == 0x7FFF ) {
4904         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4905         return a;
4906     }
4907     if ( bExp == 0 ) {
4908         --expDiff;
4909     }
4910     else {
4911         bSig0 |= LIT64( 0x4000000000000000 );
4912     }
4913     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4914     aSig0 |= LIT64( 0x4000000000000000 );
4915  aBigger:
4916     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4917     zExp = aExp;
4918  normalizeRoundAndPack:
4919     --zExp;
4920     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4921
4922 }
4923
4924 /*
4925 -------------------------------------------------------------------------------
4926 Returns the result of adding the quadruple-precision floating-point values
4927 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4928 for Binary Floating-Point Arithmetic.
4929 -------------------------------------------------------------------------------
4930 */
4931 float128 float128_add( float128 a, float128 b )
4932 {
4933     flag aSign, bSign;
4934
4935     aSign = extractFloat128Sign( a );
4936     bSign = extractFloat128Sign( b );
4937     if ( aSign == bSign ) {
4938         return addFloat128Sigs( a, b, aSign );
4939     }
4940     else {
4941         return subFloat128Sigs( a, b, aSign );
4942     }
4943
4944 }
4945
4946 /*
4947 -------------------------------------------------------------------------------
4948 Returns the result of subtracting the quadruple-precision floating-point
4949 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4950 Standard for Binary Floating-Point Arithmetic.
4951 -------------------------------------------------------------------------------
4952 */
4953 float128 float128_sub( float128 a, float128 b )
4954 {
4955     flag aSign, bSign;
4956
4957     aSign = extractFloat128Sign( a );
4958     bSign = extractFloat128Sign( b );
4959     if ( aSign == bSign ) {
4960         return subFloat128Sigs( a, b, aSign );
4961     }
4962     else {
4963         return addFloat128Sigs( a, b, aSign );
4964     }
4965
4966 }
4967
4968 /*
4969 -------------------------------------------------------------------------------
4970 Returns the result of multiplying the quadruple-precision floating-point
4971 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4972 Standard for Binary Floating-Point Arithmetic.
4973 -------------------------------------------------------------------------------
4974 */
4975 float128 float128_mul( float128 a, float128 b )
4976 {
4977     flag aSign, bSign, zSign;
4978     int32 aExp, bExp, zExp;
4979     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4980     float128 z;
4981
4982     aSig1 = extractFloat128Frac1( a );
4983     aSig0 = extractFloat128Frac0( a );
4984     aExp = extractFloat128Exp( a );
4985     aSign = extractFloat128Sign( a );
4986     bSig1 = extractFloat128Frac1( b );
4987     bSig0 = extractFloat128Frac0( b );
4988     bExp = extractFloat128Exp( b );
4989     bSign = extractFloat128Sign( b );
4990     zSign = aSign ^ bSign;
4991     if ( aExp == 0x7FFF ) {
4992         if (    ( aSig0 | aSig1 )
4993              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4994             return propagateFloat128NaN( a, b );
4995         }
4996         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4997         return packFloat128( zSign, 0x7FFF, 0, 0 );
4998     }
4999     if ( bExp == 0x7FFF ) {
5000         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5001         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5002  invalid:
5003             float_raise( float_flag_invalid );
5004             z.low = float128_default_nan_low;
5005             z.high = float128_default_nan_high;
5006             return z;
5007         }
5008         return packFloat128( zSign, 0x7FFF, 0, 0 );
5009     }
5010     if ( aExp == 0 ) {
5011         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5013     }
5014     if ( bExp == 0 ) {
5015         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5016         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5017     }
5018     zExp = aExp + bExp - 0x4000;
5019     aSig0 |= LIT64( 0x0001000000000000 );
5020     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5021     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5022     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5023     zSig2 |= ( zSig3 != 0 );
5024     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5025         shift128ExtraRightJamming(
5026             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5027         ++zExp;
5028     }
5029     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5030
5031 }
5032
5033 /*
5034 -------------------------------------------------------------------------------
5035 Returns the result of dividing the quadruple-precision floating-point value
5036 `a' by the corresponding value `b'.  The operation is performed according to
5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5038 -------------------------------------------------------------------------------
5039 */
5040 float128 float128_div( float128 a, float128 b )
5041 {
5042     flag aSign, bSign, zSign;
5043     int32 aExp, bExp, zExp;
5044     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5045     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5046     float128 z;
5047
5048     aSig1 = extractFloat128Frac1( a );
5049     aSig0 = extractFloat128Frac0( a );
5050     aExp = extractFloat128Exp( a );
5051     aSign = extractFloat128Sign( a );
5052     bSig1 = extractFloat128Frac1( b );
5053     bSig0 = extractFloat128Frac0( b );
5054     bExp = extractFloat128Exp( b );
5055     bSign = extractFloat128Sign( b );
5056     zSign = aSign ^ bSign;
5057     if ( aExp == 0x7FFF ) {
5058         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5059         if ( bExp == 0x7FFF ) {
5060             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061             goto invalid;
5062         }
5063         return packFloat128( zSign, 0x7FFF, 0, 0 );
5064     }
5065     if ( bExp == 0x7FFF ) {
5066         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5067         return packFloat128( zSign, 0, 0, 0 );
5068     }
5069     if ( bExp == 0 ) {
5070         if ( ( bSig0 | bSig1 ) == 0 ) {
5071             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5072  invalid:
5073                 float_raise( float_flag_invalid );
5074                 z.low = float128_default_nan_low;
5075                 z.high = float128_default_nan_high;
5076                 return z;
5077             }
5078             float_raise( float_flag_divbyzero );
5079             return packFloat128( zSign, 0x7FFF, 0, 0 );
5080         }
5081         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5082     }
5083     if ( aExp == 0 ) {
5084         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5085         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5086     }
5087     zExp = aExp - bExp + 0x3FFD;
5088     shortShift128Left(
5089         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5090     shortShift128Left(
5091         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5092     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5093         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5094         ++zExp;
5095     }
5096     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5097     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5098     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5099     while ( (sbits64) rem0 < 0 ) {
5100         --zSig0;
5101         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5102     }
5103     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5104     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5105         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5106         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5107         while ( (sbits64) rem1 < 0 ) {
5108             --zSig1;
5109             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5110         }
5111         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5112     }
5113     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5115
5116 }
5117
5118 /*
5119 -------------------------------------------------------------------------------
5120 Returns the remainder of the quadruple-precision floating-point value `a'
5121 with respect to the corresponding value `b'.  The operation is performed
5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5123 -------------------------------------------------------------------------------
5124 */
5125 float128 float128_rem( float128 a, float128 b )
5126 {
5127     flag aSign, zSign;
5128     int32 aExp, bExp, expDiff;
5129     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5130     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5131     sbits64 sigMean0;
5132     float128 z;
5133
5134     aSig1 = extractFloat128Frac1( a );
5135     aSig0 = extractFloat128Frac0( a );
5136     aExp = extractFloat128Exp( a );
5137     aSign = extractFloat128Sign( a );
5138     bSig1 = extractFloat128Frac1( b );
5139     bSig0 = extractFloat128Frac0( b );
5140     bExp = extractFloat128Exp( b );
5141     if ( aExp == 0x7FFF ) {
5142         if (    ( aSig0 | aSig1 )
5143              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5144             return propagateFloat128NaN( a, b );
5145         }
5146         goto invalid;
5147     }
5148     if ( bExp == 0x7FFF ) {
5149         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5150         return a;
5151     }
5152     if ( bExp == 0 ) {
5153         if ( ( bSig0 | bSig1 ) == 0 ) {
5154  invalid:
5155             float_raise( float_flag_invalid );
5156             z.low = float128_default_nan_low;
5157             z.high = float128_default_nan_high;
5158             return z;
5159         }
5160         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5161     }
5162     if ( aExp == 0 ) {
5163         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5164         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5165     }
5166     expDiff = aExp - bExp;
5167     if ( expDiff < -1 ) return a;
5168     shortShift128Left(
5169         aSig0 | LIT64( 0x0001000000000000 ),
5170         aSig1,
5171         15 - ( expDiff < 0 ),
5172         &aSig0,
5173         &aSig1
5174     );
5175     shortShift128Left(
5176         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5177     q = le128( bSig0, bSig1, aSig0, aSig1 );
5178     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5179     expDiff -= 64;
5180     while ( 0 < expDiff ) {
5181         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5182         q = ( 4 < q ) ? q - 4 : 0;
5183         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5184         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5185         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5186         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5187         expDiff -= 61;
5188     }
5189     if ( -64 < expDiff ) {
5190         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5191         q = ( 4 < q ) ? q - 4 : 0;
5192         q >>= - expDiff;
5193         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5194         expDiff += 52;
5195         if ( expDiff < 0 ) {
5196             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5197         }
5198         else {
5199             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5200         }
5201         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5202         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5203     }
5204     else {
5205         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5206         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5207     }
5208     do {
5209         alternateASig0 = aSig0;
5210         alternateASig1 = aSig1;
5211         ++q;
5212         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5213     } while ( 0 <= (sbits64) aSig0 );
5214     add128(
5215         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5216     if (    ( sigMean0 < 0 )
5217          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5218         aSig0 = alternateASig0;
5219         aSig1 = alternateASig1;
5220     }
5221     zSign = ( (sbits64) aSig0 < 0 );
5222     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5223     return
5224         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5225
5226 }
5227
5228 /*
5229 -------------------------------------------------------------------------------
5230 Returns the square root of the quadruple-precision floating-point value `a'.
5231 The operation is performed according to the IEC/IEEE Standard for Binary
5232 Floating-Point Arithmetic.
5233 -------------------------------------------------------------------------------
5234 */
5235 float128 float128_sqrt( float128 a )
5236 {
5237     flag aSign;
5238     int32 aExp, zExp;
5239     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5240     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5241     float128 z;
5242
5243     aSig1 = extractFloat128Frac1( a );
5244     aSig0 = extractFloat128Frac0( a );
5245     aExp = extractFloat128Exp( a );
5246     aSign = extractFloat128Sign( a );
5247     if ( aExp == 0x7FFF ) {
5248         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5249         if ( ! aSign ) return a;
5250         goto invalid;
5251     }
5252     if ( aSign ) {
5253         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5254  invalid:
5255         float_raise( float_flag_invalid );
5256         z.low = float128_default_nan_low;
5257         z.high = float128_default_nan_high;
5258         return z;
5259     }
5260     if ( aExp == 0 ) {
5261         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5262         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5263     }
5264     zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE;
5265     aSig0 |= LIT64( 0x0001000000000000 );
5266     zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5267     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5268     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5269     doubleZSig0 = zSig0<<1;
5270     mul64To128( zSig0, zSig0, &term0, &term1 );
5271     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5272     while ( (sbits64) rem0 < 0 ) {
5273         --zSig0;
5274         doubleZSig0 -= 2;
5275         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5276     }
5277     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5278     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5279         if ( zSig1 == 0 ) zSig1 = 1;
5280         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5281         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5282         mul64To128( zSig1, zSig1, &term2, &term3 );
5283         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5284         while ( (sbits64) rem1 < 0 ) {
5285             --zSig1;
5286             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5287             term3 |= 1;
5288             term2 |= doubleZSig0;
5289             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5290         }
5291         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5292     }
5293     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5294     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5295
5296 }
5297
5298 /*
5299 -------------------------------------------------------------------------------
5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5301 the corresponding value `b', and 0 otherwise.  The comparison is performed
5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 -------------------------------------------------------------------------------
5304 */
5305 flag float128_eq( float128 a, float128 b )
5306 {
5307
5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312        ) {
5313         if (    float128_is_signaling_nan( a )
5314              || float128_is_signaling_nan( b ) ) {
5315             float_raise( float_flag_invalid );
5316         }
5317         return 0;
5318     }
5319     return
5320            ( a.low == b.low )
5321         && (    ( a.high == b.high )
5322              || (    ( a.low == 0 )
5323                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5324            );
5325
5326 }
5327
5328 /*
5329 -------------------------------------------------------------------------------
5330 Returns 1 if the quadruple-precision floating-point value `a' is less than
5331 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5333 Arithmetic.
5334 -------------------------------------------------------------------------------
5335 */
5336 flag float128_le( float128 a, float128 b )
5337 {
5338     flag aSign, bSign;
5339
5340     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5341               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5342          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5343               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5344        ) {
5345         float_raise( float_flag_invalid );
5346         return 0;
5347     }
5348     aSign = extractFloat128Sign( a );
5349     bSign = extractFloat128Sign( b );
5350     if ( aSign != bSign ) {
5351         return
5352                aSign
5353             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354                  == 0 );
5355     }
5356     return
5357           aSign ? le128( b.high, b.low, a.high, a.low )
5358         : le128( a.high, a.low, b.high, b.low );
5359
5360 }
5361
5362 /*
5363 -------------------------------------------------------------------------------
5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
5365 the corresponding value `b', and 0 otherwise.  The comparison is performed
5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5367 -------------------------------------------------------------------------------
5368 */
5369 flag float128_lt( float128 a, float128 b )
5370 {
5371     flag aSign, bSign;
5372
5373     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5374               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5375          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5376               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5377        ) {
5378         float_raise( float_flag_invalid );
5379         return 0;
5380     }
5381     aSign = extractFloat128Sign( a );
5382     bSign = extractFloat128Sign( b );
5383     if ( aSign != bSign ) {
5384         return
5385                aSign
5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387                  != 0 );
5388     }
5389     return
5390           aSign ? lt128( b.high, b.low, a.high, a.low )
5391         : lt128( a.high, a.low, b.high, b.low );
5392
5393 }
5394
5395 /*
5396 -------------------------------------------------------------------------------
5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5398 the corresponding value `b', and 0 otherwise.  The invalid exception is
5399 raised if either operand is a NaN.  Otherwise, the comparison is performed
5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 -------------------------------------------------------------------------------
5402 */
5403 flag float128_eq_signaling( float128 a, float128 b )
5404 {
5405
5406     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5407               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5408          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5409               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5410        ) {
5411         float_raise( float_flag_invalid );
5412         return 0;
5413     }
5414     return
5415            ( a.low == b.low )
5416         && (    ( a.high == b.high )
5417              || (    ( a.low == 0 )
5418                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5419            );
5420
5421 }
5422
5423 /*
5424 -------------------------------------------------------------------------------
5425 Returns 1 if the quadruple-precision floating-point value `a' is less than
5426 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5427 cause an exception.  Otherwise, the comparison is performed according to the
5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 -------------------------------------------------------------------------------
5430 */
5431 flag float128_le_quiet( float128 a, float128 b )
5432 {
5433     flag aSign, bSign;
5434
5435     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5436               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5437          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5438               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5439        ) {
5440         if (    float128_is_signaling_nan( a )
5441              || float128_is_signaling_nan( b ) ) {
5442             float_raise( float_flag_invalid );
5443         }
5444         return 0;
5445     }
5446     aSign = extractFloat128Sign( a );
5447     bSign = extractFloat128Sign( b );
5448     if ( aSign != bSign ) {
5449         return
5450                aSign
5451             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5452                  == 0 );
5453     }
5454     return
5455           aSign ? le128( b.high, b.low, a.high, a.low )
5456         : le128( a.high, a.low, b.high, b.low );
5457
5458 }
5459
5460 /*
5461 -------------------------------------------------------------------------------
5462 Returns 1 if the quadruple-precision floating-point value `a' is less than
5463 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5464 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5465 Standard for Binary Floating-Point Arithmetic.
5466 -------------------------------------------------------------------------------
5467 */
5468 flag float128_lt_quiet( float128 a, float128 b )
5469 {
5470     flag aSign, bSign;
5471
5472     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5473               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5474          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5475               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5476        ) {
5477         if (    float128_is_signaling_nan( a )
5478              || float128_is_signaling_nan( b ) ) {
5479             float_raise( float_flag_invalid );
5480         }
5481         return 0;
5482     }
5483     aSign = extractFloat128Sign( a );
5484     bSign = extractFloat128Sign( b );
5485     if ( aSign != bSign ) {
5486         return
5487                aSign
5488             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5489                  != 0 );
5490     }
5491     return
5492           aSign ? lt128( b.high, b.low, a.high, a.low )
5493         : lt128( a.high, a.low, b.high, b.low );
5494
5495 }
5496
5497 #endif
5498
5499
5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5501
5502 /*
5503  * These two routines are not part of the original softfloat distribution.
5504  *
5505  * They are based on the corresponding conversions to integer but return
5506  * unsigned numbers instead since these functions are required by GCC.
5507  *
5508  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5509  *
5510  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5511  */
5512
5513 /*
5514 -------------------------------------------------------------------------------
5515 Returns the result of converting the double-precision floating-point value
5516 `a' to the 32-bit unsigned integer format.  The conversion is
5517 performed according to the IEC/IEEE Standard for Binary Floating-point
5518 Arithmetic, except that the conversion is always rounded toward zero.  If
5519 `a' is a NaN, the largest positive integer is returned.  If the conversion
5520 overflows, the largest integer positive is returned.
5521 -------------------------------------------------------------------------------
5522 */
5523 uint32 float64_to_uint32_round_to_zero( float64 a )
5524 {
5525     flag aSign;
5526     int16 aExp, shiftCount;
5527     bits64 aSig, savedASig;
5528     uint32 z;
5529
5530     aSig = extractFloat64Frac( a );
5531     aExp = extractFloat64Exp( a );
5532     aSign = extractFloat64Sign( a );
5533
5534     if (aSign) {
5535         float_raise( float_flag_invalid );
5536         return(0);
5537     }
5538
5539     if ( 0x41E < aExp ) {
5540         float_raise( float_flag_invalid );
5541         return 0xffffffff;
5542     }
5543     else if ( aExp < 0x3FF ) {
5544         if ( aExp || aSig ) set_float_exception_inexact_flag();
5545         return 0;
5546     }
5547     aSig |= LIT64( 0x0010000000000000 );
5548     shiftCount = 0x433 - aExp;
5549     savedASig = aSig;
5550     aSig >>= shiftCount;
5551     z = (uint32)aSig;
5552     if ( ( aSig<<shiftCount ) != savedASig ) {
5553         set_float_exception_inexact_flag();
5554     }
5555     return z;
5556
5557 }
5558
5559 /*
5560 -------------------------------------------------------------------------------
5561 Returns the result of converting the single-precision floating-point value
5562 `a' to the 32-bit unsigned integer format.  The conversion is
5563 performed according to the IEC/IEEE Standard for Binary Floating-point
5564 Arithmetic, except that the conversion is always rounded toward zero.  If
5565 `a' is a NaN, the largest positive integer is returned.  If the conversion
5566 overflows, the largest positive integer is returned.
5567 -------------------------------------------------------------------------------
5568 */
5569 uint32 float32_to_uint32_round_to_zero( float32 a )
5570 {
5571     flag aSign;
5572     int16 aExp, shiftCount;
5573     bits32 aSig;
5574     uint32 z;
5575
5576     aSig = extractFloat32Frac( a );
5577     aExp = extractFloat32Exp( a );
5578     aSign = extractFloat32Sign( a );
5579     shiftCount = aExp - 0x9E;
5580
5581     if (aSign) {
5582         float_raise( float_flag_invalid );
5583         return(0);
5584     }
5585     if ( 0 < shiftCount ) {
5586         float_raise( float_flag_invalid );
5587         return 0xFFFFFFFF;
5588     }
5589     else if ( aExp <= 0x7E ) {
5590         if ( aExp | aSig ) set_float_exception_inexact_flag();
5591         return 0;
5592     }
5593     aSig = ( aSig | 0x800000 )<<8;
5594     z = aSig>>( - shiftCount );
5595     if ( aSig<<( shiftCount & 31 ) ) {
5596         set_float_exception_inexact_flag();
5597     }
5598     return z;
5599
5600 }
5601
5602 #endif