i386/linux/linux-2.3.21/arch/arm/nwfpe/softfloat.c

   1 /*
   2 ===============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5 Arithmetic Package, Release 2.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  15 arithmetic/softfloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23 Derivative works are acceptable, even for commercial purposes, so long as
  24 (1) they include prominent notice that the work is derivative, and (2) they
  25 include prominent notice akin to these three paragraphs for those parts of
  26 this code that are retained.
  27
  28 ===============================================================================
  29 */
  30
  31 #include "milieu.h"
  32 #include "softfloat.h"
  33
  34 /*
  35 -------------------------------------------------------------------------------
  36 Floating-point rounding mode, extended double-precision rounding precision,
  37 and exception flags.
  38 -------------------------------------------------------------------------------
  39 */
  40 int8 float_rounding_mode = float_round_nearest_even;
  41 int8 floatx80_rounding_precision = 80;
  42 int8 float_exception_flags = 0;
  43
  44 /*
  45 -------------------------------------------------------------------------------
  46 Primitive arithmetic functions, including multi-word arithmetic, and
  47 division and square root approximations.  (Can be specialized to target if
  48 desired.)
  49 -------------------------------------------------------------------------------
  50 */
  51 #include "softfloat-macros"
  52
  53 /*
  54 -------------------------------------------------------------------------------
  55 Functions and definitions to determine:  (1) whether tininess for underflow
  56 is detected before or after rounding by default, (2) what (if anything)
  57 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  58 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  59 are propagated from function inputs to output.  These details are target-
  60 specific.
  61 -------------------------------------------------------------------------------
  62 */
  63 #include "softfloat-specialize"
  64
  65 /*
  66 -------------------------------------------------------------------------------
  67 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  68 and 7, and returns the properly rounded 32-bit integer corresponding to the
  69 input.  If `zSign' is nonzero, the input is negated before being converted
  70 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  71 input is simply rounded to an integer, with the inexact exception raised if
  72 the input cannot be represented exactly as an integer.  If the fixed-point
  73 input is too large, however, the invalid exception is raised and the largest
  74 positive or negative integer is returned.
  75 -------------------------------------------------------------------------------
  76 */
  77 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  78 {
  79     int8 roundingMode;
  80     flag roundNearestEven;
  81     int8 roundIncrement, roundBits;
  82     int32 z;
  83
  84     roundingMode = float_rounding_mode;
  85     roundNearestEven = ( roundingMode == float_round_nearest_even );
  86     roundIncrement = 0x40;
  87     if ( ! roundNearestEven ) {
  88         if ( roundingMode == float_round_to_zero ) {
  89             roundIncrement = 0;
  90         }
  91         else {
  92             roundIncrement = 0x7F;
  93             if ( zSign ) {
  94                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  95             }
  96             else {
  97                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  98             }
  99         }
 100     }
 101     roundBits = absZ & 0x7F;
 102     absZ = ( absZ + roundIncrement )>>7;
 103     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 104     z = absZ;
 105     if ( zSign ) z = - z;
 106     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 107         float_exception_flags |= float_flag_invalid;
 108         return zSign ? 0x80000000 : 0x7FFFFFFF;
 109     }
 110     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 111     return z;
 112
 113 }
 114
 115 /*
 116 -------------------------------------------------------------------------------
 117 Returns the fraction bits of the single-precision floating-point value `a'.
 118 -------------------------------------------------------------------------------
 119 */
 120 INLINE bits32 extractFloat32Frac( float32 a )
 121 {
 122
 123     return a & 0x007FFFFF;
 124
 125 }
 126
 127 /*
 128 -------------------------------------------------------------------------------
 129 Returns the exponent bits of the single-precision floating-point value `a'.
 130 -------------------------------------------------------------------------------
 131 */
 132 INLINE int16 extractFloat32Exp( float32 a )
 133 {
 134
 135     return ( a>>23 ) & 0xFF;
 136
 137 }
 138
 139 /*
 140 -------------------------------------------------------------------------------
 141 Returns the sign bit of the single-precision floating-point value `a'.
 142 -------------------------------------------------------------------------------
 143 */
 144 INLINE flag extractFloat32Sign( float32 a )
 145 {
 146
 147     return a>>31;
 148
 149 }
 150
 151 /*
 152 -------------------------------------------------------------------------------
 153 Normalizes the subnormal single-precision floating-point value represented
 154 by the denormalized significand `aSig'.  The normalized exponent and
 155 significand are stored at the locations pointed to by `zExpPtr' and
 156 `zSigPtr', respectively.
 157 -------------------------------------------------------------------------------
 158 */
 159 static void
 160  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 161 {
 162     int8 shiftCount;
 163
 164     shiftCount = countLeadingZeros32( aSig ) - 8;
 165     *zSigPtr = aSig<<shiftCount;
 166     *zExpPtr = 1 - shiftCount;
 167
 168 }
 169
 170 /*
 171 -------------------------------------------------------------------------------
 172 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 173 single-precision floating-point value, returning the result.  After being
 174 shifted into the proper positions, the three fields are simply added
 175 together to form the result.  This means that any integer portion of `zSig'
 176 will be added into the exponent.  Since a properly normalized significand
 177 will have an integer portion equal to 1, the `zExp' input should be 1 less
 178 than the desired result exponent whenever `zSig' is a complete, normalized
 179 significand.
 180 -------------------------------------------------------------------------------
 181 */
 182 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 183 {
 184 #if 0
 185    float32 f;
 186    __asm__("@ packFloat32;
 187             mov %0, %1, asl #31;
 188             orr %0, %2, asl #23;
 189             orr %0, %3"
 190             : /* no outputs */
 191             : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
 192             : "cc");
 193    return f;
 194 #else
 195     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 196 #endif
 197 }
 198
 199 /*
 200 -------------------------------------------------------------------------------
 201 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 202 and significand `zSig', and returns the proper single-precision floating-
 203 point value corresponding to the abstract input.  Ordinarily, the abstract
 204 value is simply rounded and packed into the single-precision format, with
 205 the inexact exception raised if the abstract input cannot be represented
 206 exactly.  If the abstract value is too large, however, the overflow and
 207 inexact exceptions are raised and an infinity or maximal finite value is
 208 returned.  If the abstract value is too small, the input value is rounded to
 209 a subnormal number, and the underflow and inexact exceptions are raised if
 210 the abstract input cannot be represented exactly as a subnormal single-
 211 precision floating-point number.
 212     The input significand `zSig' has its binary point between bits 30
 213 and 29, which is 7 bits to the left of the usual location.  This shifted
 214 significand must be normalized or smaller.  If `zSig' is not normalized,
 215 `zExp' must be 0; in that case, the result returned is a subnormal number,
 216 and it must not require rounding.  In the usual case that `zSig' is
 217 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 218 The handling of underflow and overflow follows the IEC/IEEE Standard for
 219 Binary Floating-point Arithmetic.
 220 -------------------------------------------------------------------------------
 221 */
 222 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 223 {
 224     int8 roundingMode;
 225     flag roundNearestEven;
 226     int8 roundIncrement, roundBits;
 227     flag isTiny;
 228
 229     roundingMode = float_rounding_mode;
 230     roundNearestEven = ( roundingMode == float_round_nearest_even );
 231     roundIncrement = 0x40;
 232     if ( ! roundNearestEven ) {
 233         if ( roundingMode == float_round_to_zero ) {
 234             roundIncrement = 0;
 235         }
 236         else {
 237             roundIncrement = 0x7F;
 238             if ( zSign ) {
 239                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 240             }
 241             else {
 242                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 243             }
 244         }
 245     }
 246     roundBits = zSig & 0x7F;
 247     if ( 0xFD <= (bits16) zExp ) {
 248         if (    ( 0xFD < zExp )
 249              || (    ( zExp == 0xFD )
 250                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 251            ) {
 252             float_raise( float_flag_overflow | float_flag_inexact );
 253             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 254         }
 255         if ( zExp < 0 ) {
 256             isTiny =
 257                    ( float_detect_tininess == float_tininess_before_rounding )
 258                 || ( zExp < -1 )
 259                 || ( zSig + roundIncrement < 0x80000000 );
 260             shift32RightJamming( zSig, - zExp, &zSig );
 261             zExp = 0;
 262             roundBits = zSig & 0x7F;
 263             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 264         }
 265     }
 266     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 267     zSig = ( zSig + roundIncrement )>>7;
 268     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 269     if ( zSig == 0 ) zExp = 0;
 270     return packFloat32( zSign, zExp, zSig );
 271
 272 }
 273
 274 /*
 275 -------------------------------------------------------------------------------
 276 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 277 and significand `zSig', and returns the proper single-precision floating-
 278 point value corresponding to the abstract input.  This routine is just like
 279 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
 280 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 281 point exponent.
 282 -------------------------------------------------------------------------------
 283 */
 284 static float32
 285  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 286 {
 287     int8 shiftCount;
 288
 289     shiftCount = countLeadingZeros32( zSig ) - 1;
 290     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 291
 292 }
 293
 294 /*
 295 -------------------------------------------------------------------------------
 296 Returns the fraction bits of the double-precision floating-point value `a'.
 297 -------------------------------------------------------------------------------
 298 */
 299 INLINE bits64 extractFloat64Frac( float64 a )
 300 {
 301
 302     return a & LIT64( 0x000FFFFFFFFFFFFF );
 303
 304 }
 305
 306 /*
 307 -------------------------------------------------------------------------------
 308 Returns the exponent bits of the double-precision floating-point value `a'.
 309 -------------------------------------------------------------------------------
 310 */
 311 INLINE int16 extractFloat64Exp( float64 a )
 312 {
 313
 314     return ( a>>52 ) & 0x7FF;
 315
 316 }
 317
 318 /*
 319 -------------------------------------------------------------------------------
 320 Returns the sign bit of the double-precision floating-point value `a'.
 321 -------------------------------------------------------------------------------
 322 */
 323 INLINE flag extractFloat64Sign( float64 a )
 324 {
 325
 326     return a>>63;
 327
 328 }
 329
 330 /*
 331 -------------------------------------------------------------------------------
 332 Normalizes the subnormal double-precision floating-point value represented
 333 by the denormalized significand `aSig'.  The normalized exponent and
 334 significand are stored at the locations pointed to by `zExpPtr' and
 335 `zSigPtr', respectively.
 336 -------------------------------------------------------------------------------
 337 */
 338 static void
 339  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 340 {
 341     int8 shiftCount;
 342
 343     shiftCount = countLeadingZeros64( aSig ) - 11;
 344     *zSigPtr = aSig<<shiftCount;
 345     *zExpPtr = 1 - shiftCount;
 346
 347 }
 348
 349 /*
 350 -------------------------------------------------------------------------------
 351 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 352 double-precision floating-point value, returning the result.  After being
 353 shifted into the proper positions, the three fields are simply added
 354 together to form the result.  This means that any integer portion of `zSig'
 355 will be added into the exponent.  Since a properly normalized significand
 356 will have an integer portion equal to 1, the `zExp' input should be 1 less
 357 than the desired result exponent whenever `zSig' is a complete, normalized
 358 significand.
 359 -------------------------------------------------------------------------------
 360 */
 361 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 362 {
 363
 364     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 365
 366 }
 367
 368 /*
 369 -------------------------------------------------------------------------------
 370 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371 and significand `zSig', and returns the proper double-precision floating-
 372 point value corresponding to the abstract input.  Ordinarily, the abstract
 373 value is simply rounded and packed into the double-precision format, with
 374 the inexact exception raised if the abstract input cannot be represented
 375 exactly.  If the abstract value is too large, however, the overflow and
 376 inexact exceptions are raised and an infinity or maximal finite value is
 377 returned.  If the abstract value is too small, the input value is rounded to
 378 a subnormal number, and the underflow and inexact exceptions are raised if
 379 the abstract input cannot be represented exactly as a subnormal double-
 380 precision floating-point number.
 381     The input significand `zSig' has its binary point between bits 62
 382 and 61, which is 10 bits to the left of the usual location.  This shifted
 383 significand must be normalized or smaller.  If `zSig' is not normalized,
 384 `zExp' must be 0; in that case, the result returned is a subnormal number,
 385 and it must not require rounding.  In the usual case that `zSig' is
 386 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 387 The handling of underflow and overflow follows the IEC/IEEE Standard for
 388 Binary Floating-point Arithmetic.
 389 -------------------------------------------------------------------------------
 390 */
 391 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 392 {
 393     int8 roundingMode;
 394     flag roundNearestEven;
 395     int16 roundIncrement, roundBits;
 396     flag isTiny;
 397
 398     roundingMode = float_rounding_mode;
 399     roundNearestEven = ( roundingMode == float_round_nearest_even );
 400     roundIncrement = 0x200;
 401     if ( ! roundNearestEven ) {
 402         if ( roundingMode == float_round_to_zero ) {
 403             roundIncrement = 0;
 404         }
 405         else {
 406             roundIncrement = 0x3FF;
 407             if ( zSign ) {
 408                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 409             }
 410             else {
 411                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 412             }
 413         }
 414     }
 415     roundBits = zSig & 0x3FF;
 416     if ( 0x7FD <= (bits16) zExp ) {
 417         if (    ( 0x7FD < zExp )
 418              || (    ( zExp == 0x7FD )
 419                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 420            ) {
 421             //register int lr;
 422             //__asm__("mov %0, lr" :: "g" (lr));
 423             //fp_printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 424             float_raise( float_flag_overflow | float_flag_inexact );
 425             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 426         }
 427         if ( zExp < 0 ) {
 428             isTiny =
 429                    ( float_detect_tininess == float_tininess_before_rounding )
 430                 || ( zExp < -1 )
 431                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 432             shift64RightJamming( zSig, - zExp, &zSig );
 433             zExp = 0;
 434             roundBits = zSig & 0x3FF;
 435             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 436         }
 437     }
 438     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 439     zSig = ( zSig + roundIncrement )>>10;
 440     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 441     if ( zSig == 0 ) zExp = 0;
 442     return packFloat64( zSign, zExp, zSig );
 443
 444 }
 445
 446 /*
 447 -------------------------------------------------------------------------------
 448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 449 and significand `zSig', and returns the proper double-precision floating-
 450 point value corresponding to the abstract input.  This routine is just like
 451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
 452 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 453 point exponent.
 454 -------------------------------------------------------------------------------
 455 */
 456 static float64
 457  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 458 {
 459     int8 shiftCount;
 460
 461     shiftCount = countLeadingZeros64( zSig ) - 1;
 462     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 463
 464 }
 465
 466 #ifdef FLOATX80
 467
 468 /*
 469 -------------------------------------------------------------------------------
 470 Returns the fraction bits of the extended double-precision floating-point
 471 value `a'.
 472 -------------------------------------------------------------------------------
 473 */
 474 INLINE bits64 extractFloatx80Frac( floatx80 a )
 475 {
 476
 477     return a.low;
 478
 479 }
 480
 481 /*
 482 -------------------------------------------------------------------------------
 483 Returns the exponent bits of the extended double-precision floating-point
 484 value `a'.
 485 -------------------------------------------------------------------------------
 486 */
 487 INLINE int32 extractFloatx80Exp( floatx80 a )
 488 {
 489
 490     return a.high & 0x7FFF;
 491
 492 }
 493
 494 /*
 495 -------------------------------------------------------------------------------
 496 Returns the sign bit of the extended double-precision floating-point value
 497 `a'.
 498 -------------------------------------------------------------------------------
 499 */
 500 INLINE flag extractFloatx80Sign( floatx80 a )
 501 {
 502
 503     return a.high>>15;
 504
 505 }
 506
 507 /*
 508 -------------------------------------------------------------------------------
 509 Normalizes the subnormal extended double-precision floating-point value
 510 represented by the denormalized significand `aSig'.  The normalized exponent
 511 and significand are stored at the locations pointed to by `zExpPtr' and
 512 `zSigPtr', respectively.
 513 -------------------------------------------------------------------------------
 514 */
 515 static void
 516  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 517 {
 518     int8 shiftCount;
 519
 520     shiftCount = countLeadingZeros64( aSig );
 521     *zSigPtr = aSig<<shiftCount;
 522     *zExpPtr = 1 - shiftCount;
 523
 524 }
 525
 526 /*
 527 -------------------------------------------------------------------------------
 528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 529 extended double-precision floating-point value, returning the result.
 530 -------------------------------------------------------------------------------
 531 */
 532 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 533 {
 534     floatx80 z;
 535
 536     z.low = zSig;
 537     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 538     return z;
 539
 540 }
 541
 542 /*
 543 -------------------------------------------------------------------------------
 544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 546 and returns the proper extended double-precision floating-point value
 547 corresponding to the abstract input.  Ordinarily, the abstract value is
 548 rounded and packed into the extended double-precision format, with the
 549 inexact exception raised if the abstract input cannot be represented
 550 exactly.  If the abstract value is too large, however, the overflow and
 551 inexact exceptions are raised and an infinity or maximal finite value is
 552 returned.  If the abstract value is too small, the input value is rounded to
 553 a subnormal number, and the underflow and inexact exceptions are raised if
 554 the abstract input cannot be represented exactly as a subnormal extended
 555 double-precision floating-point number.
 556     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 557 number of bits as single or double precision, respectively.  Otherwise, the
 558 result is rounded to the full precision of the extended double-precision
 559 format.
 560     The input significand must be normalized or smaller.  If the input
 561 significand is not normalized, `zExp' must be 0; in that case, the result
 562 returned is a subnormal number, and it must not require rounding.  The
 563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 564 Floating-point Arithmetic.
 565 -------------------------------------------------------------------------------
 566 */
 567 static floatx80
 568  roundAndPackFloatx80(
 569      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 570  )
 571 {
 572     int8 roundingMode;
 573     flag roundNearestEven, increment, isTiny;
 574     int64 roundIncrement, roundMask, roundBits;
 575
 576     roundingMode = float_rounding_mode;
 577     roundNearestEven = ( roundingMode == float_round_nearest_even );
 578     if ( roundingPrecision == 80 ) goto precision80;
 579     if ( roundingPrecision == 64 ) {
 580         roundIncrement = LIT64( 0x0000000000000400 );
 581         roundMask = LIT64( 0x00000000000007FF );
 582     }
 583     else if ( roundingPrecision == 32 ) {
 584         roundIncrement = LIT64( 0x0000008000000000 );
 585         roundMask = LIT64( 0x000000FFFFFFFFFF );
 586     }
 587     else {
 588         goto precision80;
 589     }
 590     zSig0 |= ( zSig1 != 0 );
 591     if ( ! roundNearestEven ) {
 592         if ( roundingMode == float_round_to_zero ) {
 593             roundIncrement = 0;
 594         }
 595         else {
 596             roundIncrement = roundMask;
 597             if ( zSign ) {
 598                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 599             }
 600             else {
 601                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 602             }
 603         }
 604     }
 605     roundBits = zSig0 & roundMask;
 606     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 607         if (    ( 0x7FFE < zExp )
 608              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 609            ) {
 610             goto overflow;
 611         }
 612         if ( zExp <= 0 ) {
 613             isTiny =
 614                    ( float_detect_tininess == float_tininess_before_rounding )
 615                 || ( zExp < 0 )
 616                 || ( zSig0 <= zSig0 + roundIncrement );
 617             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 618             zExp = 0;
 619             roundBits = zSig0 & roundMask;
 620             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 621             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 622             zSig0 += roundIncrement;
 623             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 624             roundIncrement = roundMask + 1;
 625             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 626                 roundMask |= roundIncrement;
 627             }
 628             zSig0 &= ~ roundMask;
 629             return packFloatx80( zSign, zExp, zSig0 );
 630         }
 631     }
 632     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 633     zSig0 += roundIncrement;
 634     if ( zSig0 < roundIncrement ) {
 635         ++zExp;
 636         zSig0 = LIT64( 0x8000000000000000 );
 637     }
 638     roundIncrement = roundMask + 1;
 639     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 640         roundMask |= roundIncrement;
 641     }
 642     zSig0 &= ~ roundMask;
 643     if ( zSig0 == 0 ) zExp = 0;
 644     return packFloatx80( zSign, zExp, zSig0 );
 645  precision80:
 646     increment = ( (sbits64) zSig1 < 0 );
 647     if ( ! roundNearestEven ) {
 648         if ( roundingMode == float_round_to_zero ) {
 649             increment = 0;
 650         }
 651         else {
 652             if ( zSign ) {
 653                 increment = ( roundingMode == float_round_down ) && zSig1;
 654             }
 655             else {
 656                 increment = ( roundingMode == float_round_up ) && zSig1;
 657             }
 658         }
 659     }
 660     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 661         if (    ( 0x7FFE < zExp )
 662              || (    ( zExp == 0x7FFE )
 663                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 664                   && increment
 665                 )
 666            ) {
 667             roundMask = 0;
 668  overflow:
 669             float_raise( float_flag_overflow | float_flag_inexact );
 670             if (    ( roundingMode == float_round_to_zero )
 671                  || ( zSign && ( roundingMode == float_round_up ) )
 672                  || ( ! zSign && ( roundingMode == float_round_down ) )
 673                ) {
 674                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 675             }
 676             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 677         }
 678         if ( zExp <= 0 ) {
 679             isTiny =
 680                    ( float_detect_tininess == float_tininess_before_rounding )
 681                 || ( zExp < 0 )
 682                 || ! increment
 683                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 684             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 685             zExp = 0;
 686             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 687             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 688             if ( roundNearestEven ) {
 689                 increment = ( (sbits64) zSig1 < 0 );
 690             }
 691             else {
 692                 if ( zSign ) {
 693                     increment = ( roundingMode == float_round_down ) && zSig1;
 694                 }
 695                 else {
 696                     increment = ( roundingMode == float_round_up ) && zSig1;
 697                 }
 698             }
 699             if ( increment ) {
 700                 ++zSig0;
 701                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 702                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 703             }
 704             return packFloatx80( zSign, zExp, zSig0 );
 705         }
 706     }
 707     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 708     if ( increment ) {
 709         ++zSig0;
 710         if ( zSig0 == 0 ) {
 711             ++zExp;
 712             zSig0 = LIT64( 0x8000000000000000 );
 713         }
 714         else {
 715             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 716         }
 717     }
 718     else {
 719         if ( zSig0 == 0 ) zExp = 0;
 720     }
 721
 722     return packFloatx80( zSign, zExp, zSig0 );
 723 }
 724
 725 /*
 726 -------------------------------------------------------------------------------
 727 Takes an abstract floating-point value having sign `zSign', exponent
 728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 729 and returns the proper extended double-precision floating-point value
 730 corresponding to the abstract input.  This routine is just like
 731 `roundAndPackFloatx80' except that the input significand does not have to be
 732 normalized.
 733 -------------------------------------------------------------------------------
 734 */
 735 static floatx80
 736  normalizeRoundAndPackFloatx80(
 737      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 738  )
 739 {
 740     int8 shiftCount;
 741
 742     if ( zSig0 == 0 ) {
 743         zSig0 = zSig1;
 744         zSig1 = 0;
 745         zExp -= 64;
 746     }
 747     shiftCount = countLeadingZeros64( zSig0 );
 748     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 749     zExp -= shiftCount;
 750     return
 751         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 752
 753 }
 754
 755 #endif
 756
 757 #ifdef FLOAT128
 758
 759 /*
 760 -------------------------------------------------------------------------------
 761 Returns the least-significant 64 fraction bits of the quadruple-precision
 762 floating-point value `a'.
 763 -------------------------------------------------------------------------------
 764 */
 765 INLINE bits64 extractFloat128Frac1( float128 a )
 766 {
 767
 768     return a.low;
 769
 770 }
 771
 772 /*
 773 -------------------------------------------------------------------------------
 774 Returns the most-significant 48 fraction bits of the quadruple-precision
 775 floating-point value `a'.
 776 -------------------------------------------------------------------------------
 777 */
 778 INLINE bits64 extractFloat128Frac0( float128 a )
 779 {
 780
 781     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 782
 783 }
 784
 785 /*
 786 -------------------------------------------------------------------------------
 787 Returns the exponent bits of the quadruple-precision floating-point value
 788 `a'.
 789 -------------------------------------------------------------------------------
 790 */
 791 INLINE int32 extractFloat128Exp( float128 a )
 792 {
 793
 794     return ( a.high>>48 ) & 0x7FFF;
 795
 796 }
 797
 798 /*
 799 -------------------------------------------------------------------------------
 800 Returns the sign bit of the quadruple-precision floating-point value `a'.
 801 -------------------------------------------------------------------------------
 802 */
 803 INLINE flag extractFloat128Sign( float128 a )
 804 {
 805
 806     return a.high>>63;
 807
 808 }
 809
 810 /*
 811 -------------------------------------------------------------------------------
 812 Normalizes the subnormal quadruple-precision floating-point value
 813 represented by the denormalized significand formed by the concatenation of
 814 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 815 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 816 significand are stored at the location pointed to by `zSig0Ptr', and the
 817 least significant 64 bits of the normalized significand are stored at the
 818 location pointed to by `zSig1Ptr'.
 819 -------------------------------------------------------------------------------
 820 */
 821 static void
 822  normalizeFloat128Subnormal(
 823      bits64 aSig0,
 824      bits64 aSig1,
 825      int32 *zExpPtr,
 826      bits64 *zSig0Ptr,
 827      bits64 *zSig1Ptr
 828  )
 829 {
 830     int8 shiftCount;
 831
 832     if ( aSig0 == 0 ) {
 833         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 834         if ( shiftCount < 0 ) {
 835             *zSig0Ptr = aSig1>>( - shiftCount );
 836             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 837         }
 838         else {
 839             *zSig0Ptr = aSig1<<shiftCount;
 840             *zSig1Ptr = 0;
 841         }
 842         *zExpPtr = - shiftCount - 63;
 843     }
 844     else {
 845         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 846         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 847         *zExpPtr = 1 - shiftCount;
 848     }
 849
 850 }
 851
 852 /*
 853 -------------------------------------------------------------------------------
 854 Packs the sign `zSign', the exponent `zExp', and the significand formed
 855 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 856 floating-point value, returning the result.  After being shifted into the
 857 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 858 added together to form the most significant 32 bits of the result.  This
 859 means that any integer portion of `zSig0' will be added into the exponent.
 860 Since a properly normalized significand will have an integer portion equal
 861 to 1, the `zExp' input should be 1 less than the desired result exponent
 862 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 863 significand.
 864 -------------------------------------------------------------------------------
 865 */
 866 INLINE float128
 867  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 868 {
 869     float128 z;
 870
 871     z.low = zSig1;
 872     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 873     return z;
 874
 875 }
 876
 877 /*
 878 -------------------------------------------------------------------------------
 879 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 880 and extended significand formed by the concatenation of `zSig0', `zSig1',
 881 and `zSig2', and returns the proper quadruple-precision floating-point value
 882 corresponding to the abstract input.  Ordinarily, the abstract value is
 883 simply rounded and packed into the quadruple-precision format, with the
 884 inexact exception raised if the abstract input cannot be represented
 885 exactly.  If the abstract value is too large, however, the overflow and
 886 inexact exceptions are raised and an infinity or maximal finite value is
 887 returned.  If the abstract value is too small, the input value is rounded to
 888 a subnormal number, and the underflow and inexact exceptions are raised if
 889 the abstract input cannot be represented exactly as a subnormal quadruple-
 890 precision floating-point number.
 891     The input significand must be normalized or smaller.  If the input
 892 significand is not normalized, `zExp' must be 0; in that case, the result
 893 returned is a subnormal number, and it must not require rounding.  In the
 894 usual case that the input significand is normalized, `zExp' must be 1 less
 895 than the ``true'' floating-point exponent.  The handling of underflow and
 896 overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 897 -------------------------------------------------------------------------------
 898 */
 899 static float128
 900  roundAndPackFloat128(
 901      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 902 {
 903     int8 roundingMode;
 904     flag roundNearestEven, increment, isTiny;
 905
 906     roundingMode = float_rounding_mode;
 907     roundNearestEven = ( roundingMode == float_round_nearest_even );
 908     increment = ( (sbits64) zSig2 < 0 );
 909     if ( ! roundNearestEven ) {
 910         if ( roundingMode == float_round_to_zero ) {
 911             increment = 0;
 912         }
 913         else {
 914             if ( zSign ) {
 915                 increment = ( roundingMode == float_round_down ) && zSig2;
 916             }
 917             else {
 918                 increment = ( roundingMode == float_round_up ) && zSig2;
 919             }
 920         }
 921     }
 922     if ( 0x7FFD <= (bits32) zExp ) {
 923         if (    ( 0x7FFD < zExp )
 924              || (    ( zExp == 0x7FFD )
 925                   && eq128(
 926                          LIT64( 0x0001FFFFFFFFFFFF ),
 927                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 928                          zSig0,
 929                          zSig1
 930                      )
 931                   && increment
 932                 )
 933            ) {
 934             float_raise( float_flag_overflow | float_flag_inexact );
 935             if (    ( roundingMode == float_round_to_zero )
 936                  || ( zSign && ( roundingMode == float_round_up ) )
 937                  || ( ! zSign && ( roundingMode == float_round_down ) )
 938                ) {
 939                 return
 940                     packFloat128(
 941                         zSign,
 942                         0x7FFE,
 943                         LIT64( 0x0000FFFFFFFFFFFF ),
 944                         LIT64( 0xFFFFFFFFFFFFFFFF )
 945                     );
 946             }
 947             return packFloat128( zSign, 0x7FFF, 0, 0 );
 948         }
 949         if ( zExp < 0 ) {
 950             isTiny =
 951                    ( float_detect_tininess == float_tininess_before_rounding )
 952                 || ( zExp < -1 )
 953                 || ! increment
 954                 || lt128(
 955                        zSig0,
 956                        zSig1,
 957                        LIT64( 0x0001FFFFFFFFFFFF ),
 958                        LIT64( 0xFFFFFFFFFFFFFFFF )
 959                    );
 960             shift128ExtraRightJamming(
 961                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 962             zExp = 0;
 963             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 964             if ( roundNearestEven ) {
 965                 increment = ( (sbits64) zSig2 < 0 );
 966             }
 967             else {
 968                 if ( zSign ) {
 969                     increment = ( roundingMode == float_round_down ) && zSig2;
 970                 }
 971                 else {
 972                     increment = ( roundingMode == float_round_up ) && zSig2;
 973                 }
 974             }
 975         }
 976     }
 977     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
 978     if ( increment ) {
 979         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 980         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 981     }
 982     else {
 983         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 984     }
 985     return packFloat128( zSign, zExp, zSig0, zSig1 );
 986
 987 }
 988
 989 /*
 990 -------------------------------------------------------------------------------
 991 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 992 and significand formed by the concatenation of `zSig0' and `zSig1', and
 993 returns the proper quadruple-precision floating-point value corresponding to
 994 the abstract input.  This routine is just like `roundAndPackFloat128' except
 995 that the input significand has fewer bits and does not have to be normalized
 996 in any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 997 point exponent.
 998 -------------------------------------------------------------------------------
 999 */
1000 static float128
1001  normalizeRoundAndPackFloat128(
1002      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1003 {
1004     int8 shiftCount;
1005     bits64 zSig2;
1006
1007     if ( zSig0 == 0 ) {
1008         zSig0 = zSig1;
1009         zSig1 = 0;
1010         zExp -= 64;
1011     }
1012     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1013     if ( 0 <= shiftCount ) {
1014         zSig2 = 0;
1015         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1016     }
1017     else {
1018         shift128ExtraRightJamming(
1019             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1020     }
1021     zExp -= shiftCount;
1022     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1023
1024 }
1025
1026 #endif
1027
1028 /*
1029 -------------------------------------------------------------------------------
1030 Returns the result of converting the 32-bit two's complement integer `a' to
1031 the single-precision floating-point format.  The conversion is performed
1032 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1033 -------------------------------------------------------------------------------
1034 */
1035 float32 int32_to_float32( int32 a )
1036 {
1037     flag zSign;
1038
1039     if ( a == 0 ) return 0;
1040     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1041     zSign = ( a < 0 );
1042     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1043
1044 }
1045
1046 /*
1047 -------------------------------------------------------------------------------
1048 Returns the result of converting the 32-bit two's complement integer `a' to
1049 the double-precision floating-point format.  The conversion is performed
1050 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1051 -------------------------------------------------------------------------------
1052 */
1053 float64 int32_to_float64( int32 a )
1054 {
1055     flag aSign;
1056     uint32 absA;
1057     int8 shiftCount;
1058     bits64 zSig;
1059
1060     if ( a == 0 ) return 0;
1061     aSign = ( a < 0 );
1062     absA = aSign ? - a : a;
1063     shiftCount = countLeadingZeros32( absA ) + 21;
1064     zSig = absA;
1065     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
1066
1067 }
1068
1069 #ifdef FLOATX80
1070
1071 /*
1072 -------------------------------------------------------------------------------
1073 Returns the result of converting the 32-bit two's complement integer `a'
1074 to the extended double-precision floating-point format.  The conversion
1075 is performed according to the IEC/IEEE Standard for Binary Floating-point
1076 Arithmetic.
1077 -------------------------------------------------------------------------------
1078 */
1079 floatx80 int32_to_floatx80( int32 a )
1080 {
1081     flag zSign;
1082     uint32 absA;
1083     int8 shiftCount;
1084     bits64 zSig;
1085
1086     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1087     zSign = ( a < 0 );
1088     absA = zSign ? - a : a;
1089     shiftCount = countLeadingZeros32( absA ) + 32;
1090     zSig = absA;
1091     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1092
1093 }
1094
1095 #endif
1096
1097 #ifdef FLOAT128
1098
1099 /*
1100 -------------------------------------------------------------------------------
1101 Returns the result of converting the 32-bit two's complement integer `a' to
1102 the quadruple-precision floating-point format.  The conversion is performed
1103 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1104 -------------------------------------------------------------------------------
1105 */
1106 float128 int32_to_float128( int32 a )
1107 {
1108     flag zSign;
1109     uint32 absA;
1110     int8 shiftCount;
1111     bits64 zSig0;
1112
1113     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1114     zSign = ( a < 0 );
1115     absA = zSign ? - a : a;
1116     shiftCount = countLeadingZeros32( absA ) + 17;
1117     zSig0 = absA;
1118     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1119
1120 }
1121
1122 #endif
1123
1124 /*
1125 -------------------------------------------------------------------------------
1126 Returns the result of converting the single-precision floating-point value
1127 `a' to the 32-bit two's complement integer format.  The conversion is
1128 performed according to the IEC/IEEE Standard for Binary Floating-point
1129 Arithmetic---which means in particular that the conversion is rounded
1130 according to the current rounding mode.  If `a' is a NaN, the largest
1131 positive integer is returned.  Otherwise, if the conversion overflows, the
1132 largest integer with the same sign as `a' is returned.
1133 -------------------------------------------------------------------------------
1134 */
1135 int32 float32_to_int32( float32 a )
1136 {
1137     flag aSign;
1138     int16 aExp, shiftCount;
1139     bits32 aSig;
1140     bits64 zSig;
1141
1142     aSig = extractFloat32Frac( a );
1143     aExp = extractFloat32Exp( a );
1144     aSign = extractFloat32Sign( a );
1145     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1146     if ( aExp ) aSig |= 0x00800000;
1147     shiftCount = 0xAF - aExp;
1148     zSig = aSig;
1149     zSig <<= 32;
1150     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
1151     return roundAndPackInt32( aSign, zSig );
1152
1153 }
1154
1155 /*
1156 -------------------------------------------------------------------------------
1157 Returns the result of converting the single-precision floating-point value
1158 `a' to the 32-bit two's complement integer format.  The conversion is
1159 performed according to the IEC/IEEE Standard for Binary Floating-point
1160 Arithmetic, except that the conversion is always rounded toward zero.  If
1161 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1162 conversion overflows, the largest integer with the same sign as `a' is
1163 returned.
1164 -------------------------------------------------------------------------------
1165 */
1166 int32 float32_to_int32_round_to_zero( float32 a )
1167 {
1168     flag aSign;
1169     int16 aExp, shiftCount;
1170     bits32 aSig;
1171     int32 z;
1172
1173     aSig = extractFloat32Frac( a );
1174     aExp = extractFloat32Exp( a );
1175     aSign = extractFloat32Sign( a );
1176     shiftCount = aExp - 0x9E;
1177     if ( 0 <= shiftCount ) {
1178         if ( a == 0xCF000000 ) return 0x80000000;
1179         float_raise( float_flag_invalid );
1180         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1181         return 0x80000000;
1182     }
1183     else if ( aExp <= 0x7E ) {
1184         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1185         return 0;
1186     }
1187     aSig = ( aSig | 0x00800000 )<<8;
1188     z = aSig>>( - shiftCount );
1189     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1190         float_exception_flags |= float_flag_inexact;
1191     }
1192     return aSign ? - z : z;
1193
1194 }
1195
1196 /*
1197 -------------------------------------------------------------------------------
1198 Returns the result of converting the single-precision floating-point value
1199 `a' to the double-precision floating-point format.  The conversion is
1200 performed according to the IEC/IEEE Standard for Binary Floating-point
1201 Arithmetic.
1202 -------------------------------------------------------------------------------
1203 */
1204 float64 float32_to_float64( float32 a )
1205 {
1206     flag aSign;
1207     int16 aExp;
1208     bits32 aSig;
1209
1210     aSig = extractFloat32Frac( a );
1211     aExp = extractFloat32Exp( a );
1212     aSign = extractFloat32Sign( a );
1213     if ( aExp == 0xFF ) {
1214         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1215         return packFloat64( aSign, 0x7FF, 0 );
1216     }
1217     if ( aExp == 0 ) {
1218         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1219         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1220         --aExp;
1221     }
1222     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1223
1224 }
1225
1226 #ifdef FLOATX80
1227
1228 /*
1229 -------------------------------------------------------------------------------
1230 Returns the result of converting the single-precision floating-point value
1231 `a' to the extended double-precision floating-point format.  The conversion
1232 is performed according to the IEC/IEEE Standard for Binary Floating-point
1233 Arithmetic.
1234 -------------------------------------------------------------------------------
1235 */
1236 floatx80 float32_to_floatx80( float32 a )
1237 {
1238     flag aSign;
1239     int16 aExp;
1240     bits32 aSig;
1241
1242     aSig = extractFloat32Frac( a );
1243     aExp = extractFloat32Exp( a );
1244     aSign = extractFloat32Sign( a );
1245     if ( aExp == 0xFF ) {
1246         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1247         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1248     }
1249     if ( aExp == 0 ) {
1250         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1251         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1252     }
1253     aSig |= 0x00800000;
1254     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1255
1256 }
1257
1258 #endif
1259
1260 #ifdef FLOAT128
1261
1262 /*
1263 -------------------------------------------------------------------------------
1264 Returns the result of converting the single-precision floating-point value
1265 `a' to the double-precision floating-point format.  The conversion is
1266 performed according to the IEC/IEEE Standard for Binary Floating-point
1267 Arithmetic.
1268 -------------------------------------------------------------------------------
1269 */
1270 float128 float32_to_float128( float32 a )
1271 {
1272     flag aSign;
1273     int16 aExp;
1274     bits32 aSig;
1275
1276     aSig = extractFloat32Frac( a );
1277     aExp = extractFloat32Exp( a );
1278     aSign = extractFloat32Sign( a );
1279     if ( aExp == 0xFF ) {
1280         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1281         return packFloat128( aSign, 0x7FFF, 0, 0 );
1282     }
1283     if ( aExp == 0 ) {
1284         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1286         --aExp;
1287     }
1288     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1289
1290 }
1291
1292 #endif
1293
1294 /*
1295 -------------------------------------------------------------------------------
1296 Rounds the single-precision floating-point value `a' to an integer, and
1297 returns the result as a single-precision floating-point value.  The
1298 operation is performed according to the IEC/IEEE Standard for Binary
1299 Floating-point Arithmetic.
1300 -------------------------------------------------------------------------------
1301 */
1302 float32 float32_round_to_int( float32 a )
1303 {
1304     flag aSign;
1305     int16 aExp;
1306     bits32 lastBitMask, roundBitsMask;
1307     int8 roundingMode;
1308     float32 z;
1309
1310     aExp = extractFloat32Exp( a );
1311     if ( 0x96 <= aExp ) {
1312         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1313             return propagateFloat32NaN( a, a );
1314         }
1315         return a;
1316     }
1317     if ( aExp <= 0x7E ) {
1318         if ( (bits32) ( a<<1 ) == 0 ) return a;
1319         float_exception_flags |= float_flag_inexact;
1320         aSign = extractFloat32Sign( a );
1321         switch ( float_rounding_mode ) {
1322          case float_round_nearest_even:
1323             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1324                 return packFloat32( aSign, 0x7F, 0 );
1325             }
1326             break;
1327          case float_round_down:
1328             return aSign ? 0xBF800000 : 0;
1329          case float_round_up:
1330             return aSign ? 0x80000000 : 0x3F800000;
1331         }
1332         return packFloat32( aSign, 0, 0 );
1333     }
1334     lastBitMask = 1;
1335     lastBitMask <<= 0x96 - aExp;
1336     roundBitsMask = lastBitMask - 1;
1337     z = a;
1338     roundingMode = float_rounding_mode;
1339     if ( roundingMode == float_round_nearest_even ) {
1340         z += lastBitMask>>1;
1341         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1342     }
1343     else if ( roundingMode != float_round_to_zero ) {
1344         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1345             z += roundBitsMask;
1346         }
1347     }
1348     z &= ~ roundBitsMask;
1349     if ( z != a ) float_exception_flags |= float_flag_inexact;
1350     return z;
1351
1352 }
1353
1354 /*
1355 -------------------------------------------------------------------------------
1356 Returns the result of adding the absolute values of the single-precision
1357 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1358 before being returned.  `zSign' is ignored if the result is a NaN.  The
1359 addition is performed according to the IEC/IEEE Standard for Binary
1360 Floating-point Arithmetic.
1361 -------------------------------------------------------------------------------
1362 */
1363 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1364 {
1365     int16 aExp, bExp, zExp;
1366     bits32 aSig, bSig, zSig;
1367     int16 expDiff;
1368
1369     aSig = extractFloat32Frac( a );
1370     aExp = extractFloat32Exp( a );
1371     bSig = extractFloat32Frac( b );
1372     bExp = extractFloat32Exp( b );
1373     expDiff = aExp - bExp;
1374     aSig <<= 6;
1375     bSig <<= 6;
1376     if ( 0 < expDiff ) {
1377         if ( aExp == 0xFF ) {
1378             if ( aSig ) return propagateFloat32NaN( a, b );
1379             return a;
1380         }
1381         if ( bExp == 0 ) {
1382             --expDiff;
1383         }
1384         else {
1385             bSig |= 0x20000000;
1386         }
1387         shift32RightJamming( bSig, expDiff, &bSig );
1388         zExp = aExp;
1389     }
1390     else if ( expDiff < 0 ) {
1391         if ( bExp == 0xFF ) {
1392             if ( bSig ) return propagateFloat32NaN( a, b );
1393             return packFloat32( zSign, 0xFF, 0 );
1394         }
1395         if ( aExp == 0 ) {
1396             ++expDiff;
1397         }
1398         else {
1399             aSig |= 0x20000000;
1400         }
1401         shift32RightJamming( aSig, - expDiff, &aSig );
1402         zExp = bExp;
1403     }
1404     else {
1405         if ( aExp == 0xFF ) {
1406             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1407             return a;
1408         }
1409         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1410         zSig = 0x40000000 + aSig + bSig;
1411         zExp = aExp;
1412         goto roundAndPack;
1413     }
1414     aSig |= 0x20000000;
1415     zSig = ( aSig + bSig )<<1;
1416     --zExp;
1417     if ( (sbits32) zSig < 0 ) {
1418         zSig = aSig + bSig;
1419         ++zExp;
1420     }
1421  roundAndPack:
1422     return roundAndPackFloat32( zSign, zExp, zSig );
1423
1424 }
1425
1426 /*
1427 -------------------------------------------------------------------------------
1428 Returns the result of subtracting the absolute values of the single-
1429 precision floating-point values `a' and `b'.  If `zSign' is true, the
1430 difference is negated before being returned.  `zSign' is ignored if the
1431 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1432 Standard for Binary Floating-point Arithmetic.
1433 -------------------------------------------------------------------------------
1434 */
1435 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1436 {
1437     int16 aExp, bExp, zExp;
1438     bits32 aSig, bSig, zSig;
1439     int16 expDiff;
1440
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     bSig = extractFloat32Frac( b );
1444     bExp = extractFloat32Exp( b );
1445     expDiff = aExp - bExp;
1446     aSig <<= 7;
1447     bSig <<= 7;
1448     if ( 0 < expDiff ) goto aExpBigger;
1449     if ( expDiff < 0 ) goto bExpBigger;
1450     if ( aExp == 0xFF ) {
1451         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1452         float_raise( float_flag_invalid );
1453         return float32_default_nan;
1454     }
1455     if ( aExp == 0 ) {
1456         aExp = 1;
1457         bExp = 1;
1458     }
1459     if ( bSig < aSig ) goto aBigger;
1460     if ( aSig < bSig ) goto bBigger;
1461     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1462  bExpBigger:
1463     if ( bExp == 0xFF ) {
1464         if ( bSig ) return propagateFloat32NaN( a, b );
1465         return packFloat32( zSign ^ 1, 0xFF, 0 );
1466     }
1467     if ( aExp == 0 ) {
1468         ++expDiff;
1469     }
1470     else {
1471         aSig |= 0x40000000;
1472     }
1473     shift32RightJamming( aSig, - expDiff, &aSig );
1474     bSig |= 0x40000000;
1475  bBigger:
1476     zSig = bSig - aSig;
1477     zExp = bExp;
1478     zSign ^= 1;
1479     goto normalizeRoundAndPack;
1480  aExpBigger:
1481     if ( aExp == 0xFF ) {
1482         if ( aSig ) return propagateFloat32NaN( a, b );
1483         return a;
1484     }
1485     if ( bExp == 0 ) {
1486         --expDiff;
1487     }
1488     else {
1489         bSig |= 0x40000000;
1490     }
1491     shift32RightJamming( bSig, expDiff, &bSig );
1492     aSig |= 0x40000000;
1493  aBigger:
1494     zSig = aSig - bSig;
1495     zExp = aExp;
1496  normalizeRoundAndPack:
1497     --zExp;
1498     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1499
1500 }
1501
1502 /*
1503 -------------------------------------------------------------------------------
1504 Returns the result of adding the single-precision floating-point values `a'
1505 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1506 Binary Floating-point Arithmetic.
1507 -------------------------------------------------------------------------------
1508 */
1509 float32 float32_add( float32 a, float32 b )
1510 {
1511     flag aSign, bSign;
1512
1513     aSign = extractFloat32Sign( a );
1514     bSign = extractFloat32Sign( b );
1515     if ( aSign == bSign ) {
1516         return addFloat32Sigs( a, b, aSign );
1517     }
1518     else {
1519         return subFloat32Sigs( a, b, aSign );
1520     }
1521
1522 }
1523
1524 /*
1525 -------------------------------------------------------------------------------
1526 Returns the result of subtracting the single-precision floating-point values
1527 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1528 for Binary Floating-point Arithmetic.
1529 -------------------------------------------------------------------------------
1530 */
1531 float32 float32_sub( float32 a, float32 b )
1532 {
1533     flag aSign, bSign;
1534
1535     aSign = extractFloat32Sign( a );
1536     bSign = extractFloat32Sign( b );
1537     if ( aSign == bSign ) {
1538         return subFloat32Sigs( a, b, aSign );
1539     }
1540     else {
1541         return addFloat32Sigs( a, b, aSign );
1542     }
1543
1544 }
1545
1546 /*
1547 -------------------------------------------------------------------------------
1548 Returns the result of multiplying the single-precision floating-point values
1549 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1550 for Binary Floating-point Arithmetic.
1551 -------------------------------------------------------------------------------
1552 */
1553 float32 float32_mul( float32 a, float32 b )
1554 {
1555     flag aSign, bSign, zSign;
1556     int16 aExp, bExp, zExp;
1557     bits32 aSig, bSig;
1558     bits64 zSig64;
1559     bits32 zSig;
1560
1561     aSig = extractFloat32Frac( a );
1562     aExp = extractFloat32Exp( a );
1563     aSign = extractFloat32Sign( a );
1564     bSig = extractFloat32Frac( b );
1565     bExp = extractFloat32Exp( b );
1566     bSign = extractFloat32Sign( b );
1567     zSign = aSign ^ bSign;
1568     if ( aExp == 0xFF ) {
1569         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1570             return propagateFloat32NaN( a, b );
1571         }
1572         if ( ( bExp | bSig ) == 0 ) {
1573             float_raise( float_flag_invalid );
1574             return float32_default_nan;
1575         }
1576         return packFloat32( zSign, 0xFF, 0 );
1577     }
1578     if ( bExp == 0xFF ) {
1579         if ( bSig ) return propagateFloat32NaN( a, b );
1580         if ( ( aExp | aSig ) == 0 ) {
1581             float_raise( float_flag_invalid );
1582             return float32_default_nan;
1583         }
1584         return packFloat32( zSign, 0xFF, 0 );
1585     }
1586     if ( aExp == 0 ) {
1587         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589     }
1590     if ( bExp == 0 ) {
1591         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1592         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1593     }
1594     zExp = aExp + bExp - 0x7F;
1595     aSig = ( aSig | 0x00800000 )<<7;
1596     bSig = ( bSig | 0x00800000 )<<8;
1597     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1598     zSig = zSig64;
1599     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1600         zSig <<= 1;
1601         --zExp;
1602     }
1603     return roundAndPackFloat32( zSign, zExp, zSig );
1604
1605 }
1606
1607 /*
1608 -------------------------------------------------------------------------------
1609 Returns the result of dividing the single-precision floating-point value `a'
1610 by the corresponding value `b'.  The operation is performed according to the
1611 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1612 -------------------------------------------------------------------------------
1613 */
1614 float32 float32_div( float32 a, float32 b )
1615 {
1616     flag aSign, bSign, zSign;
1617     int16 aExp, bExp, zExp;
1618     bits32 aSig, bSig, zSig;
1619
1620     aSig = extractFloat32Frac( a );
1621     aExp = extractFloat32Exp( a );
1622     aSign = extractFloat32Sign( a );
1623     bSig = extractFloat32Frac( b );
1624     bExp = extractFloat32Exp( b );
1625     bSign = extractFloat32Sign( b );
1626     zSign = aSign ^ bSign;
1627     if ( aExp == 0xFF ) {
1628         if ( aSig ) return propagateFloat32NaN( a, b );
1629         if ( bExp == 0xFF ) {
1630             if ( bSig ) return propagateFloat32NaN( a, b );
1631             float_raise( float_flag_invalid );
1632             return float32_default_nan;
1633         }
1634         return packFloat32( zSign, 0xFF, 0 );
1635     }
1636     if ( bExp == 0xFF ) {
1637         if ( bSig ) return propagateFloat32NaN( a, b );
1638         return packFloat32( zSign, 0, 0 );
1639     }
1640     if ( bExp == 0 ) {
1641         if ( bSig == 0 ) {
1642             if ( ( aExp | aSig ) == 0 ) {
1643                 float_raise( float_flag_invalid );
1644                 return float32_default_nan;
1645             }
1646             float_raise( float_flag_divbyzero );
1647             return packFloat32( zSign, 0xFF, 0 );
1648         }
1649         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1650     }
1651     if ( aExp == 0 ) {
1652         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1653         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1654     }
1655     zExp = aExp - bExp + 0x7D;
1656     aSig = ( aSig | 0x00800000 )<<7;
1657     bSig = ( bSig | 0x00800000 )<<8;
1658     if ( bSig <= ( aSig + aSig ) ) {
1659         aSig >>= 1;
1660         ++zExp;
1661     }
1662     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1663     if ( ( zSig & 0x3F ) == 0 ) {
1664         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1665     }
1666     return roundAndPackFloat32( zSign, zExp, zSig );
1667
1668 }
1669
1670 /*
1671 -------------------------------------------------------------------------------
1672 Returns the remainder of the single-precision floating-point value `a'
1673 with respect to the corresponding value `b'.  The operation is performed
1674 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1675 -------------------------------------------------------------------------------
1676 */
1677 float32 float32_rem( float32 a, float32 b )
1678 {
1679     flag aSign, bSign, zSign;
1680     int16 aExp, bExp, expDiff;
1681     bits32 aSig, bSig;
1682     bits32 q;
1683     bits64 aSig64, bSig64, q64;
1684     bits32 alternateASig;
1685     sbits32 sigMean;
1686
1687     aSig = extractFloat32Frac( a );
1688     aExp = extractFloat32Exp( a );
1689     aSign = extractFloat32Sign( a );
1690     bSig = extractFloat32Frac( b );
1691     bExp = extractFloat32Exp( b );
1692     bSign = extractFloat32Sign( b );
1693     if ( aExp == 0xFF ) {
1694         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1695             return propagateFloat32NaN( a, b );
1696         }
1697         float_raise( float_flag_invalid );
1698         return float32_default_nan;
1699     }
1700     if ( bExp == 0xFF ) {
1701         if ( bSig ) return propagateFloat32NaN( a, b );
1702         return a;
1703     }
1704     if ( bExp == 0 ) {
1705         if ( bSig == 0 ) {
1706             float_raise( float_flag_invalid );
1707             return float32_default_nan;
1708         }
1709         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1710     }
1711     if ( aExp == 0 ) {
1712         if ( aSig == 0 ) return a;
1713         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1714     }
1715     expDiff = aExp - bExp;
1716     aSig |= 0x00800000;
1717     bSig |= 0x00800000;
1718     if ( expDiff < 32 ) {
1719         aSig <<= 8;
1720         bSig <<= 8;
1721         if ( expDiff < 0 ) {
1722             if ( expDiff < -1 ) return a;
1723             aSig >>= 1;
1724         }
1725         q = ( bSig <= aSig );
1726         if ( q ) aSig -= bSig;
1727         if ( 0 < expDiff ) {
1728             q = ( ( (bits64) aSig )<<32 ) / bSig;
1729             q >>= 32 - expDiff;
1730             bSig >>= 2;
1731             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1732         }
1733         else {
1734             aSig >>= 2;
1735             bSig >>= 2;
1736         }
1737     }
1738     else {
1739         if ( bSig <= aSig ) aSig -= bSig;
1740         aSig64 = ( (bits64) aSig )<<40;
1741         bSig64 = ( (bits64) bSig )<<40;
1742         expDiff -= 64;
1743         while ( 0 < expDiff ) {
1744             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1745             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1746             aSig64 = - ( ( bSig * q64 )<<38 );
1747             expDiff -= 62;
1748         }
1749         expDiff += 64;
1750         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1751         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1752         q = q64>>( 64 - expDiff );
1753         bSig <<= 6;
1754         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1755     }
1756     do {
1757         alternateASig = aSig;
1758         ++q;
1759         aSig -= bSig;
1760     } while ( 0 <= (sbits32) aSig );
1761     sigMean = aSig + alternateASig;
1762     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1763         aSig = alternateASig;
1764     }
1765     zSign = ( (sbits32) aSig < 0 );
1766     if ( zSign ) aSig = - aSig;
1767     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1768
1769 }
1770
1771 /*
1772 -------------------------------------------------------------------------------
1773 Returns the square root of the single-precision floating-point value `a'.
1774 The operation is performed according to the IEC/IEEE Standard for Binary
1775 Floating-point Arithmetic.
1776 -------------------------------------------------------------------------------
1777 */
1778 float32 float32_sqrt( float32 a )
1779 {
1780     flag aSign;
1781     int16 aExp, zExp;
1782     bits32 aSig, zSig;
1783     bits64 rem, term;
1784
1785     aSig = extractFloat32Frac( a );
1786     aExp = extractFloat32Exp( a );
1787     aSign = extractFloat32Sign( a );
1788     if ( aExp == 0xFF ) {
1789         if ( aSig ) return propagateFloat32NaN( a, 0 );
1790         if ( ! aSign ) return a;
1791         float_raise( float_flag_invalid );
1792         return float32_default_nan;
1793     }
1794     if ( aSign ) {
1795         if ( ( aExp | aSig ) == 0 ) return a;
1796         float_raise( float_flag_invalid );
1797         return float32_default_nan;
1798     }
1799     if ( aExp == 0 ) {
1800         if ( aSig == 0 ) return 0;
1801         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1802     }
1803     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1804     aSig = ( aSig | 0x00800000 )<<8;
1805     zSig = estimateSqrt32( aExp, aSig ) + 2;
1806     if ( ( zSig & 0x7F ) <= 5 ) {
1807         if ( zSig < 2 ) {
1808             zSig = 0xFFFFFFFF;
1809         }
1810         else {
1811             aSig >>= aExp & 1;
1812             term = ( (bits64) zSig ) * zSig;
1813             rem = ( ( (bits64) aSig )<<32 ) - term;
1814             while ( (sbits64) rem < 0 ) {
1815                 --zSig;
1816                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1817             }
1818             zSig |= ( rem != 0 );
1819         }
1820     }
1821     shift32RightJamming( zSig, 1, &zSig );
1822     return roundAndPackFloat32( 0, zExp, zSig );
1823
1824 }
1825
1826 /*
1827 -------------------------------------------------------------------------------
1828 Returns 1 if the single-precision floating-point value `a' is equal to the
1829 corresponding value `b', and 0 otherwise.  The comparison is performed
1830 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1831 -------------------------------------------------------------------------------
1832 */
1833 flag float32_eq( float32 a, float32 b )
1834 {
1835
1836     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1837          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1838        ) {
1839         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1840             float_raise( float_flag_invalid );
1841         }
1842         return 0;
1843     }
1844     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1845
1846 }
1847
1848 /*
1849 -------------------------------------------------------------------------------
1850 Returns 1 if the single-precision floating-point value `a' is less than or
1851 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1852 performed according to the IEC/IEEE Standard for Binary Floating-point
1853 Arithmetic.
1854 -------------------------------------------------------------------------------
1855 */
1856 flag float32_le( float32 a, float32 b )
1857 {
1858     flag aSign, bSign;
1859
1860     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1861          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1862        ) {
1863         float_raise( float_flag_invalid );
1864         return 0;
1865     }
1866     aSign = extractFloat32Sign( a );
1867     bSign = extractFloat32Sign( b );
1868     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1869     return ( a == b ) || ( aSign ^ ( a < b ) );
1870
1871 }
1872
1873 /*
1874 -------------------------------------------------------------------------------
1875 Returns 1 if the single-precision floating-point value `a' is less than
1876 the corresponding value `b', and 0 otherwise.  The comparison is performed
1877 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1878 -------------------------------------------------------------------------------
1879 */
1880 flag float32_lt( float32 a, float32 b )
1881 {
1882     flag aSign, bSign;
1883
1884     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1885          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1886        ) {
1887         float_raise( float_flag_invalid );
1888         return 0;
1889     }
1890     aSign = extractFloat32Sign( a );
1891     bSign = extractFloat32Sign( b );
1892     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1893     return ( a != b ) && ( aSign ^ ( a < b ) );
1894
1895 }
1896
1897 /*
1898 -------------------------------------------------------------------------------
1899 Returns 1 if the single-precision floating-point value `a' is equal to the
1900 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1901 if either operand is a NaN.  Otherwise, the comparison is performed
1902 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1903 -------------------------------------------------------------------------------
1904 */
1905 flag float32_eq_signaling( float32 a, float32 b )
1906 {
1907
1908     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1909          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1910        ) {
1911         float_raise( float_flag_invalid );
1912         return 0;
1913     }
1914     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1915
1916 }
1917
1918 /*
1919 -------------------------------------------------------------------------------
1920 Returns 1 if the single-precision floating-point value `a' is less than or
1921 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1922 cause an exception.  Otherwise, the comparison is performed according to the
1923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1924 -------------------------------------------------------------------------------
1925 */
1926 flag float32_le_quiet( float32 a, float32 b )
1927 {
1928     flag aSign, bSign;
1929     //int16 aExp, bExp;
1930
1931     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1932          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1933        ) {
1934         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1935             float_raise( float_flag_invalid );
1936         }
1937         return 0;
1938     }
1939     aSign = extractFloat32Sign( a );
1940     bSign = extractFloat32Sign( b );
1941     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1942     return ( a == b ) || ( aSign ^ ( a < b ) );
1943
1944 }
1945
1946 /*
1947 -------------------------------------------------------------------------------
1948 Returns 1 if the single-precision floating-point value `a' is less than
1949 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1950 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1951 Standard for Binary Floating-point Arithmetic.
1952 -------------------------------------------------------------------------------
1953 */
1954 flag float32_lt_quiet( float32 a, float32 b )
1955 {
1956     flag aSign, bSign;
1957
1958     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1959          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1960        ) {
1961         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1962             float_raise( float_flag_invalid );
1963         }
1964         return 0;
1965     }
1966     aSign = extractFloat32Sign( a );
1967     bSign = extractFloat32Sign( b );
1968     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1969     return ( a != b ) && ( aSign ^ ( a < b ) );
1970
1971 }
1972
1973 /*
1974 -------------------------------------------------------------------------------
1975 Returns the result of converting the double-precision floating-point value
1976 `a' to the 32-bit two's complement integer format.  The conversion is
1977 performed according to the IEC/IEEE Standard for Binary Floating-point
1978 Arithmetic---which means in particular that the conversion is rounded
1979 according to the current rounding mode.  If `a' is a NaN, the largest
1980 positive integer is returned.  Otherwise, if the conversion overflows, the
1981 largest integer with the same sign as `a' is returned.
1982 -------------------------------------------------------------------------------
1983 */
1984 int32 float64_to_int32( float64 a )
1985 {
1986     flag aSign;
1987     int16 aExp, shiftCount;
1988     bits64 aSig;
1989
1990     aSig = extractFloat64Frac( a );
1991     aExp = extractFloat64Exp( a );
1992     aSign = extractFloat64Sign( a );
1993     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1994     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1995     shiftCount = 0x42C - aExp;
1996     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1997     return roundAndPackInt32( aSign, aSig );
1998
1999 }
2000
2001 /*
2002 -------------------------------------------------------------------------------
2003 Returns the result of converting the double-precision floating-point value
2004 `a' to the 32-bit two's complement integer format.  The conversion is
2005 performed according to the IEC/IEEE Standard for Binary Floating-point
2006 Arithmetic, except that the conversion is always rounded toward zero.  If
2007 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2008 conversion overflows, the largest integer with the same sign as `a' is
2009 returned.
2010 -------------------------------------------------------------------------------
2011 */
2012 int32 float64_to_int32_round_to_zero( float64 a )
2013 {
2014     flag aSign;
2015     int16 aExp, shiftCount;
2016     bits64 aSig, savedASig;
2017     int32 z;
2018
2019     aSig = extractFloat64Frac( a );
2020     aExp = extractFloat64Exp( a );
2021     aSign = extractFloat64Sign( a );
2022     shiftCount = 0x433 - aExp;
2023     if ( shiftCount < 21 ) {
2024         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2025         goto invalid;
2026     }
2027     else if ( 52 < shiftCount ) {
2028         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2029         return 0;
2030     }
2031     aSig |= LIT64( 0x0010000000000000 );
2032     savedASig = aSig;
2033     aSig >>= shiftCount;
2034     z = aSig;
2035     if ( aSign ) z = - z;
2036     if ( ( z < 0 ) ^ aSign ) {
2037  invalid:
2038         float_exception_flags |= float_flag_invalid;
2039         return aSign ? 0x80000000 : 0x7FFFFFFF;
2040     }
2041     if ( ( aSig<<shiftCount ) != savedASig ) {
2042         float_exception_flags |= float_flag_inexact;
2043     }
2044     return z;
2045
2046 }
2047
2048 /*
2049 -------------------------------------------------------------------------------
2050 Returns the result of converting the double-precision floating-point value
2051 `a' to the 32-bit two's complement unsigned integer format.  The conversion
2052 is performed according to the IEC/IEEE Standard for Binary Floating-point
2053 Arithmetic---which means in particular that the conversion is rounded
2054 according to the current rounding mode.  If `a' is a NaN, the largest
2055 positive integer is returned.  Otherwise, if the conversion overflows, the
2056 largest positive integer is returned.
2057 -------------------------------------------------------------------------------
2058 */
2059 int32 float64_to_uint32( float64 a )
2060 {
2061     flag aSign;
2062     int16 aExp, shiftCount;
2063     bits64 aSig;
2064
2065     aSig = extractFloat64Frac( a );
2066     aExp = extractFloat64Exp( a );
2067     aSign = 0; //extractFloat64Sign( a );
2068     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2069     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2070     shiftCount = 0x42C - aExp;
2071     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2072     return roundAndPackInt32( aSign, aSig );
2073 }
2074
2075 /*
2076 -------------------------------------------------------------------------------
2077 Returns the result of converting the double-precision floating-point value
2078 `a' to the 32-bit two's complement integer format.  The conversion is
2079 performed according to the IEC/IEEE Standard for Binary Floating-point
2080 Arithmetic, except that the conversion is always rounded toward zero.  If
2081 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2082 conversion overflows, the largest positive integer is returned.
2083 -------------------------------------------------------------------------------
2084 */
2085 int32 float64_to_uint32_round_to_zero( float64 a )
2086 {
2087     flag aSign;
2088     int16 aExp, shiftCount;
2089     bits64 aSig, savedASig;
2090     int32 z;
2091
2092     aSig = extractFloat64Frac( a );
2093     aExp = extractFloat64Exp( a );
2094     aSign = extractFloat64Sign( a );
2095     shiftCount = 0x433 - aExp;
2096     if ( shiftCount < 21 ) {
2097         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2098         goto invalid;
2099     }
2100     else if ( 52 < shiftCount ) {
2101         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2102         return 0;
2103     }
2104     aSig |= LIT64( 0x0010000000000000 );
2105     savedASig = aSig;
2106     aSig >>= shiftCount;
2107     z = aSig;
2108     if ( aSign ) z = - z;
2109     if ( ( z < 0 ) ^ aSign ) {
2110  invalid:
2111         float_exception_flags |= float_flag_invalid;
2112         return aSign ? 0x80000000 : 0x7FFFFFFF;
2113     }
2114     if ( ( aSig<<shiftCount ) != savedASig ) {
2115         float_exception_flags |= float_flag_inexact;
2116     }
2117     return z;
2118 }
2119
2120 /*
2121 -------------------------------------------------------------------------------
2122 Returns the result of converting the double-precision floating-point value
2123 `a' to the single-precision floating-point format.  The conversion is
2124 performed according to the IEC/IEEE Standard for Binary Floating-point
2125 Arithmetic.
2126 -------------------------------------------------------------------------------
2127 */
2128 float32 float64_to_float32( float64 a )
2129 {
2130     flag aSign;
2131     int16 aExp;
2132     bits64 aSig;
2133     bits32 zSig;
2134
2135     aSig = extractFloat64Frac( a );
2136     aExp = extractFloat64Exp( a );
2137     aSign = extractFloat64Sign( a );
2138     if ( aExp == 0x7FF ) {
2139         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2140         return packFloat32( aSign, 0xFF, 0 );
2141     }
2142     shift64RightJamming( aSig, 22, &aSig );
2143     zSig = aSig;
2144     if ( aExp || zSig ) {
2145         zSig |= 0x40000000;
2146         aExp -= 0x381;
2147     }
2148     return roundAndPackFloat32( aSign, aExp, zSig );
2149
2150 }
2151
2152 #ifdef FLOATX80
2153
2154 /*
2155 -------------------------------------------------------------------------------
2156 Returns the result of converting the double-precision floating-point value
2157 `a' to the extended double-precision floating-point format.  The conversion
2158 is performed according to the IEC/IEEE Standard for Binary Floating-point
2159 Arithmetic.
2160 -------------------------------------------------------------------------------
2161 */
2162 floatx80 float64_to_floatx80( float64 a )
2163 {
2164     flag aSign;
2165     int16 aExp;
2166     bits64 aSig;
2167
2168     aSig = extractFloat64Frac( a );
2169     aExp = extractFloat64Exp( a );
2170     aSign = extractFloat64Sign( a );
2171     if ( aExp == 0x7FF ) {
2172         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2173         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2174     }
2175     if ( aExp == 0 ) {
2176         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2177         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2178     }
2179     return
2180         packFloatx80(
2181             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2182
2183 }
2184
2185 #endif
2186
2187 #ifdef FLOAT128
2188
2189 /*
2190 -------------------------------------------------------------------------------
2191 Returns the result of converting the double-precision floating-point value
2192 `a' to the quadruple-precision floating-point format.  The conversion is
2193 performed according to the IEC/IEEE Standard for Binary Floating-point
2194 Arithmetic.
2195 -------------------------------------------------------------------------------
2196 */
2197 float128 float64_to_float128( float64 a )
2198 {
2199     flag aSign;
2200     int16 aExp;
2201     bits64 aSig, zSig0, zSig1;
2202
2203     aSig = extractFloat64Frac( a );
2204     aExp = extractFloat64Exp( a );
2205     aSign = extractFloat64Sign( a );
2206     if ( aExp == 0x7FF ) {
2207         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2208         return packFloat128( aSign, 0x7FFF, 0, 0 );
2209     }
2210     if ( aExp == 0 ) {
2211         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2212         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2213         --aExp;
2214     }
2215     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2216     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2217
2218 }
2219
2220 #endif
2221
2222 /*
2223 -------------------------------------------------------------------------------
2224 Rounds the double-precision floating-point value `a' to an integer, and
2225 returns the result as a double-precision floating-point value.  The
2226 operation is performed according to the IEC/IEEE Standard for Binary
2227 Floating-point Arithmetic.
2228 -------------------------------------------------------------------------------
2229 */
2230 float64 float64_round_to_int( float64 a )
2231 {
2232     flag aSign;
2233     int16 aExp;
2234     bits64 lastBitMask, roundBitsMask;
2235     int8 roundingMode;
2236     float64 z;
2237
2238     aExp = extractFloat64Exp( a );
2239     if ( 0x433 <= aExp ) {
2240         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2241             return propagateFloat64NaN( a, a );
2242         }
2243         return a;
2244     }
2245     if ( aExp <= 0x3FE ) {
2246         if ( (bits64) ( a<<1 ) == 0 ) return a;
2247         float_exception_flags |= float_flag_inexact;
2248         aSign = extractFloat64Sign( a );
2249         switch ( float_rounding_mode ) {
2250          case float_round_nearest_even:
2251             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2252                 return packFloat64( aSign, 0x3FF, 0 );
2253             }
2254             break;
2255          case float_round_down:
2256             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2257          case float_round_up:
2258             return
2259             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2260         }
2261         return packFloat64( aSign, 0, 0 );
2262     }
2263     lastBitMask = 1;
2264     lastBitMask <<= 0x433 - aExp;
2265     roundBitsMask = lastBitMask - 1;
2266     z = a;
2267     roundingMode = float_rounding_mode;
2268     if ( roundingMode == float_round_nearest_even ) {
2269         z += lastBitMask>>1;
2270         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2271     }
2272     else if ( roundingMode != float_round_to_zero ) {
2273         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2274             z += roundBitsMask;
2275         }
2276     }
2277     z &= ~ roundBitsMask;
2278     if ( z != a ) float_exception_flags |= float_flag_inexact;
2279     return z;
2280
2281 }
2282
2283 /*
2284 -------------------------------------------------------------------------------
2285 Returns the result of adding the absolute values of the double-precision
2286 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
2287 before being returned.  `zSign' is ignored if the result is a NaN.  The
2288 addition is performed according to the IEC/IEEE Standard for Binary
2289 Floating-point Arithmetic.
2290 -------------------------------------------------------------------------------
2291 */
2292 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2293 {
2294     int16 aExp, bExp, zExp;
2295     bits64 aSig, bSig, zSig;
2296     int16 expDiff;
2297
2298     aSig = extractFloat64Frac( a );
2299     aExp = extractFloat64Exp( a );
2300     bSig = extractFloat64Frac( b );
2301     bExp = extractFloat64Exp( b );
2302     expDiff = aExp - bExp;
2303     aSig <<= 9;
2304     bSig <<= 9;
2305     if ( 0 < expDiff ) {
2306         if ( aExp == 0x7FF ) {
2307             if ( aSig ) return propagateFloat64NaN( a, b );
2308             return a;
2309         }
2310         if ( bExp == 0 ) {
2311             --expDiff;
2312         }
2313         else {
2314             bSig |= LIT64( 0x2000000000000000 );
2315         }
2316         shift64RightJamming( bSig, expDiff, &bSig );
2317         zExp = aExp;
2318     }
2319     else if ( expDiff < 0 ) {
2320         if ( bExp == 0x7FF ) {
2321             if ( bSig ) return propagateFloat64NaN( a, b );
2322             return packFloat64( zSign, 0x7FF, 0 );
2323         }
2324         if ( aExp == 0 ) {
2325             ++expDiff;
2326         }
2327         else {
2328             aSig |= LIT64( 0x2000000000000000 );
2329         }
2330         shift64RightJamming( aSig, - expDiff, &aSig );
2331         zExp = bExp;
2332     }
2333     else {
2334         if ( aExp == 0x7FF ) {
2335             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2336             return a;
2337         }
2338         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2339         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2340         zExp = aExp;
2341         goto roundAndPack;
2342     }
2343     aSig |= LIT64( 0x2000000000000000 );
2344     zSig = ( aSig + bSig )<<1;
2345     --zExp;
2346     if ( (sbits64) zSig < 0 ) {
2347         zSig = aSig + bSig;
2348         ++zExp;
2349     }
2350  roundAndPack:
2351     return roundAndPackFloat64( zSign, zExp, zSig );
2352
2353 }
2354
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of subtracting the absolute values of the double-
2358 precision floating-point values `a' and `b'.  If `zSign' is true, the
2359 difference is negated before being returned.  `zSign' is ignored if the
2360 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2361 Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2363 */
2364 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2365 {
2366     int16 aExp, bExp, zExp;
2367     bits64 aSig, bSig, zSig;
2368     int16 expDiff;
2369
2370     aSig = extractFloat64Frac( a );
2371     aExp = extractFloat64Exp( a );
2372     bSig = extractFloat64Frac( b );
2373     bExp = extractFloat64Exp( b );
2374     expDiff = aExp - bExp;
2375     aSig <<= 10;
2376     bSig <<= 10;
2377     if ( 0 < expDiff ) goto aExpBigger;
2378     if ( expDiff < 0 ) goto bExpBigger;
2379     if ( aExp == 0x7FF ) {
2380         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2381         float_raise( float_flag_invalid );
2382         return float64_default_nan;
2383     }
2384     if ( aExp == 0 ) {
2385         aExp = 1;
2386         bExp = 1;
2387     }
2388     if ( bSig < aSig ) goto aBigger;
2389     if ( aSig < bSig ) goto bBigger;
2390     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2391  bExpBigger:
2392     if ( bExp == 0x7FF ) {
2393         if ( bSig ) return propagateFloat64NaN( a, b );
2394         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2395     }
2396     if ( aExp == 0 ) {
2397         ++expDiff;
2398     }
2399     else {
2400         aSig |= LIT64( 0x4000000000000000 );
2401     }
2402     shift64RightJamming( aSig, - expDiff, &aSig );
2403     bSig |= LIT64( 0x4000000000000000 );
2404  bBigger:
2405     zSig = bSig - aSig;
2406     zExp = bExp;
2407     zSign ^= 1;
2408     goto normalizeRoundAndPack;
2409  aExpBigger:
2410     if ( aExp == 0x7FF ) {
2411         if ( aSig ) return propagateFloat64NaN( a, b );
2412         return a;
2413     }
2414     if ( bExp == 0 ) {
2415         --expDiff;
2416     }
2417     else {
2418         bSig |= LIT64( 0x4000000000000000 );
2419     }
2420     shift64RightJamming( bSig, expDiff, &bSig );
2421     aSig |= LIT64( 0x4000000000000000 );
2422  aBigger:
2423     zSig = aSig - bSig;
2424     zExp = aExp;
2425  normalizeRoundAndPack:
2426     --zExp;
2427     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2428
2429 }
2430
2431 /*
2432 -------------------------------------------------------------------------------
2433 Returns the result of adding the double-precision floating-point values `a'
2434 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2435 Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2437 */
2438 float64 float64_add( float64 a, float64 b )
2439 {
2440     flag aSign, bSign;
2441
2442     aSign = extractFloat64Sign( a );
2443     bSign = extractFloat64Sign( b );
2444     if ( aSign == bSign ) {
2445         return addFloat64Sigs( a, b, aSign );
2446     }
2447     else {
2448         return subFloat64Sigs( a, b, aSign );
2449     }
2450
2451 }
2452
2453 /*
2454 -------------------------------------------------------------------------------
2455 Returns the result of subtracting the double-precision floating-point values
2456 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2457 for Binary Floating-point Arithmetic.
2458 -------------------------------------------------------------------------------
2459 */
2460 float64 float64_sub( float64 a, float64 b )
2461 {
2462     flag aSign, bSign;
2463
2464     aSign = extractFloat64Sign( a );
2465     bSign = extractFloat64Sign( b );
2466     if ( aSign == bSign ) {
2467         return subFloat64Sigs( a, b, aSign );
2468     }
2469     else {
2470         return addFloat64Sigs( a, b, aSign );
2471     }
2472
2473 }
2474
2475 /*
2476 -------------------------------------------------------------------------------
2477 Returns the result of multiplying the double-precision floating-point values
2478 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2479 for Binary Floating-point Arithmetic.
2480 -------------------------------------------------------------------------------
2481 */
2482 float64 float64_mul( float64 a, float64 b )
2483 {
2484     flag aSign, bSign, zSign;
2485     int16 aExp, bExp, zExp;
2486     bits64 aSig, bSig, zSig0, zSig1;
2487
2488     aSig = extractFloat64Frac( a );
2489     aExp = extractFloat64Exp( a );
2490     aSign = extractFloat64Sign( a );
2491     bSig = extractFloat64Frac( b );
2492     bExp = extractFloat64Exp( b );
2493     bSign = extractFloat64Sign( b );
2494     zSign = aSign ^ bSign;
2495     if ( aExp == 0x7FF ) {
2496         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2497             return propagateFloat64NaN( a, b );
2498         }
2499         if ( ( bExp | bSig ) == 0 ) {
2500             float_raise( float_flag_invalid );
2501             return float64_default_nan;
2502         }
2503         return packFloat64( zSign, 0x7FF, 0 );
2504     }
2505     if ( bExp == 0x7FF ) {
2506         if ( bSig ) return propagateFloat64NaN( a, b );
2507         if ( ( aExp | aSig ) == 0 ) {
2508             float_raise( float_flag_invalid );
2509             return float64_default_nan;
2510         }
2511         return packFloat64( zSign, 0x7FF, 0 );
2512     }
2513     if ( aExp == 0 ) {
2514         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2515         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2516     }
2517     if ( bExp == 0 ) {
2518         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2519         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2520     }
2521     zExp = aExp + bExp - 0x3FF;
2522     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2523     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2524     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2525     zSig0 |= ( zSig1 != 0 );
2526     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2527         zSig0 <<= 1;
2528         --zExp;
2529     }
2530     return roundAndPackFloat64( zSign, zExp, zSig0 );
2531
2532 }
2533
2534 /*
2535 -------------------------------------------------------------------------------
2536 Returns the result of dividing the double-precision floating-point value `a'
2537 by the corresponding value `b'.  The operation is performed according to
2538 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2539 -------------------------------------------------------------------------------
2540 */
2541 float64 float64_div( float64 a, float64 b )
2542 {
2543     flag aSign, bSign, zSign;
2544     int16 aExp, bExp, zExp;
2545     bits64 aSig, bSig, zSig;
2546     bits64 rem0, rem1;
2547     bits64 term0, term1;
2548
2549     aSig = extractFloat64Frac( a );
2550     aExp = extractFloat64Exp( a );
2551     aSign = extractFloat64Sign( a );
2552     bSig = extractFloat64Frac( b );
2553     bExp = extractFloat64Exp( b );
2554     bSign = extractFloat64Sign( b );
2555     zSign = aSign ^ bSign;
2556     if ( aExp == 0x7FF ) {
2557         if ( aSig ) return propagateFloat64NaN( a, b );
2558         if ( bExp == 0x7FF ) {
2559             if ( bSig ) return propagateFloat64NaN( a, b );
2560             float_raise( float_flag_invalid );
2561             return float64_default_nan;
2562         }
2563         return packFloat64( zSign, 0x7FF, 0 );
2564     }
2565     if ( bExp == 0x7FF ) {
2566         if ( bSig ) return propagateFloat64NaN( a, b );
2567         return packFloat64( zSign, 0, 0 );
2568     }
2569     if ( bExp == 0 ) {
2570         if ( bSig == 0 ) {
2571             if ( ( aExp | aSig ) == 0 ) {
2572                 float_raise( float_flag_invalid );
2573                 return float64_default_nan;
2574             }
2575             float_raise( float_flag_divbyzero );
2576             return packFloat64( zSign, 0x7FF, 0 );
2577         }
2578         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2579     }
2580     if ( aExp == 0 ) {
2581         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2582         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2583     }
2584     zExp = aExp - bExp + 0x3FD;
2585     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2586     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2587     if ( bSig <= ( aSig + aSig ) ) {
2588         aSig >>= 1;
2589         ++zExp;
2590     }
2591     zSig = estimateDiv128To64( aSig, 0, bSig );
2592     if ( ( zSig & 0x1FF ) <= 2 ) {
2593         mul64To128( bSig, zSig, &term0, &term1 );
2594         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2595         while ( (sbits64) rem0 < 0 ) {
2596             --zSig;
2597             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2598         }
2599         zSig |= ( rem1 != 0 );
2600     }
2601     return roundAndPackFloat64( zSign, zExp, zSig );
2602
2603 }
2604
2605 /*
2606 -------------------------------------------------------------------------------
2607 Returns the remainder of the double-precision floating-point value `a'
2608 with respect to the corresponding value `b'.  The operation is performed
2609 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2611 */
2612 float64 float64_rem( float64 a, float64 b )
2613 {
2614     flag aSign, bSign, zSign;
2615     int16 aExp, bExp, expDiff;
2616     bits64 aSig, bSig;
2617     bits64 q, alternateASig;
2618     sbits64 sigMean;
2619
2620     aSig = extractFloat64Frac( a );
2621     aExp = extractFloat64Exp( a );
2622     aSign = extractFloat64Sign( a );
2623     bSig = extractFloat64Frac( b );
2624     bExp = extractFloat64Exp( b );
2625     bSign = extractFloat64Sign( b );
2626     if ( aExp == 0x7FF ) {
2627         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2628             return propagateFloat64NaN( a, b );
2629         }
2630         float_raise( float_flag_invalid );
2631         return float64_default_nan;
2632     }
2633     if ( bExp == 0x7FF ) {
2634         if ( bSig ) return propagateFloat64NaN( a, b );
2635         return a;
2636     }
2637     if ( bExp == 0 ) {
2638         if ( bSig == 0 ) {
2639             float_raise( float_flag_invalid );
2640             return float64_default_nan;
2641         }
2642         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2643     }
2644     if ( aExp == 0 ) {
2645         if ( aSig == 0 ) return a;
2646         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2647     }
2648     expDiff = aExp - bExp;
2649     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2650     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2651     if ( expDiff < 0 ) {
2652         if ( expDiff < -1 ) return a;
2653         aSig >>= 1;
2654     }
2655     q = ( bSig <= aSig );
2656     if ( q ) aSig -= bSig;
2657     expDiff -= 64;
2658     while ( 0 < expDiff ) {
2659         q = estimateDiv128To64( aSig, 0, bSig );
2660         q = ( 2 < q ) ? q - 2 : 0;
2661         aSig = - ( ( bSig>>2 ) * q );
2662         expDiff -= 62;
2663     }
2664     expDiff += 64;
2665     if ( 0 < expDiff ) {
2666         q = estimateDiv128To64( aSig, 0, bSig );
2667         q = ( 2 < q ) ? q - 2 : 0;
2668         q >>= 64 - expDiff;
2669         bSig >>= 2;
2670         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2671     }
2672     else {
2673         aSig >>= 2;
2674         bSig >>= 2;
2675     }
2676     do {
2677         alternateASig = aSig;
2678         ++q;
2679         aSig -= bSig;
2680     } while ( 0 <= (sbits64) aSig );
2681     sigMean = aSig + alternateASig;
2682     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2683         aSig = alternateASig;
2684     }
2685     zSign = ( (sbits64) aSig < 0 );
2686     if ( zSign ) aSig = - aSig;
2687     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2688
2689 }
2690
2691 /*
2692 -------------------------------------------------------------------------------
2693 Returns the square root of the double-precision floating-point value `a'.
2694 The operation is performed according to the IEC/IEEE Standard for Binary
2695 Floating-point Arithmetic.
2696 -------------------------------------------------------------------------------
2697 */
2698 float64 float64_sqrt( float64 a )
2699 {
2700     flag aSign;
2701     int16 aExp, zExp;
2702     bits64 aSig, zSig;
2703     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2704     //float64 z;
2705
2706     aSig = extractFloat64Frac( a );
2707     aExp = extractFloat64Exp( a );
2708     aSign = extractFloat64Sign( a );
2709     if ( aExp == 0x7FF ) {
2710         if ( aSig ) return propagateFloat64NaN( a, a );
2711         if ( ! aSign ) return a;
2712         float_raise( float_flag_invalid );
2713         return float64_default_nan;
2714     }
2715     if ( aSign ) {
2716         if ( ( aExp | aSig ) == 0 ) return a;
2717         float_raise( float_flag_invalid );
2718         return float64_default_nan;
2719     }
2720     if ( aExp == 0 ) {
2721         if ( aSig == 0 ) return 0;
2722         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2723     }
2724     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2725     aSig |= LIT64( 0x0010000000000000 );
2726     zSig = estimateSqrt32( aExp, aSig>>21 );
2727     zSig <<= 31;
2728     aSig <<= 9 - ( aExp & 1 );
2729     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2730     if ( ( zSig & 0x3FF ) <= 5 ) {
2731         if ( zSig < 2 ) {
2732             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2733         }
2734         else {
2735             aSig <<= 2;
2736             mul64To128( zSig, zSig, &term0, &term1 );
2737             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2738             while ( (sbits64) rem0 < 0 ) {
2739                 --zSig;
2740                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2741                 term1 |= 1;
2742                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2743             }
2744             zSig |= ( ( rem0 | rem1 ) != 0 );
2745         }
2746     }
2747     shift64RightJamming( zSig, 1, &zSig );
2748     return roundAndPackFloat64( 0, zExp, zSig );
2749
2750 }
2751
2752 /*
2753 -------------------------------------------------------------------------------
2754 Returns 1 if the double-precision floating-point value `a' is equal to the
2755 corresponding value `b', and 0 otherwise.  The comparison is performed
2756 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2757 -------------------------------------------------------------------------------
2758 */
2759 flag float64_eq( float64 a, float64 b )
2760 {
2761
2762     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2763          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2764        ) {
2765         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2766             float_raise( float_flag_invalid );
2767         }
2768         return 0;
2769     }
2770     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2771
2772 }
2773
2774 /*
2775 -------------------------------------------------------------------------------
2776 Returns 1 if the double-precision floating-point value `a' is less than or
2777 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2778 performed according to the IEC/IEEE Standard for Binary Floating-point
2779 Arithmetic.
2780 -------------------------------------------------------------------------------
2781 */
2782 flag float64_le( float64 a, float64 b )
2783 {
2784     flag aSign, bSign;
2785
2786     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2787          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2788        ) {
2789         float_raise( float_flag_invalid );
2790         return 0;
2791     }
2792     aSign = extractFloat64Sign( a );
2793     bSign = extractFloat64Sign( b );
2794     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2795     return ( a == b ) || ( aSign ^ ( a < b ) );
2796
2797 }
2798
2799 /*
2800 -------------------------------------------------------------------------------
2801 Returns 1 if the double-precision floating-point value `a' is less than
2802 the corresponding value `b', and 0 otherwise.  The comparison is performed
2803 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2804 -------------------------------------------------------------------------------
2805 */
2806 flag float64_lt( float64 a, float64 b )
2807 {
2808     flag aSign, bSign;
2809
2810     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2811          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2812        ) {
2813         float_raise( float_flag_invalid );
2814         return 0;
2815     }
2816     aSign = extractFloat64Sign( a );
2817     bSign = extractFloat64Sign( b );
2818     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2819     return ( a != b ) && ( aSign ^ ( a < b ) );
2820
2821 }
2822
2823 /*
2824 -------------------------------------------------------------------------------
2825 Returns 1 if the double-precision floating-point value `a' is equal to the
2826 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2827 if either operand is a NaN.  Otherwise, the comparison is performed
2828 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2829 -------------------------------------------------------------------------------
2830 */
2831 flag float64_eq_signaling( float64 a, float64 b )
2832 {
2833
2834     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2835          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2836        ) {
2837         float_raise( float_flag_invalid );
2838         return 0;
2839     }
2840     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2841
2842 }
2843
2844 /*
2845 -------------------------------------------------------------------------------
2846 Returns 1 if the double-precision floating-point value `a' is less than or
2847 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2848 cause an exception.  Otherwise, the comparison is performed according to the
2849 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2850 -------------------------------------------------------------------------------
2851 */
2852 flag float64_le_quiet( float64 a, float64 b )
2853 {
2854     flag aSign, bSign;
2855     //int16 aExp, bExp;
2856
2857     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2858          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2859        ) {
2860         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2861             float_raise( float_flag_invalid );
2862         }
2863         return 0;
2864     }
2865     aSign = extractFloat64Sign( a );
2866     bSign = extractFloat64Sign( b );
2867     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2868     return ( a == b ) || ( aSign ^ ( a < b ) );
2869
2870 }
2871
2872 /*
2873 -------------------------------------------------------------------------------
2874 Returns 1 if the double-precision floating-point value `a' is less than
2875 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2876 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2879 */
2880 flag float64_lt_quiet( float64 a, float64 b )
2881 {
2882     flag aSign, bSign;
2883
2884     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2885          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2886        ) {
2887         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2888             float_raise( float_flag_invalid );
2889         }
2890         return 0;
2891     }
2892     aSign = extractFloat64Sign( a );
2893     bSign = extractFloat64Sign( b );
2894     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2895     return ( a != b ) && ( aSign ^ ( a < b ) );
2896
2897 }
2898
2899 #ifdef FLOATX80
2900
2901 /*
2902 -------------------------------------------------------------------------------
2903 Returns the result of converting the extended double-precision floating-
2904 point value `a' to the 32-bit two's complement integer format.  The
2905 conversion is performed according to the IEC/IEEE Standard for Binary
2906 Floating-point Arithmetic---which means in particular that the conversion
2907 is rounded according to the current rounding mode.  If `a' is a NaN, the
2908 largest positive integer is returned.  Otherwise, if the conversion
2909 overflows, the largest integer with the same sign as `a' is returned.
2910 -------------------------------------------------------------------------------
2911 */
2912 int32 floatx80_to_int32( floatx80 a )
2913 {
2914     flag aSign;
2915     int32 aExp, shiftCount;
2916     bits64 aSig;
2917
2918     aSig = extractFloatx80Frac( a );
2919     aExp = extractFloatx80Exp( a );
2920     aSign = extractFloatx80Sign( a );
2921     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2922     shiftCount = 0x4037 - aExp;
2923     if ( shiftCount <= 0 ) shiftCount = 1;
2924     shift64RightJamming( aSig, shiftCount, &aSig );
2925     return roundAndPackInt32( aSign, aSig );
2926
2927 }
2928
2929 /*
2930 -------------------------------------------------------------------------------
2931 Returns the result of converting the extended double-precision floating-
2932 point value `a' to the 32-bit two's complement integer format.  The
2933 conversion is performed according to the IEC/IEEE Standard for Binary
2934 Floating-point Arithmetic, except that the conversion is always rounded
2935 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2936 Otherwise, if the conversion overflows, the largest integer with the same
2937 sign as `a' is returned.
2938 -------------------------------------------------------------------------------
2939 */
2940 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2941 {
2942     flag aSign;
2943     int32 aExp, shiftCount;
2944     bits64 aSig, savedASig;
2945     int32 z;
2946
2947     aSig = extractFloatx80Frac( a );
2948     aExp = extractFloatx80Exp( a );
2949     aSign = extractFloatx80Sign( a );
2950     shiftCount = 0x403E - aExp;
2951     if ( shiftCount < 32 ) {
2952         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2953         goto invalid;
2954     }
2955     else if ( 63 < shiftCount ) {
2956         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2957         return 0;
2958     }
2959     savedASig = aSig;
2960     aSig >>= shiftCount;
2961     z = aSig;
2962     if ( aSign ) z = - z;
2963     if ( ( z < 0 ) ^ aSign ) {
2964  invalid:
2965         float_exception_flags |= float_flag_invalid;
2966         return aSign ? 0x80000000 : 0x7FFFFFFF;
2967     }
2968     if ( ( aSig<<shiftCount ) != savedASig ) {
2969         float_exception_flags |= float_flag_inexact;
2970     }
2971     return z;
2972
2973 }
2974
2975 /*
2976 -------------------------------------------------------------------------------
2977 Returns the result of converting the extended double-precision floating-
2978 point value `a' to the single-precision floating-point format.  The
2979 conversion is performed according to the IEC/IEEE Standard for Binary
2980 Floating-point Arithmetic.
2981 -------------------------------------------------------------------------------
2982 */
2983 float32 floatx80_to_float32( floatx80 a )
2984 {
2985     flag aSign;
2986     int32 aExp;
2987     bits64 aSig;
2988
2989     aSig = extractFloatx80Frac( a );
2990     aExp = extractFloatx80Exp( a );
2991     aSign = extractFloatx80Sign( a );
2992     if ( aExp == 0x7FFF ) {
2993         if ( (bits64) ( aSig<<1 ) ) {
2994             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2995         }
2996         return packFloat32( aSign, 0xFF, 0 );
2997     }
2998     shift64RightJamming( aSig, 33, &aSig );
2999     if ( aExp || aSig ) aExp -= 0x3F81;
3000     return roundAndPackFloat32( aSign, aExp, aSig );
3001
3002 }
3003
3004 /*
3005 -------------------------------------------------------------------------------
3006 Returns the result of converting the extended double-precision floating-
3007 point value `a' to the double-precision floating-point format.  The
3008 conversion is performed according to the IEC/IEEE Standard for Binary
3009 Floating-point Arithmetic.
3010 -------------------------------------------------------------------------------
3011 */
3012 float64 floatx80_to_float64( floatx80 a )
3013 {
3014     flag aSign;
3015     int32 aExp;
3016     bits64 aSig, zSig;
3017
3018     aSig = extractFloatx80Frac( a );
3019     aExp = extractFloatx80Exp( a );
3020     aSign = extractFloatx80Sign( a );
3021     if ( aExp == 0x7FFF ) {
3022         if ( (bits64) ( aSig<<1 ) ) {
3023             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3024         }
3025         return packFloat64( aSign, 0x7FF, 0 );
3026     }
3027     shift64RightJamming( aSig, 1, &zSig );
3028     if ( aExp || aSig ) aExp -= 0x3C01;
3029     return roundAndPackFloat64( aSign, aExp, zSig );
3030
3031 }
3032
3033 #ifdef FLOAT128
3034
3035 /*
3036 -------------------------------------------------------------------------------
3037 Returns the result of converting the extended double-precision floating-
3038 point value `a' to the quadruple-precision floating-point format.  The
3039 conversion is performed according to the IEC/IEEE Standard for Binary
3040 Floating-point Arithmetic.
3041 -------------------------------------------------------------------------------
3042 */
3043 float128 floatx80_to_float128( floatx80 a )
3044 {
3045     flag aSign;
3046     int16 aExp;
3047     bits64 aSig, zSig0, zSig1;
3048
3049     aSig = extractFloatx80Frac( a );
3050     aExp = extractFloatx80Exp( a );
3051     aSign = extractFloatx80Sign( a );
3052     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3053         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3054     }
3055     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3056     return packFloat128( aSign, aExp, zSig0, zSig1 );
3057
3058 }
3059
3060 #endif
3061
3062 /*
3063 -------------------------------------------------------------------------------
3064 Rounds the extended double-precision floating-point value `a' to an integer,
3065 and returns the result as an extended quadruple-precision floating-point
3066 value.  The operation is performed according to the IEC/IEEE Standard for
3067 Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3069 */
3070 floatx80 floatx80_round_to_int( floatx80 a )
3071 {
3072     flag aSign;
3073     int32 aExp;
3074     bits64 lastBitMask, roundBitsMask;
3075     int8 roundingMode;
3076     floatx80 z;
3077
3078     aExp = extractFloatx80Exp( a );
3079     if ( 0x403E <= aExp ) {
3080         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3081             return propagateFloatx80NaN( a, a );
3082         }
3083         return a;
3084     }
3085     if ( aExp <= 0x3FFE ) {
3086         if (    ( aExp == 0 )
3087              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3088             return a;
3089         }
3090         float_exception_flags |= float_flag_inexact;
3091         aSign = extractFloatx80Sign( a );
3092         switch ( float_rounding_mode ) {
3093          case float_round_nearest_even:
3094             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3095                ) {
3096                 return
3097                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3098             }
3099             break;
3100          case float_round_down:
3101             return
3102                   aSign ?
3103                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3104                 : packFloatx80( 0, 0, 0 );
3105          case float_round_up:
3106             return
3107                   aSign ? packFloatx80( 1, 0, 0 )
3108                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3109         }
3110         return packFloatx80( aSign, 0, 0 );
3111     }
3112     lastBitMask = 1;
3113     lastBitMask <<= 0x403E - aExp;
3114     roundBitsMask = lastBitMask - 1;
3115     z = a;
3116     roundingMode = float_rounding_mode;
3117     if ( roundingMode == float_round_nearest_even ) {
3118         z.low += lastBitMask>>1;
3119         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3120     }
3121     else if ( roundingMode != float_round_to_zero ) {
3122         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3123             z.low += roundBitsMask;
3124         }
3125     }
3126     z.low &= ~ roundBitsMask;
3127     if ( z.low == 0 ) {
3128         ++z.high;
3129         z.low = LIT64( 0x8000000000000000 );
3130     }
3131     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3132     return z;
3133
3134 }
3135
3136 /*
3137 -------------------------------------------------------------------------------
3138 Returns the result of adding the absolute values of the extended double-
3139 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
3140 negated before being returned.  `zSign' is ignored if the result is a NaN.
3141 The addition is performed according to the IEC/IEEE Standard for Binary
3142 Floating-point Arithmetic.
3143 -------------------------------------------------------------------------------
3144 */
3145 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3146 {
3147     int32 aExp, bExp, zExp;
3148     bits64 aSig, bSig, zSig0, zSig1;
3149     int32 expDiff;
3150
3151     aSig = extractFloatx80Frac( a );
3152     aExp = extractFloatx80Exp( a );
3153     bSig = extractFloatx80Frac( b );
3154     bExp = extractFloatx80Exp( b );
3155     expDiff = aExp - bExp;
3156     if ( 0 < expDiff ) {
3157         if ( aExp == 0x7FFF ) {
3158             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3159             return a;
3160         }
3161         if ( bExp == 0 ) --expDiff;
3162         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3163         zExp = aExp;
3164     }
3165     else if ( expDiff < 0 ) {
3166         if ( bExp == 0x7FFF ) {
3167             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3168             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3169         }
3170         if ( aExp == 0 ) ++expDiff;
3171         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3172         zExp = bExp;
3173     }
3174     else {
3175         if ( aExp == 0x7FFF ) {
3176             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3177                 return propagateFloatx80NaN( a, b );
3178             }
3179             return a;
3180         }
3181         zSig1 = 0;
3182         zSig0 = aSig + bSig;
3183         if ( aExp == 0 ) {
3184             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3185             goto roundAndPack;
3186         }
3187         zExp = aExp;
3188         goto shiftRight1;
3189     }
3190
3191     zSig0 = aSig + bSig;
3192
3193     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3194  shiftRight1:
3195     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3196     zSig0 |= LIT64( 0x8000000000000000 );
3197     ++zExp;
3198  roundAndPack:
3199     return
3200         roundAndPackFloatx80(
3201             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3202
3203 }
3204
3205 /*
3206 -------------------------------------------------------------------------------
3207 Returns the result of subtracting the absolute values of the extended
3208 double-precision floating-point values `a' and `b'.  If `zSign' is true,
3209 the difference is negated before being returned.  `zSign' is ignored if the
3210 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3211 Standard for Binary Floating-point Arithmetic.
3212 -------------------------------------------------------------------------------
3213 */
3214 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3215 {
3216     int32 aExp, bExp, zExp;
3217     bits64 aSig, bSig, zSig0, zSig1;
3218     int32 expDiff;
3219     floatx80 z;
3220
3221     aSig = extractFloatx80Frac( a );
3222     aExp = extractFloatx80Exp( a );
3223     bSig = extractFloatx80Frac( b );
3224     bExp = extractFloatx80Exp( b );
3225     expDiff = aExp - bExp;
3226     if ( 0 < expDiff ) goto aExpBigger;
3227     if ( expDiff < 0 ) goto bExpBigger;
3228     if ( aExp == 0x7FFF ) {
3229         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3230             return propagateFloatx80NaN( a, b );
3231         }
3232         float_raise( float_flag_invalid );
3233         z.low = floatx80_default_nan_low;
3234         z.high = floatx80_default_nan_high;
3235         return z;
3236     }
3237     if ( aExp == 0 ) {
3238         aExp = 1;
3239         bExp = 1;
3240     }
3241     zSig1 = 0;
3242     if ( bSig < aSig ) goto aBigger;
3243     if ( aSig < bSig ) goto bBigger;
3244     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3245  bExpBigger:
3246     if ( bExp == 0x7FFF ) {
3247         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3248         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3249     }
3250     if ( aExp == 0 ) ++expDiff;
3251     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3252  bBigger:
3253     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3254     zExp = bExp;
3255     zSign ^= 1;
3256     goto normalizeRoundAndPack;
3257  aExpBigger:
3258     if ( aExp == 0x7FFF ) {
3259         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3260         return a;
3261     }
3262     if ( bExp == 0 ) --expDiff;
3263     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3264  aBigger:
3265     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3266     zExp = aExp;
3267  normalizeRoundAndPack:
3268     return
3269         normalizeRoundAndPackFloatx80(
3270             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3271
3272 }
3273
3274 /*
3275 -------------------------------------------------------------------------------
3276 Returns the result of adding the extended double-precision floating-point
3277 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3278 Standard for Binary Floating-point Arithmetic.
3279 -------------------------------------------------------------------------------
3280 */
3281 floatx80 floatx80_add( floatx80 a, floatx80 b )
3282 {
3283     flag aSign, bSign;
3284
3285     aSign = extractFloatx80Sign( a );
3286     bSign = extractFloatx80Sign( b );
3287     if ( aSign == bSign ) {
3288         return addFloatx80Sigs( a, b, aSign );
3289     }
3290     else {
3291         return subFloatx80Sigs( a, b, aSign );
3292     }
3293
3294 }
3295
3296 /*
3297 -------------------------------------------------------------------------------
3298 Returns the result of subtracting the extended double-precision floating-
3299 point values `a' and `b'.  The operation is performed according to the
3300 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3301 -------------------------------------------------------------------------------
3302 */
3303 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3304 {
3305     flag aSign, bSign;
3306
3307     aSign = extractFloatx80Sign( a );
3308     bSign = extractFloatx80Sign( b );
3309     if ( aSign == bSign ) {
3310         return subFloatx80Sigs( a, b, aSign );
3311     }
3312     else {
3313         return addFloatx80Sigs( a, b, aSign );
3314     }
3315
3316 }
3317
3318 /*
3319 -------------------------------------------------------------------------------
3320 Returns the result of multiplying the extended double-precision floating-
3321 point values `a' and `b'.  The operation is performed according to the
3322 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3323 -------------------------------------------------------------------------------
3324 */
3325 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3326 {
3327     flag aSign, bSign, zSign;
3328     int32 aExp, bExp, zExp;
3329     bits64 aSig, bSig, zSig0, zSig1;
3330     floatx80 z;
3331
3332     aSig = extractFloatx80Frac( a );
3333     aExp = extractFloatx80Exp( a );
3334     aSign = extractFloatx80Sign( a );
3335     bSig = extractFloatx80Frac( b );
3336     bExp = extractFloatx80Exp( b );
3337     bSign = extractFloatx80Sign( b );
3338     zSign = aSign ^ bSign;
3339     if ( aExp == 0x7FFF ) {
3340         if (    (bits64) ( aSig<<1 )
3341              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3342             return propagateFloatx80NaN( a, b );
3343         }
3344         if ( ( bExp | bSig ) == 0 ) goto invalid;
3345         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3346     }
3347     if ( bExp == 0x7FFF ) {
3348         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3349         if ( ( aExp | aSig ) == 0 ) {
3350  invalid:
3351             float_raise( float_flag_invalid );
3352             z.low = floatx80_default_nan_low;
3353             z.high = floatx80_default_nan_high;
3354             return z;
3355         }
3356         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3357     }
3358     if ( aExp == 0 ) {
3359         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3360         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3361     }
3362     if ( bExp == 0 ) {
3363         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3364         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3365     }
3366     zExp = aExp + bExp - 0x3FFE;
3367     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3368     if ( 0 < (sbits64) zSig0 ) {
3369         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3370         --zExp;
3371     }
3372     return
3373         roundAndPackFloatx80(
3374             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3375
3376 }
3377
3378 /*
3379 -------------------------------------------------------------------------------
3380 Returns the result of dividing the extended double-precision floating-point
3381 value `a' by the corresponding value `b'.  The operation is performed
3382 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3383 -------------------------------------------------------------------------------
3384 */
3385 floatx80 floatx80_div( floatx80 a, floatx80 b )
3386 {
3387     flag aSign, bSign, zSign;
3388     int32 aExp, bExp, zExp;
3389     bits64 aSig, bSig, zSig0, zSig1;
3390     bits64 rem0, rem1, rem2, term0, term1, term2;
3391     floatx80 z;
3392
3393     aSig = extractFloatx80Frac( a );
3394     aExp = extractFloatx80Exp( a );
3395     aSign = extractFloatx80Sign( a );
3396     bSig = extractFloatx80Frac( b );
3397     bExp = extractFloatx80Exp( b );
3398     bSign = extractFloatx80Sign( b );
3399     zSign = aSign ^ bSign;
3400     if ( aExp == 0x7FFF ) {
3401         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3402         if ( bExp == 0x7FFF ) {
3403             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3404             goto invalid;
3405         }
3406         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3407     }
3408     if ( bExp == 0x7FFF ) {
3409         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3410         return packFloatx80( zSign, 0, 0 );
3411     }
3412     if ( bExp == 0 ) {
3413         if ( bSig == 0 ) {
3414             if ( ( aExp | aSig ) == 0 ) {
3415  invalid:
3416                 float_raise( float_flag_invalid );
3417                 z.low = floatx80_default_nan_low;
3418                 z.high = floatx80_default_nan_high;
3419                 return z;
3420             }
3421             float_raise( float_flag_divbyzero );
3422             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3423         }
3424         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3425     }
3426     if ( aExp == 0 ) {
3427         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3428         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3429     }
3430     zExp = aExp - bExp + 0x3FFE;
3431     rem1 = 0;
3432     if ( bSig <= aSig ) {
3433         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3434         ++zExp;
3435     }
3436     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3437     mul64To128( bSig, zSig0, &term0, &term1 );
3438     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3439     while ( (sbits64) rem0 < 0 ) {
3440         --zSig0;
3441         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3442     }
3443     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3444     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3445         mul64To128( bSig, zSig1, &term1, &term2 );
3446         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3447         while ( (sbits64) rem1 < 0 ) {
3448             --zSig1;
3449             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3450         }
3451         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3452     }
3453     return
3454         roundAndPackFloatx80(
3455             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3456
3457 }
3458
3459 /*
3460 -------------------------------------------------------------------------------
3461 Returns the remainder of the extended double-precision floating-point value
3462 `a' with respect to the corresponding value `b'.  The operation is performed
3463 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3464 -------------------------------------------------------------------------------
3465 */
3466 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3467 {
3468     flag aSign, bSign, zSign;
3469     int32 aExp, bExp, expDiff;
3470     bits64 aSig0, aSig1, bSig;
3471     bits64 q, term0, term1, alternateASig0, alternateASig1;
3472     floatx80 z;
3473
3474     aSig0 = extractFloatx80Frac( a );
3475     aExp = extractFloatx80Exp( a );
3476     aSign = extractFloatx80Sign( a );
3477     bSig = extractFloatx80Frac( b );
3478     bExp = extractFloatx80Exp( b );
3479     bSign = extractFloatx80Sign( b );
3480     if ( aExp == 0x7FFF ) {
3481         if (    (bits64) ( aSig0<<1 )
3482              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3483             return propagateFloatx80NaN( a, b );
3484         }
3485         goto invalid;
3486     }
3487     if ( bExp == 0x7FFF ) {
3488         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3489         return a;
3490     }
3491     if ( bExp == 0 ) {
3492         if ( bSig == 0 ) {
3493  invalid:
3494             float_raise( float_flag_invalid );
3495             z.low = floatx80_default_nan_low;
3496             z.high = floatx80_default_nan_high;
3497             return z;
3498         }
3499         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3500     }
3501     if ( aExp == 0 ) {
3502         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3503         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3504     }
3505     bSig |= LIT64( 0x8000000000000000 );
3506     zSign = aSign;
3507     expDiff = aExp - bExp;
3508     aSig1 = 0;
3509     if ( expDiff < 0 ) {
3510         if ( expDiff < -1 ) return a;
3511         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3512         expDiff = 0;
3513     }
3514     q = ( bSig <= aSig0 );
3515     if ( q ) aSig0 -= bSig;
3516     expDiff -= 64;
3517     while ( 0 < expDiff ) {
3518         q = estimateDiv128To64( aSig0, aSig1, bSig );
3519         q = ( 2 < q ) ? q - 2 : 0;
3520         mul64To128( bSig, q, &term0, &term1 );
3521         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3522         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3523         expDiff -= 62;
3524     }
3525     expDiff += 64;
3526     if ( 0 < expDiff ) {
3527         q = estimateDiv128To64( aSig0, aSig1, bSig );
3528         q = ( 2 < q ) ? q - 2 : 0;
3529         q >>= 64 - expDiff;
3530         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3531         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3532         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3533         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3534             ++q;
3535             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3536         }
3537     }
3538     else {
3539         term1 = 0;
3540         term0 = bSig;
3541     }
3542     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3543     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3544          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3545               && ( q & 1 ) )
3546        ) {
3547         aSig0 = alternateASig0;
3548         aSig1 = alternateASig1;
3549         zSign = ! zSign;
3550     }
3551     return
3552         normalizeRoundAndPackFloatx80(
3553             80, zSign, bExp + expDiff, aSig0, aSig1 );
3554
3555 }
3556
3557 /*
3558 -------------------------------------------------------------------------------
3559 Returns the square root of the extended double-precision floating-point
3560 value `a'.  The operation is performed according to the IEC/IEEE Standard
3561 for Binary Floating-point Arithmetic.
3562 -------------------------------------------------------------------------------
3563 */
3564 floatx80 floatx80_sqrt( floatx80 a )
3565 {
3566     flag aSign;
3567     int32 aExp, zExp;
3568     bits64 aSig0, aSig1, zSig0, zSig1;
3569     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3570     bits64 shiftedRem0, shiftedRem1;
3571     floatx80 z;
3572
3573     aSig0 = extractFloatx80Frac( a );
3574     aExp = extractFloatx80Exp( a );
3575     aSign = extractFloatx80Sign( a );
3576     if ( aExp == 0x7FFF ) {
3577         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3578         if ( ! aSign ) return a;
3579         goto invalid;
3580     }
3581     if ( aSign ) {
3582         if ( ( aExp | aSig0 ) == 0 ) return a;
3583  invalid:
3584         float_raise( float_flag_invalid );
3585         z.low = floatx80_default_nan_low;
3586         z.high = floatx80_default_nan_high;
3587         return z;
3588     }
3589     if ( aExp == 0 ) {
3590         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3591         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3592     }
3593     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3594     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3595     zSig0 <<= 31;
3596     aSig1 = 0;
3597     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3598     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3599     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3600     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3601     mul64To128( zSig0, zSig0, &term0, &term1 );
3602     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3603     while ( (sbits64) rem0 < 0 ) {
3604         --zSig0;
3605         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3606         term1 |= 1;
3607         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3608     }
3609     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3610     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3611     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3612         if ( zSig1 == 0 ) zSig1 = 1;
3613         mul64To128( zSig0, zSig1, &term1, &term2 );
3614         shortShift128Left( term1, term2, 1, &term1, &term2 );
3615         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3616         mul64To128( zSig1, zSig1, &term2, &term3 );
3617         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3618         while ( (sbits64) rem1 < 0 ) {
3619             --zSig1;
3620             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3621             term3 |= 1;
3622             add192(
3623                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3624         }
3625         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3626     }
3627     return
3628         roundAndPackFloatx80(
3629             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3630
3631 }
3632
3633 /*
3634 -------------------------------------------------------------------------------
3635 Returns 1 if the extended double-precision floating-point value `a' is
3636 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3637 performed according to the IEC/IEEE Standard for Binary Floating-point
3638 Arithmetic.
3639 -------------------------------------------------------------------------------
3640 */
3641 flag floatx80_eq( floatx80 a, floatx80 b )
3642 {
3643
3644     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3645               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3646          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3647               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3648        ) {
3649         if (    floatx80_is_signaling_nan( a )
3650              || floatx80_is_signaling_nan( b ) ) {
3651             float_raise( float_flag_invalid );
3652         }
3653         return 0;
3654     }
3655     return
3656            ( a.low == b.low )
3657         && (    ( a.high == b.high )
3658              || (    ( a.low == 0 )
3659                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3660            );
3661
3662 }
3663
3664 /*
3665 -------------------------------------------------------------------------------
3666 Returns 1 if the extended double-precision floating-point value `a' is
3667 less than or equal to the corresponding value `b', and 0 otherwise.  The
3668 comparison is performed according to the IEC/IEEE Standard for Binary
3669 Floating-point Arithmetic.
3670 -------------------------------------------------------------------------------
3671 */
3672 flag floatx80_le( floatx80 a, floatx80 b )
3673 {
3674     flag aSign, bSign;
3675
3676     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3677               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3678          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3679               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3680        ) {
3681         float_raise( float_flag_invalid );
3682         return 0;
3683     }
3684     aSign = extractFloatx80Sign( a );
3685     bSign = extractFloatx80Sign( b );
3686     if ( aSign != bSign ) {
3687         return
3688                aSign
3689             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3690                  == 0 );
3691     }
3692     return
3693           aSign ? le128( b.high, b.low, a.high, a.low )
3694         : le128( a.high, a.low, b.high, b.low );
3695
3696 }
3697
3698 /*
3699 -------------------------------------------------------------------------------
3700 Returns 1 if the extended double-precision floating-point value `a' is
3701 less than the corresponding value `b', and 0 otherwise.  The comparison
3702 is performed according to the IEC/IEEE Standard for Binary Floating-point
3703 Arithmetic.
3704 -------------------------------------------------------------------------------
3705 */
3706 flag floatx80_lt( floatx80 a, floatx80 b )
3707 {
3708     flag aSign, bSign;
3709
3710     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3711               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3712          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3713               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3714        ) {
3715         float_raise( float_flag_invalid );
3716         return 0;
3717     }
3718     aSign = extractFloatx80Sign( a );
3719     bSign = extractFloatx80Sign( b );
3720     if ( aSign != bSign ) {
3721         return
3722                aSign
3723             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3724                  != 0 );
3725     }
3726     return
3727           aSign ? lt128( b.high, b.low, a.high, a.low )
3728         : lt128( a.high, a.low, b.high, b.low );
3729
3730 }
3731
3732 /*
3733 -------------------------------------------------------------------------------
3734 Returns 1 if the extended double-precision floating-point value `a' is equal
3735 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3736 raised if either operand is a NaN.  Otherwise, the comparison is performed
3737 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3738 -------------------------------------------------------------------------------
3739 */
3740 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3741 {
3742
3743     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3744               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3745          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3746               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3747        ) {
3748         float_raise( float_flag_invalid );
3749         return 0;
3750     }
3751     return
3752            ( a.low == b.low )
3753         && (    ( a.high == b.high )
3754              || (    ( a.low == 0 )
3755                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3756            );
3757
3758 }
3759
3760 /*
3761 -------------------------------------------------------------------------------
3762 Returns 1 if the extended double-precision floating-point value `a' is less
3763 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3764 do not cause an exception.  Otherwise, the comparison is performed according
3765 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3766 -------------------------------------------------------------------------------
3767 */
3768 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3769 {
3770     flag aSign, bSign;
3771
3772     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3773               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3774          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3775               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3776        ) {
3777         if (    floatx80_is_signaling_nan( a )
3778              || floatx80_is_signaling_nan( b ) ) {
3779             float_raise( float_flag_invalid );
3780         }
3781         return 0;
3782     }
3783     aSign = extractFloatx80Sign( a );
3784     bSign = extractFloatx80Sign( b );
3785     if ( aSign != bSign ) {
3786         return
3787                aSign
3788             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3789                  == 0 );
3790     }
3791     return
3792           aSign ? le128( b.high, b.low, a.high, a.low )
3793         : le128( a.high, a.low, b.high, b.low );
3794
3795 }
3796
3797 /*
3798 -------------------------------------------------------------------------------
3799 Returns 1 if the extended double-precision floating-point value `a' is less
3800 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3801 an exception.  Otherwise, the comparison is performed according to the
3802 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3803 -------------------------------------------------------------------------------
3804 */
3805 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3806 {
3807     flag aSign, bSign;
3808
3809     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3810               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3811          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3812               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3813        ) {
3814         if (    floatx80_is_signaling_nan( a )
3815              || floatx80_is_signaling_nan( b ) ) {
3816             float_raise( float_flag_invalid );
3817         }
3818         return 0;
3819     }
3820     aSign = extractFloatx80Sign( a );
3821     bSign = extractFloatx80Sign( b );
3822     if ( aSign != bSign ) {
3823         return
3824                aSign
3825             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3826                  != 0 );
3827     }
3828     return
3829           aSign ? lt128( b.high, b.low, a.high, a.low )
3830         : lt128( a.high, a.low, b.high, b.low );
3831
3832 }
3833
3834 #endif
3835
3836 #ifdef FLOAT128
3837
3838 /*
3839 -------------------------------------------------------------------------------
3840 Returns the result of converting the quadruple-precision floating-point
3841 value `a' to the 32-bit two's complement integer format.  The conversion
3842 is performed according to the IEC/IEEE Standard for Binary Floating-point
3843 Arithmetic---which means in particular that the conversion is rounded
3844 according to the current rounding mode.  If `a' is a NaN, the largest
3845 positive integer is returned.  Otherwise, if the conversion overflows, the
3846 largest integer with the same sign as `a' is returned.
3847 -------------------------------------------------------------------------------
3848 */
3849 int32 float128_to_int32( float128 a )
3850 {
3851     flag aSign;
3852     int32 aExp, shiftCount;
3853     bits64 aSig0, aSig1;
3854
3855     aSig1 = extractFloat128Frac1( a );
3856     aSig0 = extractFloat128Frac0( a );
3857     aExp = extractFloat128Exp( a );
3858     aSign = extractFloat128Sign( a );
3859     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3860     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3861     aSig0 |= ( aSig1 != 0 );
3862     shiftCount = 0x4028 - aExp;
3863     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3864     return roundAndPackInt32( aSign, aSig0 );
3865
3866 }
3867
3868 /*
3869 -------------------------------------------------------------------------------
3870 Returns the result of converting the quadruple-precision floating-point
3871 value `a' to the 32-bit two's complement integer format.  The conversion
3872 is performed according to the IEC/IEEE Standard for Binary Floating-point
3873 Arithmetic, except that the conversion is always rounded toward zero.  If
3874 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
3875 conversion overflows, the largest integer with the same sign as `a' is
3876 returned.
3877 -------------------------------------------------------------------------------
3878 */
3879 int32 float128_to_int32_round_to_zero( float128 a )
3880 {
3881     flag aSign;
3882     int32 aExp, shiftCount;
3883     bits64 aSig0, aSig1, savedASig;
3884     int32 z;
3885
3886     aSig1 = extractFloat128Frac1( a );
3887     aSig0 = extractFloat128Frac0( a );
3888     aExp = extractFloat128Exp( a );
3889     aSign = extractFloat128Sign( a );
3890     aSig0 |= ( aSig1 != 0 );
3891     shiftCount = 0x402F - aExp;
3892     if ( shiftCount < 17 ) {
3893         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3894         goto invalid;
3895     }
3896     else if ( 48 < shiftCount ) {
3897         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3898         return 0;
3899     }
3900     aSig0 |= LIT64( 0x0001000000000000 );
3901     savedASig = aSig0;
3902     aSig0 >>= shiftCount;
3903     z = aSig0;
3904     if ( aSign ) z = - z;
3905     if ( ( z < 0 ) ^ aSign ) {
3906  invalid:
3907         float_exception_flags |= float_flag_invalid;
3908         return aSign ? 0x80000000 : 0x7FFFFFFF;
3909     }
3910     if ( ( aSig0<<shiftCount ) != savedASig ) {
3911         float_exception_flags |= float_flag_inexact;
3912     }
3913     return z;
3914
3915 }
3916
3917 /*
3918 -------------------------------------------------------------------------------
3919 Returns the result of converting the quadruple-precision floating-point
3920 value `a' to the single-precision floating-point format.  The conversion
3921 is performed according to the IEC/IEEE Standard for Binary Floating-point
3922 Arithmetic.
3923 -------------------------------------------------------------------------------
3924 */
3925 float32 float128_to_float32( float128 a )
3926 {
3927     flag aSign;
3928     int32 aExp;
3929     bits64 aSig0, aSig1;
3930     bits32 zSig;
3931
3932     aSig1 = extractFloat128Frac1( a );
3933     aSig0 = extractFloat128Frac0( a );
3934     aExp = extractFloat128Exp( a );
3935     aSign = extractFloat128Sign( a );
3936     if ( aExp == 0x7FFF ) {
3937         if ( aSig0 | aSig1 ) {
3938             return commonNaNToFloat32( float128ToCommonNaN( a ) );
3939         }
3940         return packFloat32( aSign, 0xFF, 0 );
3941     }
3942     aSig0 |= ( aSig1 != 0 );
3943     shift64RightJamming( aSig0, 18, &aSig0 );
3944     zSig = aSig0;
3945     if ( aExp || zSig ) {
3946         zSig |= 0x40000000;
3947         aExp -= 0x3F81;
3948     }
3949     return roundAndPackFloat32( aSign, aExp, zSig );
3950
3951 }
3952
3953 /*
3954 -------------------------------------------------------------------------------
3955 Returns the result of converting the quadruple-precision floating-point
3956 value `a' to the double-precision floating-point format.  The conversion
3957 is performed according to the IEC/IEEE Standard for Binary Floating-point
3958 Arithmetic.
3959 -------------------------------------------------------------------------------
3960 */
3961 float64 float128_to_float64( float128 a )
3962 {
3963     flag aSign;
3964     int32 aExp;
3965     bits64 aSig0, aSig1;
3966
3967     aSig1 = extractFloat128Frac1( a );
3968     aSig0 = extractFloat128Frac0( a );
3969     aExp = extractFloat128Exp( a );
3970     aSign = extractFloat128Sign( a );
3971     if ( aExp == 0x7FFF ) {
3972         if ( aSig0 | aSig1 ) {
3973             return commonNaNToFloat64( float128ToCommonNaN( a ) );
3974         }
3975         return packFloat64( aSign, 0x7FF, 0 );
3976     }
3977     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
3978     aSig0 |= ( aSig1 != 0 );
3979     if ( aExp || aSig0 ) {
3980         aSig0 |= LIT64( 0x4000000000000000 );
3981         aExp -= 0x3C01;
3982     }
3983     return roundAndPackFloat64( aSign, aExp, aSig0 );
3984
3985 }
3986
3987 #ifdef FLOATX80
3988
3989 /*
3990 -------------------------------------------------------------------------------
3991 Returns the result of converting the quadruple-precision floating-point
3992 value `a' to the extended double-precision floating-point format.  The
3993 conversion is performed according to the IEC/IEEE Standard for Binary
3994 Floating-point Arithmetic.
3995 -------------------------------------------------------------------------------
3996 */
3997 floatx80 float128_to_floatx80( float128 a )
3998 {
3999     flag aSign;
4000     int32 aExp;
4001     bits64 aSig0, aSig1;
4002
4003     aSig1 = extractFloat128Frac1( a );
4004     aSig0 = extractFloat128Frac0( a );
4005     aExp = extractFloat128Exp( a );
4006     aSign = extractFloat128Sign( a );
4007     if ( aExp == 0x7FFF ) {
4008         if ( aSig0 | aSig1 ) {
4009             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4010         }
4011         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4012     }
4013     if ( aExp == 0 ) {
4014         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4015         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4016     }
4017     else {
4018         aSig0 |= LIT64( 0x0001000000000000 );
4019     }
4020     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4021     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4022
4023 }
4024
4025 #endif
4026
4027 /*
4028 -------------------------------------------------------------------------------
4029 Rounds the quadruple-precision floating-point value `a' to an integer, and
4030 returns the result as a quadruple-precision floating-point value.  The
4031 operation is performed according to the IEC/IEEE Standard for Binary
4032 Floating-point Arithmetic.
4033 -------------------------------------------------------------------------------
4034 */
4035 float128 float128_round_to_int( float128 a )
4036 {
4037     flag aSign;
4038     int32 aExp;
4039     bits64 lastBitMask, roundBitsMask;
4040     int8 roundingMode;
4041     float128 z;
4042
4043     aExp = extractFloat128Exp( a );
4044     if ( 0x402F <= aExp ) {
4045         if ( 0x406F <= aExp ) {
4046             if (    ( aExp == 0x7FFF )
4047                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4048                ) {
4049                 return propagateFloat128NaN( a, a );
4050             }
4051             return a;
4052         }
4053         lastBitMask = 1;
4054         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4055         roundBitsMask = lastBitMask - 1;
4056         z = a;
4057         roundingMode = float_rounding_mode;
4058         if ( roundingMode == float_round_nearest_even ) {
4059             if ( lastBitMask ) {
4060                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4061                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4062             }
4063             else {
4064                 if ( (sbits64) z.low < 0 ) {
4065                     ++z.high;
4066                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4067                 }
4068             }
4069         }
4070         else if ( roundingMode != float_round_to_zero ) {
4071             if (   extractFloat128Sign( z )
4072                  ^ ( roundingMode == float_round_up ) ) {
4073                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4074             }
4075         }
4076         z.low &= ~ roundBitsMask;
4077     }
4078     else {
4079         if ( aExp <= 0x3FFE ) {
4080             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4081             float_exception_flags |= float_flag_inexact;
4082             aSign = extractFloat128Sign( a );
4083             switch ( float_rounding_mode ) {
4084              case float_round_nearest_even:
4085                 if (    ( aExp == 0x3FFE )
4086                      && (   extractFloat128Frac0( a )
4087                           | extractFloat128Frac1( a ) )
4088                    ) {
4089                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4090                 }
4091                 break;
4092              case float_round_down:
4093                 return
4094                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4095                     : packFloat128( 0, 0, 0, 0 );
4096              case float_round_up:
4097                 return
4098                       aSign ? packFloat128( 1, 0, 0, 0 )
4099                     : packFloat128( 0, 0x3FFF, 0, 0 );
4100             }
4101             return packFloat128( aSign, 0, 0, 0 );
4102         }
4103         lastBitMask = 1;
4104         lastBitMask <<= 0x402F - aExp;
4105         roundBitsMask = lastBitMask - 1;
4106         z.low = 0;
4107         z.high = a.high;
4108         roundingMode = float_rounding_mode;
4109         if ( roundingMode == float_round_nearest_even ) {
4110             z.high += lastBitMask>>1;
4111             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4112                 z.high &= ~ lastBitMask;
4113             }
4114         }
4115         else if ( roundingMode != float_round_to_zero ) {
4116             if (   extractFloat128Sign( z )
4117                  ^ ( roundingMode == float_round_up ) ) {
4118                 z.high |= ( a.low != 0 );
4119                 z.high += roundBitsMask;
4120             }
4121         }
4122         z.high &= ~ roundBitsMask;
4123     }
4124     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4125         float_exception_flags |= float_flag_inexact;
4126     }
4127     return z;
4128
4129 }
4130
4131 /*
4132 -------------------------------------------------------------------------------
4133 Returns the result of adding the absolute values of the quadruple-precision
4134 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
4135 before being returned.  `zSign' is ignored if the result is a NaN.  The
4136 addition is performed according to the IEC/IEEE Standard for Binary
4137 Floating-point Arithmetic.
4138 -------------------------------------------------------------------------------
4139 */
4140 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4141 {
4142     int32 aExp, bExp, zExp;
4143     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4144     int32 expDiff;
4145
4146     aSig1 = extractFloat128Frac1( a );
4147     aSig0 = extractFloat128Frac0( a );
4148     aExp = extractFloat128Exp( a );
4149     bSig1 = extractFloat128Frac1( b );
4150     bSig0 = extractFloat128Frac0( b );
4151     bExp = extractFloat128Exp( b );
4152     expDiff = aExp - bExp;
4153     if ( 0 < expDiff ) {
4154         if ( aExp == 0x7FFF ) {
4155             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4156             return a;
4157         }
4158         if ( bExp == 0 ) {
4159             --expDiff;
4160         }
4161         else {
4162             bSig0 |= LIT64( 0x0001000000000000 );
4163         }
4164         shift128ExtraRightJamming(
4165             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4166         zExp = aExp;
4167     }
4168     else if ( expDiff < 0 ) {
4169         if ( bExp == 0x7FFF ) {
4170             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4171             return packFloat128( zSign, 0x7FFF, 0, 0 );
4172         }
4173         if ( aExp == 0 ) {
4174             ++expDiff;
4175         }
4176         else {
4177             aSig0 |= LIT64( 0x0001000000000000 );
4178         }
4179         shift128ExtraRightJamming(
4180             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4181         zExp = bExp;
4182     }
4183     else {
4184         if ( aExp == 0x7FFF ) {
4185             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4186                 return propagateFloat128NaN( a, b );
4187             }
4188             return a;
4189         }
4190         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4191         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4192         zSig2 = 0;
4193         zSig0 |= LIT64( 0x0002000000000000 );
4194         zExp = aExp;
4195         goto shiftRight1;
4196     }
4197     aSig0 |= LIT64( 0x0001000000000000 );
4198     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4199     --zExp;
4200     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4201     ++zExp;
4202  shiftRight1:
4203     shift128ExtraRightJamming(
4204         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4205  roundAndPack:
4206     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4207
4208 }
4209
4210 /*
4211 -------------------------------------------------------------------------------
4212 Returns the result of subtracting the absolute values of the quadruple-
4213 precision floating-point values `a' and `b'.  If `zSign' is true, the
4214 difference is negated before being returned.  `zSign' is ignored if the
4215 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4216 Standard for Binary Floating-point Arithmetic.
4217 -------------------------------------------------------------------------------
4218 */
4219 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4220 {
4221     int32 aExp, bExp, zExp;
4222     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4223     int32 expDiff;
4224     float128 z;
4225
4226     aSig1 = extractFloat128Frac1( a );
4227     aSig0 = extractFloat128Frac0( a );
4228     aExp = extractFloat128Exp( a );
4229     bSig1 = extractFloat128Frac1( b );
4230     bSig0 = extractFloat128Frac0( b );
4231     bExp = extractFloat128Exp( b );
4232     expDiff = aExp - bExp;
4233     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4234     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4235     if ( 0 < expDiff ) goto aExpBigger;
4236     if ( expDiff < 0 ) goto bExpBigger;
4237     if ( aExp == 0x7FFF ) {
4238         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4239             return propagateFloat128NaN( a, b );
4240         }
4241         float_raise( float_flag_invalid );
4242         z.low = float128_default_nan_low;
4243         z.high = float128_default_nan_high;
4244         return z;
4245     }
4246     if ( aExp == 0 ) {
4247         aExp = 1;
4248         bExp = 1;
4249     }
4250     if ( bSig0 < aSig0 ) goto aBigger;
4251     if ( aSig0 < bSig0 ) goto bBigger;
4252     if ( bSig1 < aSig1 ) goto aBigger;
4253     if ( aSig1 < bSig1 ) goto bBigger;
4254     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4255  bExpBigger:
4256     if ( bExp == 0x7FFF ) {
4257         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4258         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4259     }
4260     if ( aExp == 0 ) {
4261         ++expDiff;
4262     }
4263     else {
4264         aSig0 |= LIT64( 0x4000000000000000 );
4265     }
4266     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4267     bSig0 |= LIT64( 0x4000000000000000 );
4268  bBigger:
4269     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4270     zExp = bExp;
4271     zSign ^= 1;
4272     goto normalizeRoundAndPack;
4273  aExpBigger:
4274     if ( aExp == 0x7FFF ) {
4275         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4276         return a;
4277     }
4278     if ( bExp == 0 ) {
4279         --expDiff;
4280     }
4281     else {
4282         bSig0 |= LIT64( 0x4000000000000000 );
4283     }
4284     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4285     aSig0 |= LIT64( 0x4000000000000000 );
4286  aBigger:
4287     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4288     zExp = aExp;
4289  normalizeRoundAndPack:
4290     --zExp;
4291     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4292
4293 }
4294
4295 /*
4296 -------------------------------------------------------------------------------
4297 Returns the result of adding the quadruple-precision floating-point values
4298 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4299 for Binary Floating-point Arithmetic.
4300 -------------------------------------------------------------------------------
4301 */
4302 float128 float128_add( float128 a, float128 b )
4303 {
4304     flag aSign, bSign;
4305
4306     aSign = extractFloat128Sign( a );
4307     bSign = extractFloat128Sign( b );
4308     if ( aSign == bSign ) {
4309         return addFloat128Sigs( a, b, aSign );
4310     }
4311     else {
4312         return subFloat128Sigs( a, b, aSign );
4313     }
4314
4315 }
4316
4317 /*
4318 -------------------------------------------------------------------------------
4319 Returns the result of subtracting the quadruple-precision floating-point
4320 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4321 Standard for Binary Floating-point Arithmetic.
4322 -------------------------------------------------------------------------------
4323 */
4324 float128 float128_sub( float128 a, float128 b )
4325 {
4326     flag aSign, bSign;
4327
4328     aSign = extractFloat128Sign( a );
4329     bSign = extractFloat128Sign( b );
4330     if ( aSign == bSign ) {
4331         return subFloat128Sigs( a, b, aSign );
4332     }
4333     else {
4334         return addFloat128Sigs( a, b, aSign );
4335     }
4336
4337 }
4338
4339 /*
4340 -------------------------------------------------------------------------------
4341 Returns the result of multiplying the quadruple-precision floating-point
4342 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4343 Standard for Binary Floating-point Arithmetic.
4344 -------------------------------------------------------------------------------
4345 */
4346 float128 float128_mul( float128 a, float128 b )
4347 {
4348     flag aSign, bSign, zSign;
4349     int32 aExp, bExp, zExp;
4350     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4351     float128 z;
4352
4353     aSig1 = extractFloat128Frac1( a );
4354     aSig0 = extractFloat128Frac0( a );
4355     aExp = extractFloat128Exp( a );
4356     aSign = extractFloat128Sign( a );
4357     bSig1 = extractFloat128Frac1( b );
4358     bSig0 = extractFloat128Frac0( b );
4359     bExp = extractFloat128Exp( b );
4360     bSign = extractFloat128Sign( b );
4361     zSign = aSign ^ bSign;
4362     if ( aExp == 0x7FFF ) {
4363         if (    ( aSig0 | aSig1 )
4364              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4365             return propagateFloat128NaN( a, b );
4366         }
4367         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4368         return packFloat128( zSign, 0x7FFF, 0, 0 );
4369     }
4370     if ( bExp == 0x7FFF ) {
4371         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4372         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4373  invalid:
4374             float_raise( float_flag_invalid );
4375             z.low = float128_default_nan_low;
4376             z.high = float128_default_nan_high;
4377             return z;
4378         }
4379         return packFloat128( zSign, 0x7FFF, 0, 0 );
4380     }
4381     if ( aExp == 0 ) {
4382         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4383         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4384     }
4385     if ( bExp == 0 ) {
4386         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4387         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4388     }
4389     zExp = aExp + bExp - 0x4000;
4390     aSig0 |= LIT64( 0x0001000000000000 );
4391     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4392     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4393     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4394     zSig2 |= ( zSig3 != 0 );
4395     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4396         shift128ExtraRightJamming(
4397             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4398         ++zExp;
4399     }
4400     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4401
4402 }
4403
4404 /*
4405 -------------------------------------------------------------------------------
4406 Returns the result of dividing the quadruple-precision floating-point value
4407 `a' by the corresponding value `b'.  The operation is performed according to
4408 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4409 -------------------------------------------------------------------------------
4410 */
4411 float128 float128_div( float128 a, float128 b )
4412 {
4413     flag aSign, bSign, zSign;
4414     int32 aExp, bExp, zExp;
4415     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4416     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4417     float128 z;
4418
4419     aSig1 = extractFloat128Frac1( a );
4420     aSig0 = extractFloat128Frac0( a );
4421     aExp = extractFloat128Exp( a );
4422     aSign = extractFloat128Sign( a );
4423     bSig1 = extractFloat128Frac1( b );
4424     bSig0 = extractFloat128Frac0( b );
4425     bExp = extractFloat128Exp( b );
4426     bSign = extractFloat128Sign( b );
4427     zSign = aSign ^ bSign;
4428     if ( aExp == 0x7FFF ) {
4429         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4430         if ( bExp == 0x7FFF ) {
4431             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4432             goto invalid;
4433         }
4434         return packFloat128( zSign, 0x7FFF, 0, 0 );
4435     }
4436     if ( bExp == 0x7FFF ) {
4437         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4438         return packFloat128( zSign, 0, 0, 0 );
4439     }
4440     if ( bExp == 0 ) {
4441         if ( ( bSig0 | bSig1 ) == 0 ) {
4442             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4443  invalid:
4444                 float_raise( float_flag_invalid );
4445                 z.low = float128_default_nan_low;
4446                 z.high = float128_default_nan_high;
4447                 return z;
4448             }
4449             float_raise( float_flag_divbyzero );
4450             return packFloat128( zSign, 0x7FFF, 0, 0 );
4451         }
4452         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4453     }
4454     if ( aExp == 0 ) {
4455         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4456         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4457     }
4458     zExp = aExp - bExp + 0x3FFD;
4459     shortShift128Left(
4460         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4461     shortShift128Left(
4462         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4463     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4464         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4465         ++zExp;
4466     }
4467     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4468     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4469     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4470     while ( (sbits64) rem0 < 0 ) {
4471         --zSig0;
4472         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4473     }
4474     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4475     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4476         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4477         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4478         while ( (sbits64) rem1 < 0 ) {
4479             --zSig1;
4480             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4481         }
4482         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4483     }
4484     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4485     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4486
4487 }
4488
4489 /*
4490 -------------------------------------------------------------------------------
4491 Returns the remainder of the quadruple-precision floating-point value `a'
4492 with respect to the corresponding value `b'.  The operation is performed
4493 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4494 -------------------------------------------------------------------------------
4495 */
4496 float128 float128_rem( float128 a, float128 b )
4497 {
4498     flag aSign, bSign, zSign;
4499     int32 aExp, bExp, expDiff;
4500     bits64 aSig0, aSig1, bSig0, bSig1;
4501     bits64 q, term0, term1, term2, allZero, alternateASig0, alternateASig1;
4502     bits64 sigMean1;
4503     sbits64 sigMean0;
4504     float128 z;
4505
4506     aSig1 = extractFloat128Frac1( a );
4507     aSig0 = extractFloat128Frac0( a );
4508     aExp = extractFloat128Exp( a );
4509     aSign = extractFloat128Sign( a );
4510     bSig1 = extractFloat128Frac1( b );
4511     bSig0 = extractFloat128Frac0( b );
4512     bExp = extractFloat128Exp( b );
4513     bSign = extractFloat128Sign( b );
4514     if ( aExp == 0x7FFF ) {
4515         if (    ( aSig0 | aSig1 )
4516              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4517             return propagateFloat128NaN( a, b );
4518         }
4519         goto invalid;
4520     }
4521     if ( bExp == 0x7FFF ) {
4522         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4523         return a;
4524     }
4525     if ( bExp == 0 ) {
4526         if ( ( bSig0 | bSig1 ) == 0 ) {
4527  invalid:
4528             float_raise( float_flag_invalid );
4529             z.low = float128_default_nan_low;
4530             z.high = float128_default_nan_high;
4531             return z;
4532         }
4533         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4534     }
4535     if ( aExp == 0 ) {
4536         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4537         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4538     }
4539     expDiff = aExp - bExp;
4540     if ( expDiff < -1 ) return a;
4541     shortShift128Left(
4542         aSig0 | LIT64( 0x0001000000000000 ),
4543         aSig1,
4544         15 - ( expDiff < 0 ),
4545         &aSig0,
4546         &aSig1
4547     );
4548     shortShift128Left(
4549         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4550     q = le128( bSig0, bSig1, aSig0, aSig1 );
4551     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4552     expDiff -= 64;
4553     while ( 0 < expDiff ) {
4554         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4555         q = ( 4 < q ) ? q - 4 : 0;
4556         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4557         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4558         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4559         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4560         expDiff -= 61;
4561     }
4562     if ( -64 < expDiff ) {
4563         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4564         q = ( 4 < q ) ? q - 4 : 0;
4565         q >>= - expDiff;
4566         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4567         expDiff += 52;
4568         if ( expDiff < 0 ) {
4569             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4570         }
4571         else {
4572             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4573         }
4574         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4575         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4576     }
4577     else {
4578         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4579         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4580     }
4581     do {
4582         alternateASig0 = aSig0;
4583         alternateASig1 = aSig1;
4584         ++q;
4585         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4586     } while ( 0 <= (sbits64) aSig0 );
4587     add128(
4588         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4589     if (    ( sigMean0 < 0 )
4590          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4591         aSig0 = alternateASig0;
4592         aSig1 = alternateASig1;
4593     }
4594     zSign = ( (sbits64) aSig0 < 0 );
4595     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4596     return
4597         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4598
4599 }
4600
4601 /*
4602 -------------------------------------------------------------------------------
4603 Returns the square root of the quadruple-precision floating-point value `a'.
4604 The operation is performed according to the IEC/IEEE Standard for Binary
4605 Floating-point Arithmetic.
4606 -------------------------------------------------------------------------------
4607 */
4608 float128 float128_sqrt( float128 a )
4609 {
4610     flag aSign;
4611     int32 aExp, zExp;
4612     bits64 aSig0, aSig1, zSig0, zSig1, zSig2;
4613     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4614     bits64 shiftedRem0, shiftedRem1;
4615     float128 z;
4616
4617     aSig1 = extractFloat128Frac1( a );
4618     aSig0 = extractFloat128Frac0( a );
4619     aExp = extractFloat128Exp( a );
4620     aSign = extractFloat128Sign( a );
4621     if ( aExp == 0x7FFF ) {
4622         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4623         if ( ! aSign ) return a;
4624         goto invalid;
4625     }
4626     if ( aSign ) {
4627         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4628  invalid:
4629         float_raise( float_flag_invalid );
4630         z.low = float128_default_nan_low;
4631         z.high = float128_default_nan_high;
4632         return z;
4633     }
4634     if ( aExp == 0 ) {
4635         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4636         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4637     }
4638     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4639     aSig0 |= LIT64( 0x0001000000000000 );
4640     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4641     zSig0 <<= 31;
4642     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4643     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
4644     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
4645     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
4646     mul64To128( zSig0, zSig0, &term0, &term1 );
4647     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4648     while ( (sbits64) rem0 < 0 ) {
4649         --zSig0;
4650         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
4651         term1 |= 1;
4652         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
4653     }
4654     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
4655     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
4656     if ( ( zSig1 & 0x3FFF ) <= 5 ) {
4657         if ( zSig1 == 0 ) zSig1 = 1;
4658         mul64To128( zSig0, zSig1, &term1, &term2 );
4659         shortShift128Left( term1, term2, 1, &term1, &term2 );
4660         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4661         mul64To128( zSig1, zSig1, &term2, &term3 );
4662         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4663         while ( (sbits64) rem1 < 0 ) {
4664             --zSig1;
4665             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
4666             term3 |= 1;
4667             add192(
4668                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
4669         }
4670         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4671     }
4672     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4673     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4674
4675 }
4676
4677 /*
4678 -------------------------------------------------------------------------------
4679 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4680 the corresponding value `b', and 0 otherwise.  The comparison is performed
4681 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4682 -------------------------------------------------------------------------------
4683 */
4684 flag float128_eq( float128 a, float128 b )
4685 {
4686
4687     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4688               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4689          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4690               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4691        ) {
4692         if (    float128_is_signaling_nan( a )
4693              || float128_is_signaling_nan( b ) ) {
4694             float_raise( float_flag_invalid );
4695         }
4696         return 0;
4697     }
4698     return
4699            ( a.low == b.low )
4700         && (    ( a.high == b.high )
4701              || (    ( a.low == 0 )
4702                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4703            );
4704
4705 }
4706
4707 /*
4708 -------------------------------------------------------------------------------
4709 Returns 1 if the quadruple-precision floating-point value `a' is less than
4710 or equal to the corresponding value `b', and 0 otherwise.  The comparison
4711 is performed according to the IEC/IEEE Standard for Binary Floating-point
4712 Arithmetic.
4713 -------------------------------------------------------------------------------
4714 */
4715 flag float128_le( float128 a, float128 b )
4716 {
4717     flag aSign, bSign;
4718
4719     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4720               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4721          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4722               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4723        ) {
4724         float_raise( float_flag_invalid );
4725         return 0;
4726     }
4727     aSign = extractFloat128Sign( a );
4728     bSign = extractFloat128Sign( b );
4729     if ( aSign != bSign ) {
4730         return
4731                aSign
4732             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4733                  == 0 );
4734     }
4735     return
4736           aSign ? le128( b.high, b.low, a.high, a.low )
4737         : le128( a.high, a.low, b.high, b.low );
4738
4739 }
4740
4741 /*
4742 -------------------------------------------------------------------------------
4743 Returns 1 if the quadruple-precision floating-point value `a' is less than
4744 the corresponding value `b', and 0 otherwise.  The comparison is performed
4745 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4746 -------------------------------------------------------------------------------
4747 */
4748 flag float128_lt( float128 a, float128 b )
4749 {
4750     flag aSign, bSign;
4751
4752     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4753               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4754          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4755               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4756        ) {
4757         float_raise( float_flag_invalid );
4758         return 0;
4759     }
4760     aSign = extractFloat128Sign( a );
4761     bSign = extractFloat128Sign( b );
4762     if ( aSign != bSign ) {
4763         return
4764                aSign
4765             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4766                  != 0 );
4767     }
4768     return
4769           aSign ? lt128( b.high, b.low, a.high, a.low )
4770         : lt128( a.high, a.low, b.high, b.low );
4771
4772 }
4773
4774 /*
4775 -------------------------------------------------------------------------------
4776 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4777 the corresponding value `b', and 0 otherwise.  The invalid exception is
4778 raised if either operand is a NaN.  Otherwise, the comparison is performed
4779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4780 -------------------------------------------------------------------------------
4781 */
4782 flag float128_eq_signaling( float128 a, float128 b )
4783 {
4784
4785     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4786               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4787          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4788               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4789        ) {
4790         float_raise( float_flag_invalid );
4791         return 0;
4792     }
4793     return
4794            ( a.low == b.low )
4795         && (    ( a.high == b.high )
4796              || (    ( a.low == 0 )
4797                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4798            );
4799
4800 }
4801
4802 /*
4803 -------------------------------------------------------------------------------
4804 Returns 1 if the quadruple-precision floating-point value `a' is less than
4805 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4806 cause an exception.  Otherwise, the comparison is performed according to the
4807 IEC/IEEE Standard for Binary Floating-point Arithmetic.
4808 -------------------------------------------------------------------------------
4809 */
4810 flag float128_le_quiet( float128 a, float128 b )
4811 {
4812     flag aSign, bSign;
4813
4814     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4815               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4816          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4817               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4818        ) {
4819         if (    float128_is_signaling_nan( a )
4820              || float128_is_signaling_nan( b ) ) {
4821             float_raise( float_flag_invalid );
4822         }
4823         return 0;
4824     }
4825     aSign = extractFloat128Sign( a );
4826     bSign = extractFloat128Sign( b );
4827     if ( aSign != bSign ) {
4828         return
4829                aSign
4830             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4831                  == 0 );
4832     }
4833     return
4834           aSign ? le128( b.high, b.low, a.high, a.low )
4835         : le128( a.high, a.low, b.high, b.low );
4836
4837 }
4838
4839 /*
4840 -------------------------------------------------------------------------------
4841 Returns 1 if the quadruple-precision floating-point value `a' is less than
4842 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4843 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4844 Standard for Binary Floating-point Arithmetic.
4845 -------------------------------------------------------------------------------
4846 */
4847 flag float128_lt_quiet( float128 a, float128 b )
4848 {
4849     flag aSign, bSign;
4850
4851     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4852               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4853          || (    ( extractFloat128Exp( b ) == 0x7FFF )
4854               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4855        ) {
4856         if (    float128_is_signaling_nan( a )
4857              || float128_is_signaling_nan( b ) ) {
4858             float_raise( float_flag_invalid );
4859         }
4860         return 0;
4861     }
4862     aSign = extractFloat128Sign( a );
4863     bSign = extractFloat128Sign( b );
4864     if ( aSign != bSign ) {
4865         return
4866                aSign
4867             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4868                  != 0 );
4869     }
4870     return
4871           aSign ? lt128( b.high, b.low, a.high, a.low )
4872         : lt128( a.high, a.low, b.high, b.low );
4873
4874 }
4875
4876 #endif
4877