arch/arm/nwfpe/softfloat.c

   1 /*
   2 ===============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5 Arithmetic Package, Release 2.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  15 arithmetic/softfloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23 Derivative works are acceptable, even for commercial purposes, so long as
  24 (1) they include prominent notice that the work is derivative, and (2) they
  25 include prominent notice akin to these three paragraphs for those parts of
  26 this code that are retained.
  27
  28 ===============================================================================
  29 */
  30
  31 #include <asm/div64.h>
  32
  33 #include "fpa11.h"
  34 //#include "milieu.h"
  35 //#include "softfloat.h"
  36
  37 /*
  38 -------------------------------------------------------------------------------
  39 Primitive arithmetic functions, including multi-word arithmetic, and
  40 division and square root approximations.  (Can be specialized to target if
  41 desired.)
  42 -------------------------------------------------------------------------------
  43 */
  44 #include "softfloat-macros"
  45
  46 /*
  47 -------------------------------------------------------------------------------
  48 Functions and definitions to determine:  (1) whether tininess for underflow
  49 is detected before or after rounding by default, (2) what (if anything)
  50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 are propagated from function inputs to output.  These details are target-
  53 specific.
  54 -------------------------------------------------------------------------------
  55 */
  56 #include "softfloat-specialize"
  57
  58 /*
  59 -------------------------------------------------------------------------------
  60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  61 and 7, and returns the properly rounded 32-bit integer corresponding to the
  62 input.  If `zSign' is nonzero, the input is negated before being converted
  63 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  64 input is simply rounded to an integer, with the inexact exception raised if
  65 the input cannot be represented exactly as an integer.  If the fixed-point
  66 input is too large, however, the invalid exception is raised and the largest
  67 positive or negative integer is returned.
  68 -------------------------------------------------------------------------------
  69 */
  70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
  71 {
  72     int8 roundingMode;
  73     flag roundNearestEven;
  74     int8 roundIncrement, roundBits;
  75     int32 z;
  76
  77     roundingMode = roundData->mode;
  78     roundNearestEven = ( roundingMode == float_round_nearest_even );
  79     roundIncrement = 0x40;
  80     if ( ! roundNearestEven ) {
  81         if ( roundingMode == float_round_to_zero ) {
  82             roundIncrement = 0;
  83         }
  84         else {
  85             roundIncrement = 0x7F;
  86             if ( zSign ) {
  87                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  88             }
  89             else {
  90                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  91             }
  92         }
  93     }
  94     roundBits = absZ & 0x7F;
  95     absZ = ( absZ + roundIncrement )>>7;
  96     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  97     z = absZ;
  98     if ( zSign ) z = - z;
  99     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 100         roundData->exception |= float_flag_invalid;
 101         return zSign ? 0x80000000 : 0x7FFFFFFF;
 102     }
 103     if ( roundBits ) roundData->exception |= float_flag_inexact;
 104     return z;
 105
 106 }
 107
 108 /*
 109 -------------------------------------------------------------------------------
 110 Returns the fraction bits of the single-precision floating-point value `a'.
 111 -------------------------------------------------------------------------------
 112 */
 113 INLINE bits32 extractFloat32Frac( float32 a )
 114 {
 115
 116     return a & 0x007FFFFF;
 117
 118 }
 119
 120 /*
 121 -------------------------------------------------------------------------------
 122 Returns the exponent bits of the single-precision floating-point value `a'.
 123 -------------------------------------------------------------------------------
 124 */
 125 INLINE int16 extractFloat32Exp( float32 a )
 126 {
 127
 128     return ( a>>23 ) & 0xFF;
 129
 130 }
 131
 132 /*
 133 -------------------------------------------------------------------------------
 134 Returns the sign bit of the single-precision floating-point value `a'.
 135 -------------------------------------------------------------------------------
 136 */
 137 #if 0   /* in softfloat.h */
 138 INLINE flag extractFloat32Sign( float32 a )
 139 {
 140
 141     return a>>31;
 142
 143 }
 144 #endif
 145
 146 /*
 147 -------------------------------------------------------------------------------
 148 Normalizes the subnormal single-precision floating-point value represented
 149 by the denormalized significand `aSig'.  The normalized exponent and
 150 significand are stored at the locations pointed to by `zExpPtr' and
 151 `zSigPtr', respectively.
 152 -------------------------------------------------------------------------------
 153 */
 154 static void
 155  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 156 {
 157     int8 shiftCount;
 158
 159     shiftCount = countLeadingZeros32( aSig ) - 8;
 160     *zSigPtr = aSig<<shiftCount;
 161     *zExpPtr = 1 - shiftCount;
 162
 163 }
 164
 165 /*
 166 -------------------------------------------------------------------------------
 167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 168 single-precision floating-point value, returning the result.  After being
 169 shifted into the proper positions, the three fields are simply added
 170 together to form the result.  This means that any integer portion of `zSig'
 171 will be added into the exponent.  Since a properly normalized significand
 172 will have an integer portion equal to 1, the `zExp' input should be 1 less
 173 than the desired result exponent whenever `zSig' is a complete, normalized
 174 significand.
 175 -------------------------------------------------------------------------------
 176 */
 177 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 178 {
 179 #if 0
 180    float32 f;
 181    __asm__("@ packFloat32                               \n\
 182             mov %0, %1, asl #31                         \n\
 183             orr %0, %2, asl #23                         \n\
 184             orr %0, %3"
 185             : /* no outputs */
 186             : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
 187             : "cc");
 188    return f;
 189 #else
 190     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 191 #endif
 192 }
 193
 194 /*
 195 -------------------------------------------------------------------------------
 196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 197 and significand `zSig', and returns the proper single-precision floating-
 198 point value corresponding to the abstract input.  Ordinarily, the abstract
 199 value is simply rounded and packed into the single-precision format, with
 200 the inexact exception raised if the abstract input cannot be represented
 201 exactly.  If the abstract value is too large, however, the overflow and
 202 inexact exceptions are raised and an infinity or maximal finite value is
 203 returned.  If the abstract value is too small, the input value is rounded to
 204 a subnormal number, and the underflow and inexact exceptions are raised if
 205 the abstract input cannot be represented exactly as a subnormal single-
 206 precision floating-point number.
 207     The input significand `zSig' has its binary point between bits 30
 208 and 29, which is 7 bits to the left of the usual location.  This shifted
 209 significand must be normalized or smaller.  If `zSig' is not normalized,
 210 `zExp' must be 0; in that case, the result returned is a subnormal number,
 211 and it must not require rounding.  In the usual case that `zSig' is
 212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 213 The handling of underflow and overflow follows the IEC/IEEE Standard for
 214 Binary Floating-point Arithmetic.
 215 -------------------------------------------------------------------------------
 216 */
 217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 218 {
 219     int8 roundingMode;
 220     flag roundNearestEven;
 221     int8 roundIncrement, roundBits;
 222     flag isTiny;
 223
 224     roundingMode = roundData->mode;
 225     roundNearestEven = ( roundingMode == float_round_nearest_even );
 226     roundIncrement = 0x40;
 227     if ( ! roundNearestEven ) {
 228         if ( roundingMode == float_round_to_zero ) {
 229             roundIncrement = 0;
 230         }
 231         else {
 232             roundIncrement = 0x7F;
 233             if ( zSign ) {
 234                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 235             }
 236             else {
 237                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 238             }
 239         }
 240     }
 241     roundBits = zSig & 0x7F;
 242     if ( 0xFD <= (bits16) zExp ) {
 243         if (    ( 0xFD < zExp )
 244              || (    ( zExp == 0xFD )
 245                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 246            ) {
 247             roundData->exception |= float_flag_overflow | float_flag_inexact;
 248             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 249         }
 250         if ( zExp < 0 ) {
 251             isTiny =
 252                    ( float_detect_tininess == float_tininess_before_rounding )
 253                 || ( zExp < -1 )
 254                 || ( zSig + roundIncrement < 0x80000000 );
 255             shift32RightJamming( zSig, - zExp, &zSig );
 256             zExp = 0;
 257             roundBits = zSig & 0x7F;
 258             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 259         }
 260     }
 261     if ( roundBits ) roundData->exception |= float_flag_inexact;
 262     zSig = ( zSig + roundIncrement )>>7;
 263     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 264     if ( zSig == 0 ) zExp = 0;
 265     return packFloat32( zSign, zExp, zSig );
 266
 267 }
 268
 269 /*
 270 -------------------------------------------------------------------------------
 271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 272 and significand `zSig', and returns the proper single-precision floating-
 273 point value corresponding to the abstract input.  This routine is just like
 274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
 275 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 276 point exponent.
 277 -------------------------------------------------------------------------------
 278 */
 279 static float32
 280  normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 281 {
 282     int8 shiftCount;
 283
 284     shiftCount = countLeadingZeros32( zSig ) - 1;
 285     return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 286
 287 }
 288
 289 /*
 290 -------------------------------------------------------------------------------
 291 Returns the fraction bits of the double-precision floating-point value `a'.
 292 -------------------------------------------------------------------------------
 293 */
 294 INLINE bits64 extractFloat64Frac( float64 a )
 295 {
 296
 297     return a & LIT64( 0x000FFFFFFFFFFFFF );
 298
 299 }
 300
 301 /*
 302 -------------------------------------------------------------------------------
 303 Returns the exponent bits of the double-precision floating-point value `a'.
 304 -------------------------------------------------------------------------------
 305 */
 306 INLINE int16 extractFloat64Exp( float64 a )
 307 {
 308
 309     return ( a>>52 ) & 0x7FF;
 310
 311 }
 312
 313 /*
 314 -------------------------------------------------------------------------------
 315 Returns the sign bit of the double-precision floating-point value `a'.
 316 -------------------------------------------------------------------------------
 317 */
 318 #if 0   /* in softfloat.h */
 319 INLINE flag extractFloat64Sign( float64 a )
 320 {
 321
 322     return a>>63;
 323
 324 }
 325 #endif
 326
 327 /*
 328 -------------------------------------------------------------------------------
 329 Normalizes the subnormal double-precision floating-point value represented
 330 by the denormalized significand `aSig'.  The normalized exponent and
 331 significand are stored at the locations pointed to by `zExpPtr' and
 332 `zSigPtr', respectively.
 333 -------------------------------------------------------------------------------
 334 */
 335 static void
 336  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 337 {
 338     int8 shiftCount;
 339
 340     shiftCount = countLeadingZeros64( aSig ) - 11;
 341     *zSigPtr = aSig<<shiftCount;
 342     *zExpPtr = 1 - shiftCount;
 343
 344 }
 345
 346 /*
 347 -------------------------------------------------------------------------------
 348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 349 double-precision floating-point value, returning the result.  After being
 350 shifted into the proper positions, the three fields are simply added
 351 together to form the result.  This means that any integer portion of `zSig'
 352 will be added into the exponent.  Since a properly normalized significand
 353 will have an integer portion equal to 1, the `zExp' input should be 1 less
 354 than the desired result exponent whenever `zSig' is a complete, normalized
 355 significand.
 356 -------------------------------------------------------------------------------
 357 */
 358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 359 {
 360
 361     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 362
 363 }
 364
 365 /*
 366 -------------------------------------------------------------------------------
 367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 368 and significand `zSig', and returns the proper double-precision floating-
 369 point value corresponding to the abstract input.  Ordinarily, the abstract
 370 value is simply rounded and packed into the double-precision format, with
 371 the inexact exception raised if the abstract input cannot be represented
 372 exactly.  If the abstract value is too large, however, the overflow and
 373 inexact exceptions are raised and an infinity or maximal finite value is
 374 returned.  If the abstract value is too small, the input value is rounded to
 375 a subnormal number, and the underflow and inexact exceptions are raised if
 376 the abstract input cannot be represented exactly as a subnormal double-
 377 precision floating-point number.
 378     The input significand `zSig' has its binary point between bits 62
 379 and 61, which is 10 bits to the left of the usual location.  This shifted
 380 significand must be normalized or smaller.  If `zSig' is not normalized,
 381 `zExp' must be 0; in that case, the result returned is a subnormal number,
 382 and it must not require rounding.  In the usual case that `zSig' is
 383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 384 The handling of underflow and overflow follows the IEC/IEEE Standard for
 385 Binary Floating-point Arithmetic.
 386 -------------------------------------------------------------------------------
 387 */
 388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 389 {
 390     int8 roundingMode;
 391     flag roundNearestEven;
 392     int16 roundIncrement, roundBits;
 393     flag isTiny;
 394
 395     roundingMode = roundData->mode;
 396     roundNearestEven = ( roundingMode == float_round_nearest_even );
 397     roundIncrement = 0x200;
 398     if ( ! roundNearestEven ) {
 399         if ( roundingMode == float_round_to_zero ) {
 400             roundIncrement = 0;
 401         }
 402         else {
 403             roundIncrement = 0x3FF;
 404             if ( zSign ) {
 405                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 406             }
 407             else {
 408                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 409             }
 410         }
 411     }
 412     roundBits = zSig & 0x3FF;
 413     if ( 0x7FD <= (bits16) zExp ) {
 414         if (    ( 0x7FD < zExp )
 415              || (    ( zExp == 0x7FD )
 416                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 417            ) {
 418             //register int lr = __builtin_return_address(0);
 419             //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 420             roundData->exception |= float_flag_overflow | float_flag_inexact;
 421             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 422         }
 423         if ( zExp < 0 ) {
 424             isTiny =
 425                    ( float_detect_tininess == float_tininess_before_rounding )
 426                 || ( zExp < -1 )
 427                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 428             shift64RightJamming( zSig, - zExp, &zSig );
 429             zExp = 0;
 430             roundBits = zSig & 0x3FF;
 431             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 432         }
 433     }
 434     if ( roundBits ) roundData->exception |= float_flag_inexact;
 435     zSig = ( zSig + roundIncrement )>>10;
 436     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 437     if ( zSig == 0 ) zExp = 0;
 438     return packFloat64( zSign, zExp, zSig );
 439
 440 }
 441
 442 /*
 443 -------------------------------------------------------------------------------
 444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 445 and significand `zSig', and returns the proper double-precision floating-
 446 point value corresponding to the abstract input.  This routine is just like
 447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
 448 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 449 point exponent.
 450 -------------------------------------------------------------------------------
 451 */
 452 static float64
 453  normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 454 {
 455     int8 shiftCount;
 456
 457     shiftCount = countLeadingZeros64( zSig ) - 1;
 458     return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 459
 460 }
 461
 462 #ifdef FLOATX80
 463
 464 /*
 465 -------------------------------------------------------------------------------
 466 Returns the fraction bits of the extended double-precision floating-point
 467 value `a'.
 468 -------------------------------------------------------------------------------
 469 */
 470 INLINE bits64 extractFloatx80Frac( floatx80 a )
 471 {
 472
 473     return a.low;
 474
 475 }
 476
 477 /*
 478 -------------------------------------------------------------------------------
 479 Returns the exponent bits of the extended double-precision floating-point
 480 value `a'.
 481 -------------------------------------------------------------------------------
 482 */
 483 INLINE int32 extractFloatx80Exp( floatx80 a )
 484 {
 485
 486     return a.high & 0x7FFF;
 487
 488 }
 489
 490 /*
 491 -------------------------------------------------------------------------------
 492 Returns the sign bit of the extended double-precision floating-point value
 493 `a'.
 494 -------------------------------------------------------------------------------
 495 */
 496 INLINE flag extractFloatx80Sign( floatx80 a )
 497 {
 498
 499     return a.high>>15;
 500
 501 }
 502
 503 /*
 504 -------------------------------------------------------------------------------
 505 Normalizes the subnormal extended double-precision floating-point value
 506 represented by the denormalized significand `aSig'.  The normalized exponent
 507 and significand are stored at the locations pointed to by `zExpPtr' and
 508 `zSigPtr', respectively.
 509 -------------------------------------------------------------------------------
 510 */
 511 static void
 512  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 513 {
 514     int8 shiftCount;
 515
 516     shiftCount = countLeadingZeros64( aSig );
 517     *zSigPtr = aSig<<shiftCount;
 518     *zExpPtr = 1 - shiftCount;
 519
 520 }
 521
 522 /*
 523 -------------------------------------------------------------------------------
 524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 525 extended double-precision floating-point value, returning the result.
 526 -------------------------------------------------------------------------------
 527 */
 528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 529 {
 530     floatx80 z;
 531
 532     z.low = zSig;
 533     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 534     return z;
 535
 536 }
 537
 538 /*
 539 -------------------------------------------------------------------------------
 540 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 541 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 542 and returns the proper extended double-precision floating-point value
 543 corresponding to the abstract input.  Ordinarily, the abstract value is
 544 rounded and packed into the extended double-precision format, with the
 545 inexact exception raised if the abstract input cannot be represented
 546 exactly.  If the abstract value is too large, however, the overflow and
 547 inexact exceptions are raised and an infinity or maximal finite value is
 548 returned.  If the abstract value is too small, the input value is rounded to
 549 a subnormal number, and the underflow and inexact exceptions are raised if
 550 the abstract input cannot be represented exactly as a subnormal extended
 551 double-precision floating-point number.
 552     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 553 number of bits as single or double precision, respectively.  Otherwise, the
 554 result is rounded to the full precision of the extended double-precision
 555 format.
 556     The input significand must be normalized or smaller.  If the input
 557 significand is not normalized, `zExp' must be 0; in that case, the result
 558 returned is a subnormal number, and it must not require rounding.  The
 559 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 560 Floating-point Arithmetic.
 561 -------------------------------------------------------------------------------
 562 */
 563 static floatx80
 564  roundAndPackFloatx80(
 565      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 566  )
 567 {
 568     int8 roundingMode, roundingPrecision;
 569     flag roundNearestEven, increment, isTiny;
 570     int64 roundIncrement, roundMask, roundBits;
 571
 572     roundingMode = roundData->mode;
 573     roundingPrecision = roundData->precision;
 574     roundNearestEven = ( roundingMode == float_round_nearest_even );
 575     if ( roundingPrecision == 80 ) goto precision80;
 576     if ( roundingPrecision == 64 ) {
 577         roundIncrement = LIT64( 0x0000000000000400 );
 578         roundMask = LIT64( 0x00000000000007FF );
 579     }
 580     else if ( roundingPrecision == 32 ) {
 581         roundIncrement = LIT64( 0x0000008000000000 );
 582         roundMask = LIT64( 0x000000FFFFFFFFFF );
 583     }
 584     else {
 585         goto precision80;
 586     }
 587     zSig0 |= ( zSig1 != 0 );
 588     if ( ! roundNearestEven ) {
 589         if ( roundingMode == float_round_to_zero ) {
 590             roundIncrement = 0;
 591         }
 592         else {
 593             roundIncrement = roundMask;
 594             if ( zSign ) {
 595                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 596             }
 597             else {
 598                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 599             }
 600         }
 601     }
 602     roundBits = zSig0 & roundMask;
 603     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 604         if (    ( 0x7FFE < zExp )
 605              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 606            ) {
 607             goto overflow;
 608         }
 609         if ( zExp <= 0 ) {
 610             isTiny =
 611                    ( float_detect_tininess == float_tininess_before_rounding )
 612                 || ( zExp < 0 )
 613                 || ( zSig0 <= zSig0 + roundIncrement );
 614             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 615             zExp = 0;
 616             roundBits = zSig0 & roundMask;
 617             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 618             if ( roundBits ) roundData->exception |= float_flag_inexact;
 619             zSig0 += roundIncrement;
 620             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 621             roundIncrement = roundMask + 1;
 622             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 623                 roundMask |= roundIncrement;
 624             }
 625             zSig0 &= ~ roundMask;
 626             return packFloatx80( zSign, zExp, zSig0 );
 627         }
 628     }
 629     if ( roundBits ) roundData->exception |= float_flag_inexact;
 630     zSig0 += roundIncrement;
 631     if ( zSig0 < roundIncrement ) {
 632         ++zExp;
 633         zSig0 = LIT64( 0x8000000000000000 );
 634     }
 635     roundIncrement = roundMask + 1;
 636     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 637         roundMask |= roundIncrement;
 638     }
 639     zSig0 &= ~ roundMask;
 640     if ( zSig0 == 0 ) zExp = 0;
 641     return packFloatx80( zSign, zExp, zSig0 );
 642  precision80:
 643     increment = ( (sbits64) zSig1 < 0 );
 644     if ( ! roundNearestEven ) {
 645         if ( roundingMode == float_round_to_zero ) {
 646             increment = 0;
 647         }
 648         else {
 649             if ( zSign ) {
 650                 increment = ( roundingMode == float_round_down ) && zSig1;
 651             }
 652             else {
 653                 increment = ( roundingMode == float_round_up ) && zSig1;
 654             }
 655         }
 656     }
 657     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 658         if (    ( 0x7FFE < zExp )
 659              || (    ( zExp == 0x7FFE )
 660                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 661                   && increment
 662                 )
 663            ) {
 664             roundMask = 0;
 665  overflow:
 666             roundData->exception |= float_flag_overflow | float_flag_inexact;
 667             if (    ( roundingMode == float_round_to_zero )
 668                  || ( zSign && ( roundingMode == float_round_up ) )
 669                  || ( ! zSign && ( roundingMode == float_round_down ) )
 670                ) {
 671                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 672             }
 673             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 674         }
 675         if ( zExp <= 0 ) {
 676             isTiny =
 677                    ( float_detect_tininess == float_tininess_before_rounding )
 678                 || ( zExp < 0 )
 679                 || ! increment
 680                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 681             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 682             zExp = 0;
 683             if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
 684             if ( zSig1 ) roundData->exception |= float_flag_inexact;
 685             if ( roundNearestEven ) {
 686                 increment = ( (sbits64) zSig1 < 0 );
 687             }
 688             else {
 689                 if ( zSign ) {
 690                     increment = ( roundingMode == float_round_down ) && zSig1;
 691                 }
 692                 else {
 693                     increment = ( roundingMode == float_round_up ) && zSig1;
 694                 }
 695             }
 696             if ( increment ) {
 697                 ++zSig0;
 698                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 699                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 700             }
 701             return packFloatx80( zSign, zExp, zSig0 );
 702         }
 703     }
 704     if ( zSig1 ) roundData->exception |= float_flag_inexact;
 705     if ( increment ) {
 706         ++zSig0;
 707         if ( zSig0 == 0 ) {
 708             ++zExp;
 709             zSig0 = LIT64( 0x8000000000000000 );
 710         }
 711         else {
 712             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 713         }
 714     }
 715     else {
 716         if ( zSig0 == 0 ) zExp = 0;
 717     }
 718
 719     return packFloatx80( zSign, zExp, zSig0 );
 720 }
 721
 722 /*
 723 -------------------------------------------------------------------------------
 724 Takes an abstract floating-point value having sign `zSign', exponent
 725 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 726 and returns the proper extended double-precision floating-point value
 727 corresponding to the abstract input.  This routine is just like
 728 `roundAndPackFloatx80' except that the input significand does not have to be
 729 normalized.
 730 -------------------------------------------------------------------------------
 731 */
 732 static floatx80
 733  normalizeRoundAndPackFloatx80(
 734      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 735  )
 736 {
 737     int8 shiftCount;
 738
 739     if ( zSig0 == 0 ) {
 740         zSig0 = zSig1;
 741         zSig1 = 0;
 742         zExp -= 64;
 743     }
 744     shiftCount = countLeadingZeros64( zSig0 );
 745     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 746     zExp -= shiftCount;
 747     return
 748         roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
 749
 750 }
 751
 752 #endif
 753
 754 /*
 755 -------------------------------------------------------------------------------
 756 Returns the result of converting the 32-bit two's complement integer `a' to
 757 the single-precision floating-point format.  The conversion is performed
 758 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 759 -------------------------------------------------------------------------------
 760 */
 761 float32 int32_to_float32(struct roundingData *roundData, int32 a)
 762 {
 763     flag zSign;
 764
 765     if ( a == 0 ) return 0;
 766     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 767     zSign = ( a < 0 );
 768     return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
 769
 770 }
 771
 772 /*
 773 -------------------------------------------------------------------------------
 774 Returns the result of converting the 32-bit two's complement integer `a' to
 775 the double-precision floating-point format.  The conversion is performed
 776 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 777 -------------------------------------------------------------------------------
 778 */
 779 float64 int32_to_float64( int32 a )
 780 {
 781     flag aSign;
 782     uint32 absA;
 783     int8 shiftCount;
 784     bits64 zSig;
 785
 786     if ( a == 0 ) return 0;
 787     aSign = ( a < 0 );
 788     absA = aSign ? - a : a;
 789     shiftCount = countLeadingZeros32( absA ) + 21;
 790     zSig = absA;
 791     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
 792
 793 }
 794
 795 #ifdef FLOATX80
 796
 797 /*
 798 -------------------------------------------------------------------------------
 799 Returns the result of converting the 32-bit two's complement integer `a'
 800 to the extended double-precision floating-point format.  The conversion
 801 is performed according to the IEC/IEEE Standard for Binary Floating-point
 802 Arithmetic.
 803 -------------------------------------------------------------------------------
 804 */
 805 floatx80 int32_to_floatx80( int32 a )
 806 {
 807     flag zSign;
 808     uint32 absA;
 809     int8 shiftCount;
 810     bits64 zSig;
 811
 812     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 813     zSign = ( a < 0 );
 814     absA = zSign ? - a : a;
 815     shiftCount = countLeadingZeros32( absA ) + 32;
 816     zSig = absA;
 817     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 818
 819 }
 820
 821 #endif
 822
 823 /*
 824 -------------------------------------------------------------------------------
 825 Returns the result of converting the single-precision floating-point value
 826 `a' to the 32-bit two's complement integer format.  The conversion is
 827 performed according to the IEC/IEEE Standard for Binary Floating-point
 828 Arithmetic---which means in particular that the conversion is rounded
 829 according to the current rounding mode.  If `a' is a NaN, the largest
 830 positive integer is returned.  Otherwise, if the conversion overflows, the
 831 largest integer with the same sign as `a' is returned.
 832 -------------------------------------------------------------------------------
 833 */
 834 int32 float32_to_int32( struct roundingData *roundData, float32 a )
 835 {
 836     flag aSign;
 837     int16 aExp, shiftCount;
 838     bits32 aSig;
 839     bits64 zSig;
 840
 841     aSig = extractFloat32Frac( a );
 842     aExp = extractFloat32Exp( a );
 843     aSign = extractFloat32Sign( a );
 844     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 845     if ( aExp ) aSig |= 0x00800000;
 846     shiftCount = 0xAF - aExp;
 847     zSig = aSig;
 848     zSig <<= 32;
 849     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
 850     return roundAndPackInt32( roundData, aSign, zSig );
 851
 852 }
 853
 854 /*
 855 -------------------------------------------------------------------------------
 856 Returns the result of converting the single-precision floating-point value
 857 `a' to the 32-bit two's complement integer format.  The conversion is
 858 performed according to the IEC/IEEE Standard for Binary Floating-point
 859 Arithmetic, except that the conversion is always rounded toward zero.  If
 860 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 861 conversion overflows, the largest integer with the same sign as `a' is
 862 returned.
 863 -------------------------------------------------------------------------------
 864 */
 865 int32 float32_to_int32_round_to_zero( float32 a )
 866 {
 867     flag aSign;
 868     int16 aExp, shiftCount;
 869     bits32 aSig;
 870     int32 z;
 871
 872     aSig = extractFloat32Frac( a );
 873     aExp = extractFloat32Exp( a );
 874     aSign = extractFloat32Sign( a );
 875     shiftCount = aExp - 0x9E;
 876     if ( 0 <= shiftCount ) {
 877         if ( a == 0xCF000000 ) return 0x80000000;
 878         float_raise( float_flag_invalid );
 879         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 880         return 0x80000000;
 881     }
 882     else if ( aExp <= 0x7E ) {
 883         if ( aExp | aSig ) float_raise( float_flag_inexact );
 884         return 0;
 885     }
 886     aSig = ( aSig | 0x00800000 )<<8;
 887     z = aSig>>( - shiftCount );
 888     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 889         float_raise( float_flag_inexact );
 890     }
 891     return aSign ? - z : z;
 892
 893 }
 894
 895 /*
 896 -------------------------------------------------------------------------------
 897 Returns the result of converting the single-precision floating-point value
 898 `a' to the double-precision floating-point format.  The conversion is
 899 performed according to the IEC/IEEE Standard for Binary Floating-point
 900 Arithmetic.
 901 -------------------------------------------------------------------------------
 902 */
 903 float64 float32_to_float64( float32 a )
 904 {
 905     flag aSign;
 906     int16 aExp;
 907     bits32 aSig;
 908
 909     aSig = extractFloat32Frac( a );
 910     aExp = extractFloat32Exp( a );
 911     aSign = extractFloat32Sign( a );
 912     if ( aExp == 0xFF ) {
 913         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 914         return packFloat64( aSign, 0x7FF, 0 );
 915     }
 916     if ( aExp == 0 ) {
 917         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 918         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 919         --aExp;
 920     }
 921     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 922
 923 }
 924
 925 #ifdef FLOATX80
 926
 927 /*
 928 -------------------------------------------------------------------------------
 929 Returns the result of converting the single-precision floating-point value
 930 `a' to the extended double-precision floating-point format.  The conversion
 931 is performed according to the IEC/IEEE Standard for Binary Floating-point
 932 Arithmetic.
 933 -------------------------------------------------------------------------------
 934 */
 935 floatx80 float32_to_floatx80( float32 a )
 936 {
 937     flag aSign;
 938     int16 aExp;
 939     bits32 aSig;
 940
 941     aSig = extractFloat32Frac( a );
 942     aExp = extractFloat32Exp( a );
 943     aSign = extractFloat32Sign( a );
 944     if ( aExp == 0xFF ) {
 945         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 946         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 947     }
 948     if ( aExp == 0 ) {
 949         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 950         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 951     }
 952     aSig |= 0x00800000;
 953     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 954
 955 }
 956
 957 #endif
 958
 959 /*
 960 -------------------------------------------------------------------------------
 961 Rounds the single-precision floating-point value `a' to an integer, and
 962 returns the result as a single-precision floating-point value.  The
 963 operation is performed according to the IEC/IEEE Standard for Binary
 964 Floating-point Arithmetic.
 965 -------------------------------------------------------------------------------
 966 */
 967 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
 968 {
 969     flag aSign;
 970     int16 aExp;
 971     bits32 lastBitMask, roundBitsMask;
 972     int8 roundingMode;
 973     float32 z;
 974
 975     aExp = extractFloat32Exp( a );
 976     if ( 0x96 <= aExp ) {
 977         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 978             return propagateFloat32NaN( a, a );
 979         }
 980         return a;
 981     }
 982     roundingMode = roundData->mode;
 983     if ( aExp <= 0x7E ) {
 984         if ( (bits32) ( a<<1 ) == 0 ) return a;
 985         roundData->exception |= float_flag_inexact;
 986         aSign = extractFloat32Sign( a );
 987         switch ( roundingMode ) {
 988          case float_round_nearest_even:
 989             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 990                 return packFloat32( aSign, 0x7F, 0 );
 991             }
 992             break;
 993          case float_round_down:
 994             return aSign ? 0xBF800000 : 0;
 995          case float_round_up:
 996             return aSign ? 0x80000000 : 0x3F800000;
 997         }
 998         return packFloat32( aSign, 0, 0 );
 999     }
1000     lastBitMask = 1;
1001     lastBitMask <<= 0x96 - aExp;
1002     roundBitsMask = lastBitMask - 1;
1003     z = a;
1004     if ( roundingMode == float_round_nearest_even ) {
1005         z += lastBitMask>>1;
1006         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1007     }
1008     else if ( roundingMode != float_round_to_zero ) {
1009         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1010             z += roundBitsMask;
1011         }
1012     }
1013     z &= ~ roundBitsMask;
1014     if ( z != a ) roundData->exception |= float_flag_inexact;
1015     return z;
1016
1017 }
1018
1019 /*
1020 -------------------------------------------------------------------------------
1021 Returns the result of adding the absolute values of the single-precision
1022 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1023 before being returned.  `zSign' is ignored if the result is a NaN.  The
1024 addition is performed according to the IEC/IEEE Standard for Binary
1025 Floating-point Arithmetic.
1026 -------------------------------------------------------------------------------
1027 */
1028 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1029 {
1030     int16 aExp, bExp, zExp;
1031     bits32 aSig, bSig, zSig;
1032     int16 expDiff;
1033
1034     aSig = extractFloat32Frac( a );
1035     aExp = extractFloat32Exp( a );
1036     bSig = extractFloat32Frac( b );
1037     bExp = extractFloat32Exp( b );
1038     expDiff = aExp - bExp;
1039     aSig <<= 6;
1040     bSig <<= 6;
1041     if ( 0 < expDiff ) {
1042         if ( aExp == 0xFF ) {
1043             if ( aSig ) return propagateFloat32NaN( a, b );
1044             return a;
1045         }
1046         if ( bExp == 0 ) {
1047             --expDiff;
1048         }
1049         else {
1050             bSig |= 0x20000000;
1051         }
1052         shift32RightJamming( bSig, expDiff, &bSig );
1053         zExp = aExp;
1054     }
1055     else if ( expDiff < 0 ) {
1056         if ( bExp == 0xFF ) {
1057             if ( bSig ) return propagateFloat32NaN( a, b );
1058             return packFloat32( zSign, 0xFF, 0 );
1059         }
1060         if ( aExp == 0 ) {
1061             ++expDiff;
1062         }
1063         else {
1064             aSig |= 0x20000000;
1065         }
1066         shift32RightJamming( aSig, - expDiff, &aSig );
1067         zExp = bExp;
1068     }
1069     else {
1070         if ( aExp == 0xFF ) {
1071             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1072             return a;
1073         }
1074         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1075         zSig = 0x40000000 + aSig + bSig;
1076         zExp = aExp;
1077         goto roundAndPack;
1078     }
1079     aSig |= 0x20000000;
1080     zSig = ( aSig + bSig )<<1;
1081     --zExp;
1082     if ( (sbits32) zSig < 0 ) {
1083         zSig = aSig + bSig;
1084         ++zExp;
1085     }
1086  roundAndPack:
1087     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1088
1089 }
1090
1091 /*
1092 -------------------------------------------------------------------------------
1093 Returns the result of subtracting the absolute values of the single-
1094 precision floating-point values `a' and `b'.  If `zSign' is true, the
1095 difference is negated before being returned.  `zSign' is ignored if the
1096 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1097 Standard for Binary Floating-point Arithmetic.
1098 -------------------------------------------------------------------------------
1099 */
1100 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1101 {
1102     int16 aExp, bExp, zExp;
1103     bits32 aSig, bSig, zSig;
1104     int16 expDiff;
1105
1106     aSig = extractFloat32Frac( a );
1107     aExp = extractFloat32Exp( a );
1108     bSig = extractFloat32Frac( b );
1109     bExp = extractFloat32Exp( b );
1110     expDiff = aExp - bExp;
1111     aSig <<= 7;
1112     bSig <<= 7;
1113     if ( 0 < expDiff ) goto aExpBigger;
1114     if ( expDiff < 0 ) goto bExpBigger;
1115     if ( aExp == 0xFF ) {
1116         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1117         roundData->exception |= float_flag_invalid;
1118         return float32_default_nan;
1119     }
1120     if ( aExp == 0 ) {
1121         aExp = 1;
1122         bExp = 1;
1123     }
1124     if ( bSig < aSig ) goto aBigger;
1125     if ( aSig < bSig ) goto bBigger;
1126     return packFloat32( roundData->mode == float_round_down, 0, 0 );
1127  bExpBigger:
1128     if ( bExp == 0xFF ) {
1129         if ( bSig ) return propagateFloat32NaN( a, b );
1130         return packFloat32( zSign ^ 1, 0xFF, 0 );
1131     }
1132     if ( aExp == 0 ) {
1133         ++expDiff;
1134     }
1135     else {
1136         aSig |= 0x40000000;
1137     }
1138     shift32RightJamming( aSig, - expDiff, &aSig );
1139     bSig |= 0x40000000;
1140  bBigger:
1141     zSig = bSig - aSig;
1142     zExp = bExp;
1143     zSign ^= 1;
1144     goto normalizeRoundAndPack;
1145  aExpBigger:
1146     if ( aExp == 0xFF ) {
1147         if ( aSig ) return propagateFloat32NaN( a, b );
1148         return a;
1149     }
1150     if ( bExp == 0 ) {
1151         --expDiff;
1152     }
1153     else {
1154         bSig |= 0x40000000;
1155     }
1156     shift32RightJamming( bSig, expDiff, &bSig );
1157     aSig |= 0x40000000;
1158  aBigger:
1159     zSig = aSig - bSig;
1160     zExp = aExp;
1161  normalizeRoundAndPack:
1162     --zExp;
1163     return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1164
1165 }
1166
1167 /*
1168 -------------------------------------------------------------------------------
1169 Returns the result of adding the single-precision floating-point values `a'
1170 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1171 Binary Floating-point Arithmetic.
1172 -------------------------------------------------------------------------------
1173 */
1174 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1175 {
1176     flag aSign, bSign;
1177
1178     aSign = extractFloat32Sign( a );
1179     bSign = extractFloat32Sign( b );
1180     if ( aSign == bSign ) {
1181         return addFloat32Sigs( roundData, a, b, aSign );
1182     }
1183     else {
1184         return subFloat32Sigs( roundData, a, b, aSign );
1185     }
1186
1187 }
1188
1189 /*
1190 -------------------------------------------------------------------------------
1191 Returns the result of subtracting the single-precision floating-point values
1192 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1193 for Binary Floating-point Arithmetic.
1194 -------------------------------------------------------------------------------
1195 */
1196 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1197 {
1198     flag aSign, bSign;
1199
1200     aSign = extractFloat32Sign( a );
1201     bSign = extractFloat32Sign( b );
1202     if ( aSign == bSign ) {
1203         return subFloat32Sigs( roundData, a, b, aSign );
1204     }
1205     else {
1206         return addFloat32Sigs( roundData, a, b, aSign );
1207     }
1208
1209 }
1210
1211 /*
1212 -------------------------------------------------------------------------------
1213 Returns the result of multiplying the single-precision floating-point values
1214 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1215 for Binary Floating-point Arithmetic.
1216 -------------------------------------------------------------------------------
1217 */
1218 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1219 {
1220     flag aSign, bSign, zSign;
1221     int16 aExp, bExp, zExp;
1222     bits32 aSig, bSig;
1223     bits64 zSig64;
1224     bits32 zSig;
1225
1226     aSig = extractFloat32Frac( a );
1227     aExp = extractFloat32Exp( a );
1228     aSign = extractFloat32Sign( a );
1229     bSig = extractFloat32Frac( b );
1230     bExp = extractFloat32Exp( b );
1231     bSign = extractFloat32Sign( b );
1232     zSign = aSign ^ bSign;
1233     if ( aExp == 0xFF ) {
1234         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1235             return propagateFloat32NaN( a, b );
1236         }
1237         if ( ( bExp | bSig ) == 0 ) {
1238             roundData->exception |= float_flag_invalid;
1239             return float32_default_nan;
1240         }
1241         return packFloat32( zSign, 0xFF, 0 );
1242     }
1243     if ( bExp == 0xFF ) {
1244         if ( bSig ) return propagateFloat32NaN( a, b );
1245         if ( ( aExp | aSig ) == 0 ) {
1246             roundData->exception |= float_flag_invalid;
1247             return float32_default_nan;
1248         }
1249         return packFloat32( zSign, 0xFF, 0 );
1250     }
1251     if ( aExp == 0 ) {
1252         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1253         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1254     }
1255     if ( bExp == 0 ) {
1256         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1257         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1258     }
1259     zExp = aExp + bExp - 0x7F;
1260     aSig = ( aSig | 0x00800000 )<<7;
1261     bSig = ( bSig | 0x00800000 )<<8;
1262     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1263     zSig = zSig64;
1264     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1265         zSig <<= 1;
1266         --zExp;
1267     }
1268     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1269
1270 }
1271
1272 /*
1273 -------------------------------------------------------------------------------
1274 Returns the result of dividing the single-precision floating-point value `a'
1275 by the corresponding value `b'.  The operation is performed according to the
1276 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1277 -------------------------------------------------------------------------------
1278 */
1279 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1280 {
1281     flag aSign, bSign, zSign;
1282     int16 aExp, bExp, zExp;
1283     bits32 aSig, bSig, zSig;
1284
1285     aSig = extractFloat32Frac( a );
1286     aExp = extractFloat32Exp( a );
1287     aSign = extractFloat32Sign( a );
1288     bSig = extractFloat32Frac( b );
1289     bExp = extractFloat32Exp( b );
1290     bSign = extractFloat32Sign( b );
1291     zSign = aSign ^ bSign;
1292     if ( aExp == 0xFF ) {
1293         if ( aSig ) return propagateFloat32NaN( a, b );
1294         if ( bExp == 0xFF ) {
1295             if ( bSig ) return propagateFloat32NaN( a, b );
1296             roundData->exception |= float_flag_invalid;
1297             return float32_default_nan;
1298         }
1299         return packFloat32( zSign, 0xFF, 0 );
1300     }
1301     if ( bExp == 0xFF ) {
1302         if ( bSig ) return propagateFloat32NaN( a, b );
1303         return packFloat32( zSign, 0, 0 );
1304     }
1305     if ( bExp == 0 ) {
1306         if ( bSig == 0 ) {
1307             if ( ( aExp | aSig ) == 0 ) {
1308                 roundData->exception |= float_flag_invalid;
1309                 return float32_default_nan;
1310             }
1311             roundData->exception |= float_flag_divbyzero;
1312             return packFloat32( zSign, 0xFF, 0 );
1313         }
1314         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1315     }
1316     if ( aExp == 0 ) {
1317         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1318         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1319     }
1320     zExp = aExp - bExp + 0x7D;
1321     aSig = ( aSig | 0x00800000 )<<7;
1322     bSig = ( bSig | 0x00800000 )<<8;
1323     if ( bSig <= ( aSig + aSig ) ) {
1324         aSig >>= 1;
1325         ++zExp;
1326     }
1327     {
1328         bits64 tmp = ( (bits64) aSig )<<32;
1329         do_div( tmp, bSig );
1330         zSig = tmp;
1331     }
1332     if ( ( zSig & 0x3F ) == 0 ) {
1333         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1334     }
1335     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1336
1337 }
1338
1339 /*
1340 -------------------------------------------------------------------------------
1341 Returns the remainder of the single-precision floating-point value `a'
1342 with respect to the corresponding value `b'.  The operation is performed
1343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1344 -------------------------------------------------------------------------------
1345 */
1346 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1347 {
1348     flag aSign, bSign, zSign;
1349     int16 aExp, bExp, expDiff;
1350     bits32 aSig, bSig;
1351     bits32 q;
1352     bits64 aSig64, bSig64, q64;
1353     bits32 alternateASig;
1354     sbits32 sigMean;
1355
1356     aSig = extractFloat32Frac( a );
1357     aExp = extractFloat32Exp( a );
1358     aSign = extractFloat32Sign( a );
1359     bSig = extractFloat32Frac( b );
1360     bExp = extractFloat32Exp( b );
1361     bSign = extractFloat32Sign( b );
1362     if ( aExp == 0xFF ) {
1363         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1364             return propagateFloat32NaN( a, b );
1365         }
1366         roundData->exception |= float_flag_invalid;
1367         return float32_default_nan;
1368     }
1369     if ( bExp == 0xFF ) {
1370         if ( bSig ) return propagateFloat32NaN( a, b );
1371         return a;
1372     }
1373     if ( bExp == 0 ) {
1374         if ( bSig == 0 ) {
1375             roundData->exception |= float_flag_invalid;
1376             return float32_default_nan;
1377         }
1378         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1379     }
1380     if ( aExp == 0 ) {
1381         if ( aSig == 0 ) return a;
1382         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1383     }
1384     expDiff = aExp - bExp;
1385     aSig |= 0x00800000;
1386     bSig |= 0x00800000;
1387     if ( expDiff < 32 ) {
1388         aSig <<= 8;
1389         bSig <<= 8;
1390         if ( expDiff < 0 ) {
1391             if ( expDiff < -1 ) return a;
1392             aSig >>= 1;
1393         }
1394         q = ( bSig <= aSig );
1395         if ( q ) aSig -= bSig;
1396         if ( 0 < expDiff ) {
1397             bits64 tmp = ( (bits64) aSig )<<32;
1398             do_div( tmp, bSig );
1399             q = tmp;
1400             q >>= 32 - expDiff;
1401             bSig >>= 2;
1402             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1403         }
1404         else {
1405             aSig >>= 2;
1406             bSig >>= 2;
1407         }
1408     }
1409     else {
1410         if ( bSig <= aSig ) aSig -= bSig;
1411         aSig64 = ( (bits64) aSig )<<40;
1412         bSig64 = ( (bits64) bSig )<<40;
1413         expDiff -= 64;
1414         while ( 0 < expDiff ) {
1415             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1416             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1417             aSig64 = - ( ( bSig * q64 )<<38 );
1418             expDiff -= 62;
1419         }
1420         expDiff += 64;
1421         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1422         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1423         q = q64>>( 64 - expDiff );
1424         bSig <<= 6;
1425         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1426     }
1427     do {
1428         alternateASig = aSig;
1429         ++q;
1430         aSig -= bSig;
1431     } while ( 0 <= (sbits32) aSig );
1432     sigMean = aSig + alternateASig;
1433     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1434         aSig = alternateASig;
1435     }
1436     zSign = ( (sbits32) aSig < 0 );
1437     if ( zSign ) aSig = - aSig;
1438     return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1439
1440 }
1441
1442 /*
1443 -------------------------------------------------------------------------------
1444 Returns the square root of the single-precision floating-point value `a'.
1445 The operation is performed according to the IEC/IEEE Standard for Binary
1446 Floating-point Arithmetic.
1447 -------------------------------------------------------------------------------
1448 */
1449 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1450 {
1451     flag aSign;
1452     int16 aExp, zExp;
1453     bits32 aSig, zSig;
1454     bits64 rem, term;
1455
1456     aSig = extractFloat32Frac( a );
1457     aExp = extractFloat32Exp( a );
1458     aSign = extractFloat32Sign( a );
1459     if ( aExp == 0xFF ) {
1460         if ( aSig ) return propagateFloat32NaN( a, 0 );
1461         if ( ! aSign ) return a;
1462         roundData->exception |= float_flag_invalid;
1463         return float32_default_nan;
1464     }
1465     if ( aSign ) {
1466         if ( ( aExp | aSig ) == 0 ) return a;
1467         roundData->exception |= float_flag_invalid;
1468         return float32_default_nan;
1469     }
1470     if ( aExp == 0 ) {
1471         if ( aSig == 0 ) return 0;
1472         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1473     }
1474     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1475     aSig = ( aSig | 0x00800000 )<<8;
1476     zSig = estimateSqrt32( aExp, aSig ) + 2;
1477     if ( ( zSig & 0x7F ) <= 5 ) {
1478         if ( zSig < 2 ) {
1479             zSig = 0xFFFFFFFF;
1480         }
1481         else {
1482             aSig >>= aExp & 1;
1483             term = ( (bits64) zSig ) * zSig;
1484             rem = ( ( (bits64) aSig )<<32 ) - term;
1485             while ( (sbits64) rem < 0 ) {
1486                 --zSig;
1487                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1488             }
1489             zSig |= ( rem != 0 );
1490         }
1491     }
1492     shift32RightJamming( zSig, 1, &zSig );
1493     return roundAndPackFloat32( roundData, 0, zExp, zSig );
1494
1495 }
1496
1497 /*
1498 -------------------------------------------------------------------------------
1499 Returns 1 if the single-precision floating-point value `a' is equal to the
1500 corresponding value `b', and 0 otherwise.  The comparison is performed
1501 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1502 -------------------------------------------------------------------------------
1503 */
1504 flag float32_eq( float32 a, float32 b )
1505 {
1506
1507     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1508          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1509        ) {
1510         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1511             float_raise( float_flag_invalid );
1512         }
1513         return 0;
1514     }
1515     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1516
1517 }
1518
1519 /*
1520 -------------------------------------------------------------------------------
1521 Returns 1 if the single-precision floating-point value `a' is less than or
1522 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1523 performed according to the IEC/IEEE Standard for Binary Floating-point
1524 Arithmetic.
1525 -------------------------------------------------------------------------------
1526 */
1527 flag float32_le( float32 a, float32 b )
1528 {
1529     flag aSign, bSign;
1530
1531     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1532          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1533        ) {
1534         float_raise( float_flag_invalid );
1535         return 0;
1536     }
1537     aSign = extractFloat32Sign( a );
1538     bSign = extractFloat32Sign( b );
1539     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1540     return ( a == b ) || ( aSign ^ ( a < b ) );
1541
1542 }
1543
1544 /*
1545 -------------------------------------------------------------------------------
1546 Returns 1 if the single-precision floating-point value `a' is less than
1547 the corresponding value `b', and 0 otherwise.  The comparison is performed
1548 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1549 -------------------------------------------------------------------------------
1550 */
1551 flag float32_lt( float32 a, float32 b )
1552 {
1553     flag aSign, bSign;
1554
1555     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1556          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1557        ) {
1558         float_raise( float_flag_invalid );
1559         return 0;
1560     }
1561     aSign = extractFloat32Sign( a );
1562     bSign = extractFloat32Sign( b );
1563     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1564     return ( a != b ) && ( aSign ^ ( a < b ) );
1565
1566 }
1567
1568 /*
1569 -------------------------------------------------------------------------------
1570 Returns 1 if the single-precision floating-point value `a' is equal to the
1571 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1572 if either operand is a NaN.  Otherwise, the comparison is performed
1573 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1574 -------------------------------------------------------------------------------
1575 */
1576 flag float32_eq_signaling( float32 a, float32 b )
1577 {
1578
1579     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1580          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1581        ) {
1582         float_raise( float_flag_invalid );
1583         return 0;
1584     }
1585     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1586
1587 }
1588
1589 /*
1590 -------------------------------------------------------------------------------
1591 Returns 1 if the single-precision floating-point value `a' is less than or
1592 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1593 cause an exception.  Otherwise, the comparison is performed according to the
1594 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1595 -------------------------------------------------------------------------------
1596 */
1597 flag float32_le_quiet( float32 a, float32 b )
1598 {
1599     flag aSign, bSign;
1600     //int16 aExp, bExp;
1601
1602     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1603          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1604        ) {
1605         /* Do nothing, even if NaN as we're quiet */
1606         return 0;
1607     }
1608     aSign = extractFloat32Sign( a );
1609     bSign = extractFloat32Sign( b );
1610     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1611     return ( a == b ) || ( aSign ^ ( a < b ) );
1612
1613 }
1614
1615 /*
1616 -------------------------------------------------------------------------------
1617 Returns 1 if the single-precision floating-point value `a' is less than
1618 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1619 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1620 Standard for Binary Floating-point Arithmetic.
1621 -------------------------------------------------------------------------------
1622 */
1623 flag float32_lt_quiet( float32 a, float32 b )
1624 {
1625     flag aSign, bSign;
1626
1627     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1628          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1629        ) {
1630         /* Do nothing, even if NaN as we're quiet */
1631         return 0;
1632     }
1633     aSign = extractFloat32Sign( a );
1634     bSign = extractFloat32Sign( b );
1635     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1636     return ( a != b ) && ( aSign ^ ( a < b ) );
1637
1638 }
1639
1640 /*
1641 -------------------------------------------------------------------------------
1642 Returns the result of converting the double-precision floating-point value
1643 `a' to the 32-bit two's complement integer format.  The conversion is
1644 performed according to the IEC/IEEE Standard for Binary Floating-point
1645 Arithmetic---which means in particular that the conversion is rounded
1646 according to the current rounding mode.  If `a' is a NaN, the largest
1647 positive integer is returned.  Otherwise, if the conversion overflows, the
1648 largest integer with the same sign as `a' is returned.
1649 -------------------------------------------------------------------------------
1650 */
1651 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1652 {
1653     flag aSign;
1654     int16 aExp, shiftCount;
1655     bits64 aSig;
1656
1657     aSig = extractFloat64Frac( a );
1658     aExp = extractFloat64Exp( a );
1659     aSign = extractFloat64Sign( a );
1660     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1661     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1662     shiftCount = 0x42C - aExp;
1663     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1664     return roundAndPackInt32( roundData, aSign, aSig );
1665
1666 }
1667
1668 /*
1669 -------------------------------------------------------------------------------
1670 Returns the result of converting the double-precision floating-point value
1671 `a' to the 32-bit two's complement integer format.  The conversion is
1672 performed according to the IEC/IEEE Standard for Binary Floating-point
1673 Arithmetic, except that the conversion is always rounded toward zero.  If
1674 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1675 conversion overflows, the largest integer with the same sign as `a' is
1676 returned.
1677 -------------------------------------------------------------------------------
1678 */
1679 int32 float64_to_int32_round_to_zero( float64 a )
1680 {
1681     flag aSign;
1682     int16 aExp, shiftCount;
1683     bits64 aSig, savedASig;
1684     int32 z;
1685
1686     aSig = extractFloat64Frac( a );
1687     aExp = extractFloat64Exp( a );
1688     aSign = extractFloat64Sign( a );
1689     shiftCount = 0x433 - aExp;
1690     if ( shiftCount < 21 ) {
1691         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1692         goto invalid;
1693     }
1694     else if ( 52 < shiftCount ) {
1695         if ( aExp || aSig ) float_raise( float_flag_inexact );
1696         return 0;
1697     }
1698     aSig |= LIT64( 0x0010000000000000 );
1699     savedASig = aSig;
1700     aSig >>= shiftCount;
1701     z = aSig;
1702     if ( aSign ) z = - z;
1703     if ( ( z < 0 ) ^ aSign ) {
1704  invalid:
1705         float_raise( float_flag_invalid );
1706         return aSign ? 0x80000000 : 0x7FFFFFFF;
1707     }
1708     if ( ( aSig<<shiftCount ) != savedASig ) {
1709         float_raise( float_flag_inexact );
1710     }
1711     return z;
1712
1713 }
1714
1715 /*
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement unsigned integer format.  The conversion
1719 is performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic---which means in particular that the conversion is rounded
1721 according to the current rounding mode.  If `a' is a NaN, the largest
1722 positive integer is returned.  Otherwise, if the conversion overflows, the
1723 largest positive integer is returned.
1724 -------------------------------------------------------------------------------
1725 */
1726 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1727 {
1728     flag aSign;
1729     int16 aExp, shiftCount;
1730     bits64 aSig;
1731
1732     aSig = extractFloat64Frac( a );
1733     aExp = extractFloat64Exp( a );
1734     aSign = 0; //extractFloat64Sign( a );
1735     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1736     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1737     shiftCount = 0x42C - aExp;
1738     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1739     return roundAndPackInt32( roundData, aSign, aSig );
1740 }
1741
1742 /*
1743 -------------------------------------------------------------------------------
1744 Returns the result of converting the double-precision floating-point value
1745 `a' to the 32-bit two's complement integer format.  The conversion is
1746 performed according to the IEC/IEEE Standard for Binary Floating-point
1747 Arithmetic, except that the conversion is always rounded toward zero.  If
1748 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1749 conversion overflows, the largest positive integer is returned.
1750 -------------------------------------------------------------------------------
1751 */
1752 int32 float64_to_uint32_round_to_zero( float64 a )
1753 {
1754     flag aSign;
1755     int16 aExp, shiftCount;
1756     bits64 aSig, savedASig;
1757     int32 z;
1758
1759     aSig = extractFloat64Frac( a );
1760     aExp = extractFloat64Exp( a );
1761     aSign = extractFloat64Sign( a );
1762     shiftCount = 0x433 - aExp;
1763     if ( shiftCount < 21 ) {
1764         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1765         goto invalid;
1766     }
1767     else if ( 52 < shiftCount ) {
1768         if ( aExp || aSig ) float_raise( float_flag_inexact );
1769         return 0;
1770     }
1771     aSig |= LIT64( 0x0010000000000000 );
1772     savedASig = aSig;
1773     aSig >>= shiftCount;
1774     z = aSig;
1775     if ( aSign ) z = - z;
1776     if ( ( z < 0 ) ^ aSign ) {
1777  invalid:
1778         float_raise( float_flag_invalid );
1779         return aSign ? 0x80000000 : 0x7FFFFFFF;
1780     }
1781     if ( ( aSig<<shiftCount ) != savedASig ) {
1782         float_raise( float_flag_inexact );
1783     }
1784     return z;
1785 }
1786
1787 /*
1788 -------------------------------------------------------------------------------
1789 Returns the result of converting the double-precision floating-point value
1790 `a' to the single-precision floating-point format.  The conversion is
1791 performed according to the IEC/IEEE Standard for Binary Floating-point
1792 Arithmetic.
1793 -------------------------------------------------------------------------------
1794 */
1795 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1796 {
1797     flag aSign;
1798     int16 aExp;
1799     bits64 aSig;
1800     bits32 zSig;
1801
1802     aSig = extractFloat64Frac( a );
1803     aExp = extractFloat64Exp( a );
1804     aSign = extractFloat64Sign( a );
1805     if ( aExp == 0x7FF ) {
1806         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1807         return packFloat32( aSign, 0xFF, 0 );
1808     }
1809     shift64RightJamming( aSig, 22, &aSig );
1810     zSig = aSig;
1811     if ( aExp || zSig ) {
1812         zSig |= 0x40000000;
1813         aExp -= 0x381;
1814     }
1815     return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1816
1817 }
1818
1819 #ifdef FLOATX80
1820
1821 /*
1822 -------------------------------------------------------------------------------
1823 Returns the result of converting the double-precision floating-point value
1824 `a' to the extended double-precision floating-point format.  The conversion
1825 is performed according to the IEC/IEEE Standard for Binary Floating-point
1826 Arithmetic.
1827 -------------------------------------------------------------------------------
1828 */
1829 floatx80 float64_to_floatx80( float64 a )
1830 {
1831     flag aSign;
1832     int16 aExp;
1833     bits64 aSig;
1834
1835     aSig = extractFloat64Frac( a );
1836     aExp = extractFloat64Exp( a );
1837     aSign = extractFloat64Sign( a );
1838     if ( aExp == 0x7FF ) {
1839         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1840         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1841     }
1842     if ( aExp == 0 ) {
1843         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1844         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1845     }
1846     return
1847         packFloatx80(
1848             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1849
1850 }
1851
1852 #endif
1853
1854 /*
1855 -------------------------------------------------------------------------------
1856 Rounds the double-precision floating-point value `a' to an integer, and
1857 returns the result as a double-precision floating-point value.  The
1858 operation is performed according to the IEC/IEEE Standard for Binary
1859 Floating-point Arithmetic.
1860 -------------------------------------------------------------------------------
1861 */
1862 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1863 {
1864     flag aSign;
1865     int16 aExp;
1866     bits64 lastBitMask, roundBitsMask;
1867     int8 roundingMode;
1868     float64 z;
1869
1870     aExp = extractFloat64Exp( a );
1871     if ( 0x433 <= aExp ) {
1872         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1873             return propagateFloat64NaN( a, a );
1874         }
1875         return a;
1876     }
1877     if ( aExp <= 0x3FE ) {
1878         if ( (bits64) ( a<<1 ) == 0 ) return a;
1879         roundData->exception |= float_flag_inexact;
1880         aSign = extractFloat64Sign( a );
1881         switch ( roundData->mode ) {
1882          case float_round_nearest_even:
1883             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1884                 return packFloat64( aSign, 0x3FF, 0 );
1885             }
1886             break;
1887          case float_round_down:
1888             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1889          case float_round_up:
1890             return
1891             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1892         }
1893         return packFloat64( aSign, 0, 0 );
1894     }
1895     lastBitMask = 1;
1896     lastBitMask <<= 0x433 - aExp;
1897     roundBitsMask = lastBitMask - 1;
1898     z = a;
1899     roundingMode = roundData->mode;
1900     if ( roundingMode == float_round_nearest_even ) {
1901         z += lastBitMask>>1;
1902         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1903     }
1904     else if ( roundingMode != float_round_to_zero ) {
1905         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1906             z += roundBitsMask;
1907         }
1908     }
1909     z &= ~ roundBitsMask;
1910     if ( z != a ) roundData->exception |= float_flag_inexact;
1911     return z;
1912
1913 }
1914
1915 /*
1916 -------------------------------------------------------------------------------
1917 Returns the result of adding the absolute values of the double-precision
1918 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1919 before being returned.  `zSign' is ignored if the result is a NaN.  The
1920 addition is performed according to the IEC/IEEE Standard for Binary
1921 Floating-point Arithmetic.
1922 -------------------------------------------------------------------------------
1923 */
1924 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1925 {
1926     int16 aExp, bExp, zExp;
1927     bits64 aSig, bSig, zSig;
1928     int16 expDiff;
1929
1930     aSig = extractFloat64Frac( a );
1931     aExp = extractFloat64Exp( a );
1932     bSig = extractFloat64Frac( b );
1933     bExp = extractFloat64Exp( b );
1934     expDiff = aExp - bExp;
1935     aSig <<= 9;
1936     bSig <<= 9;
1937     if ( 0 < expDiff ) {
1938         if ( aExp == 0x7FF ) {
1939             if ( aSig ) return propagateFloat64NaN( a, b );
1940             return a;
1941         }
1942         if ( bExp == 0 ) {
1943             --expDiff;
1944         }
1945         else {
1946             bSig |= LIT64( 0x2000000000000000 );
1947         }
1948         shift64RightJamming( bSig, expDiff, &bSig );
1949         zExp = aExp;
1950     }
1951     else if ( expDiff < 0 ) {
1952         if ( bExp == 0x7FF ) {
1953             if ( bSig ) return propagateFloat64NaN( a, b );
1954             return packFloat64( zSign, 0x7FF, 0 );
1955         }
1956         if ( aExp == 0 ) {
1957             ++expDiff;
1958         }
1959         else {
1960             aSig |= LIT64( 0x2000000000000000 );
1961         }
1962         shift64RightJamming( aSig, - expDiff, &aSig );
1963         zExp = bExp;
1964     }
1965     else {
1966         if ( aExp == 0x7FF ) {
1967             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1968             return a;
1969         }
1970         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1971         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1972         zExp = aExp;
1973         goto roundAndPack;
1974     }
1975     aSig |= LIT64( 0x2000000000000000 );
1976     zSig = ( aSig + bSig )<<1;
1977     --zExp;
1978     if ( (sbits64) zSig < 0 ) {
1979         zSig = aSig + bSig;
1980         ++zExp;
1981     }
1982  roundAndPack:
1983     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1984
1985 }
1986
1987 /*
1988 -------------------------------------------------------------------------------
1989 Returns the result of subtracting the absolute values of the double-
1990 precision floating-point values `a' and `b'.  If `zSign' is true, the
1991 difference is negated before being returned.  `zSign' is ignored if the
1992 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1993 Standard for Binary Floating-point Arithmetic.
1994 -------------------------------------------------------------------------------
1995 */
1996 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1997 {
1998     int16 aExp, bExp, zExp;
1999     bits64 aSig, bSig, zSig;
2000     int16 expDiff;
2001
2002     aSig = extractFloat64Frac( a );
2003     aExp = extractFloat64Exp( a );
2004     bSig = extractFloat64Frac( b );
2005     bExp = extractFloat64Exp( b );
2006     expDiff = aExp - bExp;
2007     aSig <<= 10;
2008     bSig <<= 10;
2009     if ( 0 < expDiff ) goto aExpBigger;
2010     if ( expDiff < 0 ) goto bExpBigger;
2011     if ( aExp == 0x7FF ) {
2012         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2013         roundData->exception |= float_flag_invalid;
2014         return float64_default_nan;
2015     }
2016     if ( aExp == 0 ) {
2017         aExp = 1;
2018         bExp = 1;
2019     }
2020     if ( bSig < aSig ) goto aBigger;
2021     if ( aSig < bSig ) goto bBigger;
2022     return packFloat64( roundData->mode == float_round_down, 0, 0 );
2023  bExpBigger:
2024     if ( bExp == 0x7FF ) {
2025         if ( bSig ) return propagateFloat64NaN( a, b );
2026         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2027     }
2028     if ( aExp == 0 ) {
2029         ++expDiff;
2030     }
2031     else {
2032         aSig |= LIT64( 0x4000000000000000 );
2033     }
2034     shift64RightJamming( aSig, - expDiff, &aSig );
2035     bSig |= LIT64( 0x4000000000000000 );
2036  bBigger:
2037     zSig = bSig - aSig;
2038     zExp = bExp;
2039     zSign ^= 1;
2040     goto normalizeRoundAndPack;
2041  aExpBigger:
2042     if ( aExp == 0x7FF ) {
2043         if ( aSig ) return propagateFloat64NaN( a, b );
2044         return a;
2045     }
2046     if ( bExp == 0 ) {
2047         --expDiff;
2048     }
2049     else {
2050         bSig |= LIT64( 0x4000000000000000 );
2051     }
2052     shift64RightJamming( bSig, expDiff, &bSig );
2053     aSig |= LIT64( 0x4000000000000000 );
2054  aBigger:
2055     zSig = aSig - bSig;
2056     zExp = aExp;
2057  normalizeRoundAndPack:
2058     --zExp;
2059     return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2060
2061 }
2062
2063 /*
2064 -------------------------------------------------------------------------------
2065 Returns the result of adding the double-precision floating-point values `a'
2066 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2067 Binary Floating-point Arithmetic.
2068 -------------------------------------------------------------------------------
2069 */
2070 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2071 {
2072     flag aSign, bSign;
2073
2074     aSign = extractFloat64Sign( a );
2075     bSign = extractFloat64Sign( b );
2076     if ( aSign == bSign ) {
2077         return addFloat64Sigs( roundData, a, b, aSign );
2078     }
2079     else {
2080         return subFloat64Sigs( roundData, a, b, aSign );
2081     }
2082
2083 }
2084
2085 /*
2086 -------------------------------------------------------------------------------
2087 Returns the result of subtracting the double-precision floating-point values
2088 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2089 for Binary Floating-point Arithmetic.
2090 -------------------------------------------------------------------------------
2091 */
2092 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2093 {
2094     flag aSign, bSign;
2095
2096     aSign = extractFloat64Sign( a );
2097     bSign = extractFloat64Sign( b );
2098     if ( aSign == bSign ) {
2099         return subFloat64Sigs( roundData, a, b, aSign );
2100     }
2101     else {
2102         return addFloat64Sigs( roundData, a, b, aSign );
2103     }
2104
2105 }
2106
2107 /*
2108 -------------------------------------------------------------------------------
2109 Returns the result of multiplying the double-precision floating-point values
2110 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2111 for Binary Floating-point Arithmetic.
2112 -------------------------------------------------------------------------------
2113 */
2114 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2115 {
2116     flag aSign, bSign, zSign;
2117     int16 aExp, bExp, zExp;
2118     bits64 aSig, bSig, zSig0, zSig1;
2119
2120     aSig = extractFloat64Frac( a );
2121     aExp = extractFloat64Exp( a );
2122     aSign = extractFloat64Sign( a );
2123     bSig = extractFloat64Frac( b );
2124     bExp = extractFloat64Exp( b );
2125     bSign = extractFloat64Sign( b );
2126     zSign = aSign ^ bSign;
2127     if ( aExp == 0x7FF ) {
2128         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2129             return propagateFloat64NaN( a, b );
2130         }
2131         if ( ( bExp | bSig ) == 0 ) {
2132             roundData->exception |= float_flag_invalid;
2133             return float64_default_nan;
2134         }
2135         return packFloat64( zSign, 0x7FF, 0 );
2136     }
2137     if ( bExp == 0x7FF ) {
2138         if ( bSig ) return propagateFloat64NaN( a, b );
2139         if ( ( aExp | aSig ) == 0 ) {
2140             roundData->exception |= float_flag_invalid;
2141             return float64_default_nan;
2142         }
2143         return packFloat64( zSign, 0x7FF, 0 );
2144     }
2145     if ( aExp == 0 ) {
2146         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2147         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2148     }
2149     if ( bExp == 0 ) {
2150         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2151         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2152     }
2153     zExp = aExp + bExp - 0x3FF;
2154     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2155     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2156     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2157     zSig0 |= ( zSig1 != 0 );
2158     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2159         zSig0 <<= 1;
2160         --zExp;
2161     }
2162     return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2163
2164 }
2165
2166 /*
2167 -------------------------------------------------------------------------------
2168 Returns the result of dividing the double-precision floating-point value `a'
2169 by the corresponding value `b'.  The operation is performed according to
2170 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2171 -------------------------------------------------------------------------------
2172 */
2173 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2174 {
2175     flag aSign, bSign, zSign;
2176     int16 aExp, bExp, zExp;
2177     bits64 aSig, bSig, zSig;
2178     bits64 rem0, rem1;
2179     bits64 term0, term1;
2180
2181     aSig = extractFloat64Frac( a );
2182     aExp = extractFloat64Exp( a );
2183     aSign = extractFloat64Sign( a );
2184     bSig = extractFloat64Frac( b );
2185     bExp = extractFloat64Exp( b );
2186     bSign = extractFloat64Sign( b );
2187     zSign = aSign ^ bSign;
2188     if ( aExp == 0x7FF ) {
2189         if ( aSig ) return propagateFloat64NaN( a, b );
2190         if ( bExp == 0x7FF ) {
2191             if ( bSig ) return propagateFloat64NaN( a, b );
2192             roundData->exception |= float_flag_invalid;
2193             return float64_default_nan;
2194         }
2195         return packFloat64( zSign, 0x7FF, 0 );
2196     }
2197     if ( bExp == 0x7FF ) {
2198         if ( bSig ) return propagateFloat64NaN( a, b );
2199         return packFloat64( zSign, 0, 0 );
2200     }
2201     if ( bExp == 0 ) {
2202         if ( bSig == 0 ) {
2203             if ( ( aExp | aSig ) == 0 ) {
2204                 roundData->exception |= float_flag_invalid;
2205                 return float64_default_nan;
2206             }
2207             roundData->exception |= float_flag_divbyzero;
2208             return packFloat64( zSign, 0x7FF, 0 );
2209         }
2210         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2211     }
2212     if ( aExp == 0 ) {
2213         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2214         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2215     }
2216     zExp = aExp - bExp + 0x3FD;
2217     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2218     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2219     if ( bSig <= ( aSig + aSig ) ) {
2220         aSig >>= 1;
2221         ++zExp;
2222     }
2223     zSig = estimateDiv128To64( aSig, 0, bSig );
2224     if ( ( zSig & 0x1FF ) <= 2 ) {
2225         mul64To128( bSig, zSig, &term0, &term1 );
2226         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2227         while ( (sbits64) rem0 < 0 ) {
2228             --zSig;
2229             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2230         }
2231         zSig |= ( rem1 != 0 );
2232     }
2233     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2234
2235 }
2236
2237 /*
2238 -------------------------------------------------------------------------------
2239 Returns the remainder of the double-precision floating-point value `a'
2240 with respect to the corresponding value `b'.  The operation is performed
2241 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2242 -------------------------------------------------------------------------------
2243 */
2244 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2245 {
2246     flag aSign, bSign, zSign;
2247     int16 aExp, bExp, expDiff;
2248     bits64 aSig, bSig;
2249     bits64 q, alternateASig;
2250     sbits64 sigMean;
2251
2252     aSig = extractFloat64Frac( a );
2253     aExp = extractFloat64Exp( a );
2254     aSign = extractFloat64Sign( a );
2255     bSig = extractFloat64Frac( b );
2256     bExp = extractFloat64Exp( b );
2257     bSign = extractFloat64Sign( b );
2258     if ( aExp == 0x7FF ) {
2259         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2260             return propagateFloat64NaN( a, b );
2261         }
2262         roundData->exception |= float_flag_invalid;
2263         return float64_default_nan;
2264     }
2265     if ( bExp == 0x7FF ) {
2266         if ( bSig ) return propagateFloat64NaN( a, b );
2267         return a;
2268     }
2269     if ( bExp == 0 ) {
2270         if ( bSig == 0 ) {
2271             roundData->exception |= float_flag_invalid;
2272             return float64_default_nan;
2273         }
2274         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2275     }
2276     if ( aExp == 0 ) {
2277         if ( aSig == 0 ) return a;
2278         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2279     }
2280     expDiff = aExp - bExp;
2281     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2282     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2283     if ( expDiff < 0 ) {
2284         if ( expDiff < -1 ) return a;
2285         aSig >>= 1;
2286     }
2287     q = ( bSig <= aSig );
2288     if ( q ) aSig -= bSig;
2289     expDiff -= 64;
2290     while ( 0 < expDiff ) {
2291         q = estimateDiv128To64( aSig, 0, bSig );
2292         q = ( 2 < q ) ? q - 2 : 0;
2293         aSig = - ( ( bSig>>2 ) * q );
2294         expDiff -= 62;
2295     }
2296     expDiff += 64;
2297     if ( 0 < expDiff ) {
2298         q = estimateDiv128To64( aSig, 0, bSig );
2299         q = ( 2 < q ) ? q - 2 : 0;
2300         q >>= 64 - expDiff;
2301         bSig >>= 2;
2302         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2303     }
2304     else {
2305         aSig >>= 2;
2306         bSig >>= 2;
2307     }
2308     do {
2309         alternateASig = aSig;
2310         ++q;
2311         aSig -= bSig;
2312     } while ( 0 <= (sbits64) aSig );
2313     sigMean = aSig + alternateASig;
2314     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2315         aSig = alternateASig;
2316     }
2317     zSign = ( (sbits64) aSig < 0 );
2318     if ( zSign ) aSig = - aSig;
2319     return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2320
2321 }
2322
2323 /*
2324 -------------------------------------------------------------------------------
2325 Returns the square root of the double-precision floating-point value `a'.
2326 The operation is performed according to the IEC/IEEE Standard for Binary
2327 Floating-point Arithmetic.
2328 -------------------------------------------------------------------------------
2329 */
2330 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2331 {
2332     flag aSign;
2333     int16 aExp, zExp;
2334     bits64 aSig, zSig;
2335     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2336     //float64 z;
2337
2338     aSig = extractFloat64Frac( a );
2339     aExp = extractFloat64Exp( a );
2340     aSign = extractFloat64Sign( a );
2341     if ( aExp == 0x7FF ) {
2342         if ( aSig ) return propagateFloat64NaN( a, a );
2343         if ( ! aSign ) return a;
2344         roundData->exception |= float_flag_invalid;
2345         return float64_default_nan;
2346     }
2347     if ( aSign ) {
2348         if ( ( aExp | aSig ) == 0 ) return a;
2349         roundData->exception |= float_flag_invalid;
2350         return float64_default_nan;
2351     }
2352     if ( aExp == 0 ) {
2353         if ( aSig == 0 ) return 0;
2354         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2355     }
2356     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2357     aSig |= LIT64( 0x0010000000000000 );
2358     zSig = estimateSqrt32( aExp, aSig>>21 );
2359     zSig <<= 31;
2360     aSig <<= 9 - ( aExp & 1 );
2361     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2362     if ( ( zSig & 0x3FF ) <= 5 ) {
2363         if ( zSig < 2 ) {
2364             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2365         }
2366         else {
2367             aSig <<= 2;
2368             mul64To128( zSig, zSig, &term0, &term1 );
2369             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2370             while ( (sbits64) rem0 < 0 ) {
2371                 --zSig;
2372                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2373                 term1 |= 1;
2374                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2375             }
2376             zSig |= ( ( rem0 | rem1 ) != 0 );
2377         }
2378     }
2379     shift64RightJamming( zSig, 1, &zSig );
2380     return roundAndPackFloat64( roundData, 0, zExp, zSig );
2381
2382 }
2383
2384 /*
2385 -------------------------------------------------------------------------------
2386 Returns 1 if the double-precision floating-point value `a' is equal to the
2387 corresponding value `b', and 0 otherwise.  The comparison is performed
2388 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2389 -------------------------------------------------------------------------------
2390 */
2391 flag float64_eq( float64 a, float64 b )
2392 {
2393
2394     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2395          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2396        ) {
2397         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2398             float_raise( float_flag_invalid );
2399         }
2400         return 0;
2401     }
2402     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2403
2404 }
2405
2406 /*
2407 -------------------------------------------------------------------------------
2408 Returns 1 if the double-precision floating-point value `a' is less than or
2409 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2410 performed according to the IEC/IEEE Standard for Binary Floating-point
2411 Arithmetic.
2412 -------------------------------------------------------------------------------
2413 */
2414 flag float64_le( float64 a, float64 b )
2415 {
2416     flag aSign, bSign;
2417
2418     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2419          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2420        ) {
2421         float_raise( float_flag_invalid );
2422         return 0;
2423     }
2424     aSign = extractFloat64Sign( a );
2425     bSign = extractFloat64Sign( b );
2426     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2427     return ( a == b ) || ( aSign ^ ( a < b ) );
2428
2429 }
2430
2431 /*
2432 -------------------------------------------------------------------------------
2433 Returns 1 if the double-precision floating-point value `a' is less than
2434 the corresponding value `b', and 0 otherwise.  The comparison is performed
2435 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2437 */
2438 flag float64_lt( float64 a, float64 b )
2439 {
2440     flag aSign, bSign;
2441
2442     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2443          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2444        ) {
2445         float_raise( float_flag_invalid );
2446         return 0;
2447     }
2448     aSign = extractFloat64Sign( a );
2449     bSign = extractFloat64Sign( b );
2450     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2451     return ( a != b ) && ( aSign ^ ( a < b ) );
2452
2453 }
2454
2455 /*
2456 -------------------------------------------------------------------------------
2457 Returns 1 if the double-precision floating-point value `a' is equal to the
2458 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2459 if either operand is a NaN.  Otherwise, the comparison is performed
2460 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2461 -------------------------------------------------------------------------------
2462 */
2463 flag float64_eq_signaling( float64 a, float64 b )
2464 {
2465
2466     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2467          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2468        ) {
2469         float_raise( float_flag_invalid );
2470         return 0;
2471     }
2472     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2473
2474 }
2475
2476 /*
2477 -------------------------------------------------------------------------------
2478 Returns 1 if the double-precision floating-point value `a' is less than or
2479 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2480 cause an exception.  Otherwise, the comparison is performed according to the
2481 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2482 -------------------------------------------------------------------------------
2483 */
2484 flag float64_le_quiet( float64 a, float64 b )
2485 {
2486     flag aSign, bSign;
2487     //int16 aExp, bExp;
2488
2489     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2490          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2491        ) {
2492         /* Do nothing, even if NaN as we're quiet */
2493         return 0;
2494     }
2495     aSign = extractFloat64Sign( a );
2496     bSign = extractFloat64Sign( b );
2497     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2498     return ( a == b ) || ( aSign ^ ( a < b ) );
2499
2500 }
2501
2502 /*
2503 -------------------------------------------------------------------------------
2504 Returns 1 if the double-precision floating-point value `a' is less than
2505 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2506 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2507 Standard for Binary Floating-point Arithmetic.
2508 -------------------------------------------------------------------------------
2509 */
2510 flag float64_lt_quiet( float64 a, float64 b )
2511 {
2512     flag aSign, bSign;
2513
2514     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2515          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2516        ) {
2517         /* Do nothing, even if NaN as we're quiet */
2518         return 0;
2519     }
2520     aSign = extractFloat64Sign( a );
2521     bSign = extractFloat64Sign( b );
2522     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2523     return ( a != b ) && ( aSign ^ ( a < b ) );
2524
2525 }
2526
2527 #ifdef FLOATX80
2528
2529 /*
2530 -------------------------------------------------------------------------------
2531 Returns the result of converting the extended double-precision floating-
2532 point value `a' to the 32-bit two's complement integer format.  The
2533 conversion is performed according to the IEC/IEEE Standard for Binary
2534 Floating-point Arithmetic---which means in particular that the conversion
2535 is rounded according to the current rounding mode.  If `a' is a NaN, the
2536 largest positive integer is returned.  Otherwise, if the conversion
2537 overflows, the largest integer with the same sign as `a' is returned.
2538 -------------------------------------------------------------------------------
2539 */
2540 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2541 {
2542     flag aSign;
2543     int32 aExp, shiftCount;
2544     bits64 aSig;
2545
2546     aSig = extractFloatx80Frac( a );
2547     aExp = extractFloatx80Exp( a );
2548     aSign = extractFloatx80Sign( a );
2549     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2550     shiftCount = 0x4037 - aExp;
2551     if ( shiftCount <= 0 ) shiftCount = 1;
2552     shift64RightJamming( aSig, shiftCount, &aSig );
2553     return roundAndPackInt32( roundData, aSign, aSig );
2554
2555 }
2556
2557 /*
2558 -------------------------------------------------------------------------------
2559 Returns the result of converting the extended double-precision floating-
2560 point value `a' to the 32-bit two's complement integer format.  The
2561 conversion is performed according to the IEC/IEEE Standard for Binary
2562 Floating-point Arithmetic, except that the conversion is always rounded
2563 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2564 Otherwise, if the conversion overflows, the largest integer with the same
2565 sign as `a' is returned.
2566 -------------------------------------------------------------------------------
2567 */
2568 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2569 {
2570     flag aSign;
2571     int32 aExp, shiftCount;
2572     bits64 aSig, savedASig;
2573     int32 z;
2574
2575     aSig = extractFloatx80Frac( a );
2576     aExp = extractFloatx80Exp( a );
2577     aSign = extractFloatx80Sign( a );
2578     shiftCount = 0x403E - aExp;
2579     if ( shiftCount < 32 ) {
2580         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2581         goto invalid;
2582     }
2583     else if ( 63 < shiftCount ) {
2584         if ( aExp || aSig ) float_raise( float_flag_inexact );
2585         return 0;
2586     }
2587     savedASig = aSig;
2588     aSig >>= shiftCount;
2589     z = aSig;
2590     if ( aSign ) z = - z;
2591     if ( ( z < 0 ) ^ aSign ) {
2592  invalid:
2593         float_raise( float_flag_invalid );
2594         return aSign ? 0x80000000 : 0x7FFFFFFF;
2595     }
2596     if ( ( aSig<<shiftCount ) != savedASig ) {
2597         float_raise( float_flag_inexact );
2598     }
2599     return z;
2600
2601 }
2602
2603 /*
2604 -------------------------------------------------------------------------------
2605 Returns the result of converting the extended double-precision floating-
2606 point value `a' to the single-precision floating-point format.  The
2607 conversion is performed according to the IEC/IEEE Standard for Binary
2608 Floating-point Arithmetic.
2609 -------------------------------------------------------------------------------
2610 */
2611 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2612 {
2613     flag aSign;
2614     int32 aExp;
2615     bits64 aSig;
2616
2617     aSig = extractFloatx80Frac( a );
2618     aExp = extractFloatx80Exp( a );
2619     aSign = extractFloatx80Sign( a );
2620     if ( aExp == 0x7FFF ) {
2621         if ( (bits64) ( aSig<<1 ) ) {
2622             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2623         }
2624         return packFloat32( aSign, 0xFF, 0 );
2625     }
2626     shift64RightJamming( aSig, 33, &aSig );
2627     if ( aExp || aSig ) aExp -= 0x3F81;
2628     return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2629
2630 }
2631
2632 /*
2633 -------------------------------------------------------------------------------
2634 Returns the result of converting the extended double-precision floating-
2635 point value `a' to the double-precision floating-point format.  The
2636 conversion is performed according to the IEC/IEEE Standard for Binary
2637 Floating-point Arithmetic.
2638 -------------------------------------------------------------------------------
2639 */
2640 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2641 {
2642     flag aSign;
2643     int32 aExp;
2644     bits64 aSig, zSig;
2645
2646     aSig = extractFloatx80Frac( a );
2647     aExp = extractFloatx80Exp( a );
2648     aSign = extractFloatx80Sign( a );
2649     if ( aExp == 0x7FFF ) {
2650         if ( (bits64) ( aSig<<1 ) ) {
2651             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2652         }
2653         return packFloat64( aSign, 0x7FF, 0 );
2654     }
2655     shift64RightJamming( aSig, 1, &zSig );
2656     if ( aExp || aSig ) aExp -= 0x3C01;
2657     return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2658
2659 }
2660
2661 /*
2662 -------------------------------------------------------------------------------
2663 Rounds the extended double-precision floating-point value `a' to an integer,
2664 and returns the result as an extended quadruple-precision floating-point
2665 value.  The operation is performed according to the IEC/IEEE Standard for
2666 Binary Floating-point Arithmetic.
2667 -------------------------------------------------------------------------------
2668 */
2669 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2670 {
2671     flag aSign;
2672     int32 aExp;
2673     bits64 lastBitMask, roundBitsMask;
2674     int8 roundingMode;
2675     floatx80 z;
2676
2677     aExp = extractFloatx80Exp( a );
2678     if ( 0x403E <= aExp ) {
2679         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2680             return propagateFloatx80NaN( a, a );
2681         }
2682         return a;
2683     }
2684     if ( aExp <= 0x3FFE ) {
2685         if (    ( aExp == 0 )
2686              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2687             return a;
2688         }
2689         roundData->exception |= float_flag_inexact;
2690         aSign = extractFloatx80Sign( a );
2691         switch ( roundData->mode ) {
2692          case float_round_nearest_even:
2693             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2694                ) {
2695                 return
2696                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2697             }
2698             break;
2699          case float_round_down:
2700             return
2701                   aSign ?
2702                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2703                 : packFloatx80( 0, 0, 0 );
2704          case float_round_up:
2705             return
2706                   aSign ? packFloatx80( 1, 0, 0 )
2707                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2708         }
2709         return packFloatx80( aSign, 0, 0 );
2710     }
2711     lastBitMask = 1;
2712     lastBitMask <<= 0x403E - aExp;
2713     roundBitsMask = lastBitMask - 1;
2714     z = a;
2715     roundingMode = roundData->mode;
2716     if ( roundingMode == float_round_nearest_even ) {
2717         z.low += lastBitMask>>1;
2718         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2719     }
2720     else if ( roundingMode != float_round_to_zero ) {
2721         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2722             z.low += roundBitsMask;
2723         }
2724     }
2725     z.low &= ~ roundBitsMask;
2726     if ( z.low == 0 ) {
2727         ++z.high;
2728         z.low = LIT64( 0x8000000000000000 );
2729     }
2730     if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2731     return z;
2732
2733 }
2734
2735 /*
2736 -------------------------------------------------------------------------------
2737 Returns the result of adding the absolute values of the extended double-
2738 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2739 negated before being returned.  `zSign' is ignored if the result is a NaN.
2740 The addition is performed according to the IEC/IEEE Standard for Binary
2741 Floating-point Arithmetic.
2742 -------------------------------------------------------------------------------
2743 */
2744 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2745 {
2746     int32 aExp, bExp, zExp;
2747     bits64 aSig, bSig, zSig0, zSig1;
2748     int32 expDiff;
2749
2750     aSig = extractFloatx80Frac( a );
2751     aExp = extractFloatx80Exp( a );
2752     bSig = extractFloatx80Frac( b );
2753     bExp = extractFloatx80Exp( b );
2754     expDiff = aExp - bExp;
2755     if ( 0 < expDiff ) {
2756         if ( aExp == 0x7FFF ) {
2757             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2758             return a;
2759         }
2760         if ( bExp == 0 ) --expDiff;
2761         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2762         zExp = aExp;
2763     }
2764     else if ( expDiff < 0 ) {
2765         if ( bExp == 0x7FFF ) {
2766             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2767             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2768         }
2769         if ( aExp == 0 ) ++expDiff;
2770         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2771         zExp = bExp;
2772     }
2773     else {
2774         if ( aExp == 0x7FFF ) {
2775             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2776                 return propagateFloatx80NaN( a, b );
2777             }
2778             return a;
2779         }
2780         zSig1 = 0;
2781         zSig0 = aSig + bSig;
2782         if ( aExp == 0 ) {
2783             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2784             goto roundAndPack;
2785         }
2786         zExp = aExp;
2787         goto shiftRight1;
2788     }
2789
2790     zSig0 = aSig + bSig;
2791
2792     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2793  shiftRight1:
2794     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2795     zSig0 |= LIT64( 0x8000000000000000 );
2796     ++zExp;
2797  roundAndPack:
2798     return
2799         roundAndPackFloatx80(
2800             roundData, zSign, zExp, zSig0, zSig1 );
2801
2802 }
2803
2804 /*
2805 -------------------------------------------------------------------------------
2806 Returns the result of subtracting the absolute values of the extended
2807 double-precision floating-point values `a' and `b'.  If `zSign' is true,
2808 the difference is negated before being returned.  `zSign' is ignored if the
2809 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2810 Standard for Binary Floating-point Arithmetic.
2811 -------------------------------------------------------------------------------
2812 */
2813 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2814 {
2815     int32 aExp, bExp, zExp;
2816     bits64 aSig, bSig, zSig0, zSig1;
2817     int32 expDiff;
2818     floatx80 z;
2819
2820     aSig = extractFloatx80Frac( a );
2821     aExp = extractFloatx80Exp( a );
2822     bSig = extractFloatx80Frac( b );
2823     bExp = extractFloatx80Exp( b );
2824     expDiff = aExp - bExp;
2825     if ( 0 < expDiff ) goto aExpBigger;
2826     if ( expDiff < 0 ) goto bExpBigger;
2827     if ( aExp == 0x7FFF ) {
2828         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2829             return propagateFloatx80NaN( a, b );
2830         }
2831         roundData->exception |= float_flag_invalid;
2832         z.low = floatx80_default_nan_low;
2833         z.high = floatx80_default_nan_high;
2834         return z;
2835     }
2836     if ( aExp == 0 ) {
2837         aExp = 1;
2838         bExp = 1;
2839     }
2840     zSig1 = 0;
2841     if ( bSig < aSig ) goto aBigger;
2842     if ( aSig < bSig ) goto bBigger;
2843     return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2844  bExpBigger:
2845     if ( bExp == 0x7FFF ) {
2846         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2847         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2848     }
2849     if ( aExp == 0 ) ++expDiff;
2850     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2851  bBigger:
2852     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2853     zExp = bExp;
2854     zSign ^= 1;
2855     goto normalizeRoundAndPack;
2856  aExpBigger:
2857     if ( aExp == 0x7FFF ) {
2858         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2859         return a;
2860     }
2861     if ( bExp == 0 ) --expDiff;
2862     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2863  aBigger:
2864     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2865     zExp = aExp;
2866  normalizeRoundAndPack:
2867     return
2868         normalizeRoundAndPackFloatx80(
2869             roundData, zSign, zExp, zSig0, zSig1 );
2870
2871 }
2872
2873 /*
2874 -------------------------------------------------------------------------------
2875 Returns the result of adding the extended double-precision floating-point
2876 values `a' and `b'.  The operation is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2879 */
2880 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2881 {
2882     flag aSign, bSign;
2883
2884     aSign = extractFloatx80Sign( a );
2885     bSign = extractFloatx80Sign( b );
2886     if ( aSign == bSign ) {
2887         return addFloatx80Sigs( roundData, a, b, aSign );
2888     }
2889     else {
2890         return subFloatx80Sigs( roundData, a, b, aSign );
2891     }
2892
2893 }
2894
2895 /*
2896 -------------------------------------------------------------------------------
2897 Returns the result of subtracting the extended double-precision floating-
2898 point values `a' and `b'.  The operation is performed according to the
2899 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2900 -------------------------------------------------------------------------------
2901 */
2902 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2903 {
2904     flag aSign, bSign;
2905
2906     aSign = extractFloatx80Sign( a );
2907     bSign = extractFloatx80Sign( b );
2908     if ( aSign == bSign ) {
2909         return subFloatx80Sigs( roundData, a, b, aSign );
2910     }
2911     else {
2912         return addFloatx80Sigs( roundData, a, b, aSign );
2913     }
2914
2915 }
2916
2917 /*
2918 -------------------------------------------------------------------------------
2919 Returns the result of multiplying the extended double-precision floating-
2920 point values `a' and `b'.  The operation is performed according to the
2921 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2922 -------------------------------------------------------------------------------
2923 */
2924 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2925 {
2926     flag aSign, bSign, zSign;
2927     int32 aExp, bExp, zExp;
2928     bits64 aSig, bSig, zSig0, zSig1;
2929     floatx80 z;
2930
2931     aSig = extractFloatx80Frac( a );
2932     aExp = extractFloatx80Exp( a );
2933     aSign = extractFloatx80Sign( a );
2934     bSig = extractFloatx80Frac( b );
2935     bExp = extractFloatx80Exp( b );
2936     bSign = extractFloatx80Sign( b );
2937     zSign = aSign ^ bSign;
2938     if ( aExp == 0x7FFF ) {
2939         if (    (bits64) ( aSig<<1 )
2940              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2941             return propagateFloatx80NaN( a, b );
2942         }
2943         if ( ( bExp | bSig ) == 0 ) goto invalid;
2944         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2945     }
2946     if ( bExp == 0x7FFF ) {
2947         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2948         if ( ( aExp | aSig ) == 0 ) {
2949  invalid:
2950             roundData->exception |= float_flag_invalid;
2951             z.low = floatx80_default_nan_low;
2952             z.high = floatx80_default_nan_high;
2953             return z;
2954         }
2955         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2956     }
2957     if ( aExp == 0 ) {
2958         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2959         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2960     }
2961     if ( bExp == 0 ) {
2962         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2963         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2964     }
2965     zExp = aExp + bExp - 0x3FFE;
2966     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2967     if ( 0 < (sbits64) zSig0 ) {
2968         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2969         --zExp;
2970     }
2971     return
2972         roundAndPackFloatx80(
2973             roundData, zSign, zExp, zSig0, zSig1 );
2974
2975 }
2976
2977 /*
2978 -------------------------------------------------------------------------------
2979 Returns the result of dividing the extended double-precision floating-point
2980 value `a' by the corresponding value `b'.  The operation is performed
2981 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2982 -------------------------------------------------------------------------------
2983 */
2984 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2985 {
2986     flag aSign, bSign, zSign;
2987     int32 aExp, bExp, zExp;
2988     bits64 aSig, bSig, zSig0, zSig1;
2989     bits64 rem0, rem1, rem2, term0, term1, term2;
2990     floatx80 z;
2991
2992     aSig = extractFloatx80Frac( a );
2993     aExp = extractFloatx80Exp( a );
2994     aSign = extractFloatx80Sign( a );
2995     bSig = extractFloatx80Frac( b );
2996     bExp = extractFloatx80Exp( b );
2997     bSign = extractFloatx80Sign( b );
2998     zSign = aSign ^ bSign;
2999     if ( aExp == 0x7FFF ) {
3000         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3001         if ( bExp == 0x7FFF ) {
3002             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3003             goto invalid;
3004         }
3005         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3006     }
3007     if ( bExp == 0x7FFF ) {
3008         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3009         return packFloatx80( zSign, 0, 0 );
3010     }
3011     if ( bExp == 0 ) {
3012         if ( bSig == 0 ) {
3013             if ( ( aExp | aSig ) == 0 ) {
3014  invalid:
3015                 roundData->exception |= float_flag_invalid;
3016                 z.low = floatx80_default_nan_low;
3017                 z.high = floatx80_default_nan_high;
3018                 return z;
3019             }
3020             roundData->exception |= float_flag_divbyzero;
3021             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3022         }
3023         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3024     }
3025     if ( aExp == 0 ) {
3026         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3027         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3028     }
3029     zExp = aExp - bExp + 0x3FFE;
3030     rem1 = 0;
3031     if ( bSig <= aSig ) {
3032         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3033         ++zExp;
3034     }
3035     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3036     mul64To128( bSig, zSig0, &term0, &term1 );
3037     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3038     while ( (sbits64) rem0 < 0 ) {
3039         --zSig0;
3040         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3041     }
3042     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3043     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3044         mul64To128( bSig, zSig1, &term1, &term2 );
3045         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3046         while ( (sbits64) rem1 < 0 ) {
3047             --zSig1;
3048             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3049         }
3050         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3051     }
3052     return
3053         roundAndPackFloatx80(
3054             roundData, zSign, zExp, zSig0, zSig1 );
3055
3056 }
3057
3058 /*
3059 -------------------------------------------------------------------------------
3060 Returns the remainder of the extended double-precision floating-point value
3061 `a' with respect to the corresponding value `b'.  The operation is performed
3062 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3063 -------------------------------------------------------------------------------
3064 */
3065 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3066 {
3067     flag aSign, bSign, zSign;
3068     int32 aExp, bExp, expDiff;
3069     bits64 aSig0, aSig1, bSig;
3070     bits64 q, term0, term1, alternateASig0, alternateASig1;
3071     floatx80 z;
3072
3073     aSig0 = extractFloatx80Frac( a );
3074     aExp = extractFloatx80Exp( a );
3075     aSign = extractFloatx80Sign( a );
3076     bSig = extractFloatx80Frac( b );
3077     bExp = extractFloatx80Exp( b );
3078     bSign = extractFloatx80Sign( b );
3079     if ( aExp == 0x7FFF ) {
3080         if (    (bits64) ( aSig0<<1 )
3081              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3082             return propagateFloatx80NaN( a, b );
3083         }
3084         goto invalid;
3085     }
3086     if ( bExp == 0x7FFF ) {
3087         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3088         return a;
3089     }
3090     if ( bExp == 0 ) {
3091         if ( bSig == 0 ) {
3092  invalid:
3093             roundData->exception |= float_flag_invalid;
3094             z.low = floatx80_default_nan_low;
3095             z.high = floatx80_default_nan_high;
3096             return z;
3097         }
3098         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3099     }
3100     if ( aExp == 0 ) {
3101         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3102         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3103     }
3104     bSig |= LIT64( 0x8000000000000000 );
3105     zSign = aSign;
3106     expDiff = aExp - bExp;
3107     aSig1 = 0;
3108     if ( expDiff < 0 ) {
3109         if ( expDiff < -1 ) return a;
3110         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3111         expDiff = 0;
3112     }
3113     q = ( bSig <= aSig0 );
3114     if ( q ) aSig0 -= bSig;
3115     expDiff -= 64;
3116     while ( 0 < expDiff ) {
3117         q = estimateDiv128To64( aSig0, aSig1, bSig );
3118         q = ( 2 < q ) ? q - 2 : 0;
3119         mul64To128( bSig, q, &term0, &term1 );
3120         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3121         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3122         expDiff -= 62;
3123     }
3124     expDiff += 64;
3125     if ( 0 < expDiff ) {
3126         q = estimateDiv128To64( aSig0, aSig1, bSig );
3127         q = ( 2 < q ) ? q - 2 : 0;
3128         q >>= 64 - expDiff;
3129         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3130         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3131         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3132         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3133             ++q;
3134             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3135         }
3136     }
3137     else {
3138         term1 = 0;
3139         term0 = bSig;
3140     }
3141     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3142     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3143          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3144               && ( q & 1 ) )
3145        ) {
3146         aSig0 = alternateASig0;
3147         aSig1 = alternateASig1;
3148         zSign = ! zSign;
3149     }
3150
3151     return
3152         normalizeRoundAndPackFloatx80(
3153             roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3154
3155 }
3156
3157 /*
3158 -------------------------------------------------------------------------------
3159 Returns the square root of the extended double-precision floating-point
3160 value `a'.  The operation is performed according to the IEC/IEEE Standard
3161 for Binary Floating-point Arithmetic.
3162 -------------------------------------------------------------------------------
3163 */
3164 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3165 {
3166     flag aSign;
3167     int32 aExp, zExp;
3168     bits64 aSig0, aSig1, zSig0, zSig1;
3169     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3170     bits64 shiftedRem0, shiftedRem1;
3171     floatx80 z;
3172
3173     aSig0 = extractFloatx80Frac( a );
3174     aExp = extractFloatx80Exp( a );
3175     aSign = extractFloatx80Sign( a );
3176     if ( aExp == 0x7FFF ) {
3177         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3178         if ( ! aSign ) return a;
3179         goto invalid;
3180     }
3181     if ( aSign ) {
3182         if ( ( aExp | aSig0 ) == 0 ) return a;
3183  invalid:
3184         roundData->exception |= float_flag_invalid;
3185         z.low = floatx80_default_nan_low;
3186         z.high = floatx80_default_nan_high;
3187         return z;
3188     }
3189     if ( aExp == 0 ) {
3190         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3191         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3192     }
3193     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3194     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3195     zSig0 <<= 31;
3196     aSig1 = 0;
3197     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3198     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3199     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3200     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3201     mul64To128( zSig0, zSig0, &term0, &term1 );
3202     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3203     while ( (sbits64) rem0 < 0 ) {
3204         --zSig0;
3205         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3206         term1 |= 1;
3207         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3208     }
3209     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3210     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3211     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3212         if ( zSig1 == 0 ) zSig1 = 1;
3213         mul64To128( zSig0, zSig1, &term1, &term2 );
3214         shortShift128Left( term1, term2, 1, &term1, &term2 );
3215         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3216         mul64To128( zSig1, zSig1, &term2, &term3 );
3217         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3218         while ( (sbits64) rem1 < 0 ) {
3219             --zSig1;
3220             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3221             term3 |= 1;
3222             add192(
3223                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3224         }
3225         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3226     }
3227     return
3228         roundAndPackFloatx80(
3229             roundData, 0, zExp, zSig0, zSig1 );
3230
3231 }
3232
3233 /*
3234 -------------------------------------------------------------------------------
3235 Returns 1 if the extended double-precision floating-point value `a' is
3236 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3237 performed according to the IEC/IEEE Standard for Binary Floating-point
3238 Arithmetic.
3239 -------------------------------------------------------------------------------
3240 */
3241 flag floatx80_eq( floatx80 a, floatx80 b )
3242 {
3243
3244     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3245               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3246          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3247               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3248        ) {
3249         if (    floatx80_is_signaling_nan( a )
3250              || floatx80_is_signaling_nan( b ) ) {
3251             float_raise( float_flag_invalid );
3252         }
3253         return 0;
3254     }
3255     return
3256            ( a.low == b.low )
3257         && (    ( a.high == b.high )
3258              || (    ( a.low == 0 )
3259                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3260            );
3261
3262 }
3263
3264 /*
3265 -------------------------------------------------------------------------------
3266 Returns 1 if the extended double-precision floating-point value `a' is
3267 less than or equal to the corresponding value `b', and 0 otherwise.  The
3268 comparison is performed according to the IEC/IEEE Standard for Binary
3269 Floating-point Arithmetic.
3270 -------------------------------------------------------------------------------
3271 */
3272 flag floatx80_le( floatx80 a, floatx80 b )
3273 {
3274     flag aSign, bSign;
3275
3276     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3277               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3278          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3279               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3280        ) {
3281         float_raise( float_flag_invalid );
3282         return 0;
3283     }
3284     aSign = extractFloatx80Sign( a );
3285     bSign = extractFloatx80Sign( b );
3286     if ( aSign != bSign ) {
3287         return
3288                aSign
3289             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3290                  == 0 );
3291     }
3292     return
3293           aSign ? le128( b.high, b.low, a.high, a.low )
3294         : le128( a.high, a.low, b.high, b.low );
3295
3296 }
3297
3298 /*
3299 -------------------------------------------------------------------------------
3300 Returns 1 if the extended double-precision floating-point value `a' is
3301 less than the corresponding value `b', and 0 otherwise.  The comparison
3302 is performed according to the IEC/IEEE Standard for Binary Floating-point
3303 Arithmetic.
3304 -------------------------------------------------------------------------------
3305 */
3306 flag floatx80_lt( floatx80 a, floatx80 b )
3307 {
3308     flag aSign, bSign;
3309
3310     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3311               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3312          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3313               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3314        ) {
3315         float_raise( float_flag_invalid );
3316         return 0;
3317     }
3318     aSign = extractFloatx80Sign( a );
3319     bSign = extractFloatx80Sign( b );
3320     if ( aSign != bSign ) {
3321         return
3322                aSign
3323             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3324                  != 0 );
3325     }
3326     return
3327           aSign ? lt128( b.high, b.low, a.high, a.low )
3328         : lt128( a.high, a.low, b.high, b.low );
3329
3330 }
3331
3332 /*
3333 -------------------------------------------------------------------------------
3334 Returns 1 if the extended double-precision floating-point value `a' is equal
3335 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3336 raised if either operand is a NaN.  Otherwise, the comparison is performed
3337 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3338 -------------------------------------------------------------------------------
3339 */
3340 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3341 {
3342
3343     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3344               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3345          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3346               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3347        ) {
3348         float_raise( float_flag_invalid );
3349         return 0;
3350     }
3351     return
3352            ( a.low == b.low )
3353         && (    ( a.high == b.high )
3354              || (    ( a.low == 0 )
3355                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3356            );
3357
3358 }
3359
3360 /*
3361 -------------------------------------------------------------------------------
3362 Returns 1 if the extended double-precision floating-point value `a' is less
3363 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3364 do not cause an exception.  Otherwise, the comparison is performed according
3365 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3366 -------------------------------------------------------------------------------
3367 */
3368 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3369 {
3370     flag aSign, bSign;
3371
3372     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3373               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3374          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3375               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3376        ) {
3377         /* Do nothing, even if NaN as we're quiet */
3378         return 0;
3379     }
3380     aSign = extractFloatx80Sign( a );
3381     bSign = extractFloatx80Sign( b );
3382     if ( aSign != bSign ) {
3383         return
3384                aSign
3385             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3386                  == 0 );
3387     }
3388     return
3389           aSign ? le128( b.high, b.low, a.high, a.low )
3390         : le128( a.high, a.low, b.high, b.low );
3391
3392 }
3393
3394 /*
3395 -------------------------------------------------------------------------------
3396 Returns 1 if the extended double-precision floating-point value `a' is less
3397 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3398 an exception.  Otherwise, the comparison is performed according to the
3399 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3400 -------------------------------------------------------------------------------
3401 */
3402 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3403 {
3404     flag aSign, bSign;
3405
3406     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3407               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3408          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3409               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3410        ) {
3411         /* Do nothing, even if NaN as we're quiet */
3412         return 0;
3413     }
3414     aSign = extractFloatx80Sign( a );
3415     bSign = extractFloatx80Sign( b );
3416     if ( aSign != bSign ) {
3417         return
3418                aSign
3419             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3420                  != 0 );
3421     }
3422     return
3423           aSign ? lt128( b.high, b.low, a.high, a.low )
3424         : lt128( a.high, a.low, b.high, b.low );
3425
3426 }
3427
3428 #endif
3429